[PATCH] syscall latency improvement #1

David Howells (dhowells@redhat.com)
Fri, 25 Jan 2002 18:54:02 +0000


Hi Linus,

The attached patch does the following to 2.5.3-pre5:

* consolidates various status items that are found in the lower reaches of
task_struct into one 32-bit word, thus allowing them to be tested
atomically without the need to disable interrupts in entry.S.

* optimises the instructions in the system_call path in entry.S

* frees up a hole in the bottom part of the task_struct (on the 1st cache
line).

* improves base syscall latency by approximately 5.4% (dual PIII) or 3.6%
(dual Athlon) as measured by lmbench's "lat_syscall null" command against
the vanilla kernel.

Most notable are the changes to the following files:

arch/i386/kernel/entry.S
include/linux/sched.h

David

diff -uNr linux-2.5.3-pre5/arch/i386/kernel/entry.S linux-work-253p5/arch/i386/kernel/entry.S
--- linux-2.5.3-pre5/arch/i386/kernel/entry.S Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/entry.S Fri Jan 25 15:01:07 2002
@@ -72,10 +72,13 @@
*/
state = 0
flags = 4
-sigpending = 8
+work = 8
+need_resched = work+0
+syscall_trace = work+1
+sigpending = work+2
+notify_resume = work+3
addr_limit = 12
exec_domain = 16
-need_resched = 20
tsk_ptrace = 24
processor = 52

@@ -151,7 +154,7 @@
call *%edx
addl $4, %esp
popl %eax
- jmp ret_from_sys_call
+ jmp resume_userspace

ENTRY(lcall27)
pushfl # We get a different stack layout with call gates,
@@ -172,7 +175,7 @@
call *%edx
addl $4, %esp
popl %eax
- jmp ret_from_sys_call
+ jmp resume_userspace


ENTRY(ret_from_fork)
@@ -180,9 +183,7 @@
call SYMBOL_NAME(schedule_tail)
addl $4, %esp
GET_CURRENT(%ebx)
- testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
- jne tracesys_exit
- jmp ret_from_sys_call
+ jmp syscall_exit

/*
* Return to user mode is not as complex as all this looks,
@@ -191,73 +192,99 @@
* less clear than it otherwise should be.
*/

+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ENTRY(ret_from_intr)
+ GET_CURRENT(%ebx)
+ret_from_exception:
+ movl EFLAGS(%esp),%eax # mix EFLAGS and CS
+ movb CS(%esp),%al
+ testl $(VM_MASK | 3),%eax
+ jz restore_all # returning to kernel-space or vm86-space
+ sti # we may have come from an interrupt handler
+ENTRY(resume_userspace)
+ movl work(%ebx),%ecx
+ andl $0xffff00ff,%ecx # current->work (ignoring syscall_trace)
+ jne work_pending
+ jmp restore_all
+
+ # system call handler stub
+ ALIGN
ENTRY(system_call)
pushl %eax # save orig_eax
SAVE_ALL
GET_CURRENT(%ebx)
- testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
- jne tracesys
cmpl $(NR_syscalls),%eax
- jae badsys
+ jae syscall_badsys
+ testb $0xff,syscall_trace(%ebx) # system call tracing in operation
+ jnz syscall_trace_entry
+syscall_traced:
call *SYMBOL_NAME(sys_call_table)(,%eax,4)
- movl %eax,EAX(%esp) # save the return value
-ENTRY(ret_from_sys_call)
- cli # need_resched and signals atomic test
- cmpl $0,need_resched(%ebx)
- jne reschedule
- cmpl $0,sigpending(%ebx)
- jne signal_return
+ movl %eax,EAX(%esp) # store the return value
+syscall_exit:
+ movl work(%ebx),%ecx
+ testl %ecx,%ecx # current->work
+ jne syscall_exit_work
restore_all:
RESTORE_ALL

+ # perform work that needs to be done immediately before resumption
ALIGN
-signal_return:
- sti # we can get here from an interrupt handler
+work_pending:
+ testb %cl,%cl # current->work.need_resched
+ jz work_notifysig
+work_resched:
+ call SYMBOL_NAME(schedule)
+ movl work(%ebx),%ecx
+ andl $0xffff00ff,%ecx # ignore the syscall trace counter
+ jz restore_all
+ testb %cl,%cl # current->work.need_resched
+ jnz work_resched
+
+work_notifysig: # deal with pending signals and notify-resume requests
testl $(VM_MASK),EFLAGS(%esp)
movl %esp,%eax
- jne v86_signal_return
+ jne work_notifysig_v86 # returning to kernel-space or vm86-space
xorl %edx,%edx
- call SYMBOL_NAME(do_signal)
+ call SYMBOL_NAME(do_notify_resume)
jmp restore_all

ALIGN
-v86_signal_return:
+work_notifysig_v86:
+ pushl %ecx
call SYMBOL_NAME(save_v86_state)
+ popl %ecx
movl %eax,%esp
xorl %edx,%edx
- call SYMBOL_NAME(do_signal)
+ call SYMBOL_NAME(do_notify_resume)
jmp restore_all

+ # perform syscall exit tracing
ALIGN
-tracesys:
+syscall_trace_entry:
movl $-ENOSYS,EAX(%esp)
- call SYMBOL_NAME(syscall_trace)
+ movl %esp,%eax
+ xorl %edx,%edx
+ call SYMBOL_NAME(do_syscall_trace)
movl ORIG_EAX(%esp),%eax
cmpl $(NR_syscalls),%eax
- jae tracesys_exit
- call *SYMBOL_NAME(sys_call_table)(,%eax,4)
- movl %eax,EAX(%esp) # save the return value
-tracesys_exit:
- call SYMBOL_NAME(syscall_trace)
- jmp ret_from_sys_call
-badsys:
- movl $-ENOSYS,EAX(%esp)
- jmp ret_from_sys_call
+ jnae syscall_traced
+ jmp syscall_exit

+ # perform syscall exit tracing
ALIGN
-ENTRY(ret_from_intr)
- GET_CURRENT(%ebx)
-ret_from_exception:
- movl EFLAGS(%esp),%eax # mix EFLAGS and CS
- movb CS(%esp),%al
- testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor?
- jne ret_from_sys_call
- jmp restore_all
+syscall_exit_work:
+ testb %ch,%ch # current->work.syscall_trace
+ jz work_pending
+ movl %esp,%eax
+ movl $1,%edx
+ call SYMBOL_NAME(do_syscall_trace)
+ jmp resume_userspace

ALIGN
-reschedule:
- call SYMBOL_NAME(schedule) # test
- jmp ret_from_sys_call
+syscall_badsys:
+ movl $-ENOSYS,EAX(%esp)
+ jmp resume_userspace

ENTRY(divide_error)
pushl $0 # no error code
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/process.c linux-work-253p5/arch/i386/kernel/process.c
--- linux-2.5.3-pre5/arch/i386/kernel/process.c Fri Jan 25 14:52:14 2002
+++ linux-work-253p5/arch/i386/kernel/process.c Fri Jan 25 15:01:07 2002
@@ -89,7 +89,7 @@

/*
* On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
+ * to poll the ->work.need_resched flag instead of waiting for the
* cross-CPU IPI to arrive. Use this option with caution.
*/
static void poll_idle (void)
@@ -102,15 +102,15 @@
* Deal with another CPU just having chosen a thread to
* run here:
*/
- oldval = xchg(&current->need_resched, -1);
+ oldval = xchg(&current->work.need_resched, -1);

if (!oldval)
asm volatile(
"2:"
- "cmpl $-1, %0;"
+ "cmpb $-1, %0;"
"rep; nop;"
"je 2b;"
- : :"m" (current->need_resched));
+ : :"m" (current->work.need_resched));
}

/*
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/ptrace.c linux-work-253p5/arch/i386/kernel/ptrace.c
--- linux-2.5.3-pre5/arch/i386/kernel/ptrace.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/ptrace.c Fri Jan 25 15:01:07 2002
@@ -277,10 +277,18 @@
ret = -EIO;
if ((unsigned long) data > _NSIG)
break;
- if (request == PTRACE_SYSCALL)
- child->ptrace |= PT_TRACESYS;
- else
- child->ptrace &= ~PT_TRACESYS;
+ if (request == PTRACE_SYSCALL) {
+ if (!(child->ptrace & PT_SYSCALLTRACE)) {
+ child->ptrace |= PT_SYSCALLTRACE;
+ child->work.syscall_trace++;
+ }
+ }
+ else {
+ if (child->ptrace & PT_SYSCALLTRACE) {
+ child->ptrace &= ~PT_SYSCALLTRACE;
+ child->work.syscall_trace--;
+ }
+ }
child->exit_code = data;
/* make sure the single step bit is not set. */
tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
@@ -315,7 +323,10 @@
ret = -EIO;
if ((unsigned long) data > _NSIG)
break;
- child->ptrace &= ~PT_TRACESYS;
+ if (child->ptrace & PT_SYSCALLTRACE) {
+ child->ptrace &= ~PT_SYSCALLTRACE;
+ child->work.syscall_trace--;
+ }
if ((child->ptrace & PT_DTRACE) == 0) {
/* Spurious delayed TF traps may occur */
child->ptrace |= PT_DTRACE;
@@ -439,10 +450,14 @@
return ret;
}

-asmlinkage void syscall_trace(void)
+/* notification of system call entry/exit
+ * - triggered by current->work.syscall_trace
+ */
+__attribute__((regparm(3)))
+void do_syscall_trace(struct pt_regs *regs, int entryexit)
{
- if ((current->ptrace & (PT_PTRACED|PT_TRACESYS)) !=
- (PT_PTRACED|PT_TRACESYS))
+ if ((current->ptrace & (PT_PTRACED|PT_SYSCALLTRACE)) !=
+ (PT_PTRACED|PT_SYSCALLTRACE))
return;
/* the 0x80 provides a way for the tracing parent to distinguish
between a syscall stop and SIGTRAP delivery */
@@ -461,3 +476,15 @@
current->exit_code = 0;
}
}
+
+/* notification of userspace execution resumption
+ * - triggered by current->work.notify_resume
+ */
+__attribute__((regparm(3)))
+void do_notify_resume(struct pt_regs *regs, sigset_t *oldset,
+ struct task_work work_pending)
+{
+ /* deal with pending signal delivery */
+ if (work_pending.sigpending)
+ do_signal(regs,oldset);
+}
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/signal.c linux-work-253p5/arch/i386/kernel/signal.c
--- linux-2.5.3-pre5/arch/i386/kernel/signal.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/signal.c Fri Jan 25 15:01:07 2002
@@ -28,8 +28,6 @@

#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))

-int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
-
int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from)
{
if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
diff -uNr linux-2.5.3-pre5/arch/i386/kernel/vm86.c linux-work-253p5/arch/i386/kernel/vm86.c
--- linux-2.5.3-pre5/arch/i386/kernel/vm86.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p5/arch/i386/kernel/vm86.c Fri Jan 25 15:01:07 2002
@@ -212,7 +212,7 @@
info->regs.__null_ds = 0;
info->regs.__null_es = 0;

-/* we are clearing fs,gs later just before "jmp ret_from_sys_call",
+/* we are clearing fs,gs later just before "jmp resume_userspace",
* because starting with Linux 2.1.x they aren't no longer saved/restored
*/

@@ -255,7 +255,7 @@
__asm__ __volatile__(
"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
"movl %0,%%esp\n\t"
- "jmp ret_from_sys_call"
+ "jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "b" (tsk) : "ax");
/* we never return here */
@@ -268,7 +268,7 @@
regs32 = save_v86_state(regs16);
regs32->eax = retval;
__asm__ __volatile__("movl %0,%%esp\n\t"
- "jmp ret_from_sys_call"
+ "jmp resume_userspace"
: : "r" (regs32), "b" (current));
}

diff -uNr linux-2.5.3-pre5/fs/lockd/svc.c linux-work-253p5/fs/lockd/svc.c
--- linux-2.5.3-pre5/fs/lockd/svc.c Tue Jan 22 09:05:58 2002
+++ linux-work-253p5/fs/lockd/svc.c Fri Jan 25 15:01:07 2002
@@ -304,7 +304,7 @@
* Wait for the lockd process to exit, but since we're holding
* the lockd semaphore, we can't wait around forever ...
*/
- current->sigpending = 0;
+ current->work.sigpending = 0;
interruptible_sleep_on_timeout(&lockd_exit, HZ);
if (nlmsvc_pid) {
printk(KERN_WARNING
diff -uNr linux-2.5.3-pre5/fs/nfsd/export.c linux-work-253p5/fs/nfsd/export.c
--- linux-2.5.3-pre5/fs/nfsd/export.c Tue Jan 22 09:05:58 2002
+++ linux-work-253p5/fs/nfsd/export.c Fri Jan 25 15:01:07 2002
@@ -468,7 +468,7 @@
return 0;
}

- current->sigpending = 0;
+ current->work.sigpending = 0;
want_lock++;
while (hash_count || hash_lock) {
interruptible_sleep_on(&hash_wait);
diff -uNr linux-2.5.3-pre5/include/asm-i386/signal.h linux-work-253p5/include/asm-i386/signal.h
--- linux-2.5.3-pre5/include/asm-i386/signal.h Thu Jan 24 14:53:26 2002
+++ linux-work-253p5/include/asm-i386/signal.h Fri Jan 25 15:05:45 2002
@@ -2,6 +2,7 @@
#define _ASMi386_SIGNAL_H

#include <linux/types.h>
+#include <linux/linkage.h>

/* Avoid too many header ordering problems. */
struct siginfo;
@@ -216,6 +217,8 @@
return word;
}

+extern int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
+
#endif /* __KERNEL__ */

#endif
diff -uNr linux-2.5.3-pre5/include/linux/init_task.h linux-work-253p5/include/linux/init_task.h
--- linux-2.5.3-pre5/include/linux/init_task.h Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/include/linux/init_task.h Fri Jan 25 15:12:17 2002
@@ -35,6 +35,14 @@
siglock: SPIN_LOCK_UNLOCKED \
}

+#define INIT_TASK_WORK \
+{ \
+ need_resched: 0, \
+ syscall_trace: 0, \
+ sigpending: 0, \
+ notify_resume: 0, \
+}
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -43,7 +51,7 @@
{ \
state: 0, \
flags: 0, \
- sigpending: 0, \
+ work: INIT_TASK_WORK, \
addr_limit: KERNEL_DS, \
exec_domain: &default_exec_domain, \
lock_depth: -1, \
diff -uNr linux-2.5.3-pre5/include/linux/sched.h linux-work-253p5/include/linux/sched.h
--- linux-2.5.3-pre5/include/linux/sched.h Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/include/linux/sched.h Fri Jan 25 15:05:45 2002
@@ -228,19 +228,29 @@

typedef struct prio_array prio_array_t;

+/* this struct must occupy one 32-bit chunk so that is can be read in one go */
+struct task_work {
+ __s8 need_resched;
+ __u8 syscall_trace; /* count of syscall interceptors */
+ __u8 sigpending;
+ __u8 notify_resume; /* request for notification on
+ userspace execution resumption */
+} __attribute__((packed));
+
struct task_struct {
/*
* offsets of these are hardcoded elsewhere - touch with care
*/
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
unsigned long flags; /* per process flags, defined below */
- int sigpending;
+ volatile struct task_work work;
+
mm_segment_t addr_limit; /* thread address space:
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
struct exec_domain *exec_domain;
- volatile long need_resched;
+ long __pad;
unsigned long ptrace;

int lock_depth; /* Lock depth */
@@ -381,7 +391,7 @@
*/

#define PT_PTRACED 0x00000001
-#define PT_TRACESYS 0x00000002
+#define PT_SYSCALLTRACE 0x00000002 /* T if syscall_trace is +1 for ptrace() */
#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */
#define PT_TRACESYSGOOD 0x00000008
#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */
@@ -564,12 +574,12 @@

static inline int signal_pending(struct task_struct *p)
{
- return (p->sigpending != 0);
+ return (p->work.sigpending != 0);
}

static inline int need_resched(void)
{
- return unlikely(current->need_resched != 0);
+ return unlikely(current->work.need_resched != 0);
}

static inline void cond_resched(void)
@@ -614,7 +624,7 @@

static inline void recalc_sigpending(struct task_struct *t)
{
- t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+ t->work.sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
}

/* True if we are on the alternate signal stack. */
diff -uNr linux-2.5.3-pre5/kernel/fork.c linux-work-253p5/kernel/fork.c
--- linux-2.5.3-pre5/kernel/fork.c Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/kernel/fork.c Fri Jan 25 15:01:07 2002
@@ -631,7 +631,7 @@
}
spin_lock_init(&p->alloc_lock);

- p->sigpending = 0;
+ p->work.sigpending = 0;
init_sigpending(&p->pending);

p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
@@ -756,7 +756,7 @@
* Let the child process run first, to avoid most of the
* COW overhead when the child exec()s afterwards.
*/
- current->need_resched = 1;
+ current->work.need_resched = 1;

fork_out:
return retval;
diff -uNr linux-2.5.3-pre5/kernel/sched.c linux-work-253p5/kernel/sched.c
--- linux-2.5.3-pre5/kernel/sched.c Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/kernel/sched.c Fri Jan 25 15:01:07 2002
@@ -176,9 +176,9 @@
{
int need_resched;

- need_resched = p->need_resched;
+ need_resched = p->work.need_resched;
wmb();
- p->need_resched = 1;
+ p->work.need_resched = 1;
if (!need_resched && (p->cpu != smp_processor_id()))
smp_send_reschedule(p->cpu);
}
@@ -483,7 +483,7 @@
this_rq->nr_running++;
enqueue_task(next, this_rq->active);
if (next->prio < current->prio)
- current->need_resched = 1;
+ current->work.need_resched = 1;
if (!idle && --imbalance) {
if (array == busiest->expired) {
array = busiest->active;
@@ -528,7 +528,7 @@
return idle_tick();
/* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
- p->need_resched = 1;
+ p->work.need_resched = 1;
return;
}
spin_lock(&rq->lock);
@@ -539,7 +539,7 @@
*/
if ((p->policy == SCHED_RR) && !--p->time_slice) {
p->time_slice = NICE_TO_TIMESLICE(p->__nice);
- p->need_resched = 1;
+ p->work.need_resched = 1;

/* put it at the end of the queue: */
dequeue_task(p, rq->active);
@@ -559,7 +559,7 @@
p->sleep_avg--;
if (!--p->time_slice) {
dequeue_task(p, rq->active);
- p->need_resched = 1;
+ p->work.need_resched = 1;
p->prio = effective_prio(p);
p->time_slice = NICE_TO_TIMESLICE(p->__nice);
enqueue_task(p, TASK_INTERACTIVE(p) ? rq->active : rq->expired);
@@ -622,7 +622,7 @@
next = list_entry(queue->next, task_t, run_list);

switch_tasks:
- prev->need_resched = 0;
+ prev->work.need_resched = 0;

if (likely(prev != next)) {
rq->nr_switches++;
@@ -1246,7 +1246,7 @@
current->prio = MAX_PRIO;
current->state = TASK_RUNNING;
double_rq_unlock(this_rq, rq);
- current->need_resched = 1;
+ current->work.need_resched = 1;
__restore_flags(flags);
}

diff -uNr linux-2.5.3-pre5/kernel/signal.c linux-work-253p5/kernel/signal.c
--- linux-2.5.3-pre5/kernel/signal.c Tue Jan 22 09:06:00 2002
+++ linux-work-253p5/kernel/signal.c Fri Jan 25 15:01:07 2002
@@ -105,7 +105,7 @@
void
flush_signals(struct task_struct *t)
{
- t->sigpending = 0;
+ t->work.sigpending = 0;
flush_sigqueue(&t->pending);
}

@@ -119,7 +119,7 @@
if (atomic_dec_and_test(&sig->count))
kmem_cache_free(sigact_cachep, sig);
}
- tsk->sigpending = 0;
+ tsk->work.sigpending = 0;
flush_sigqueue(&tsk->pending);
spin_unlock_irq(&tsk->sigmask_lock);
}
@@ -246,7 +246,7 @@
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
if (!(current->notifier)(current->notifier_data)) {
- current->sigpending = 0;
+ current->work.sigpending = 0;
return 0;
}
}
@@ -465,7 +465,7 @@
*/
static inline void signal_wake_up(struct task_struct *t)
{
- t->sigpending = 1;
+ t->work.sigpending = 1;

#ifdef CONFIG_SMP
/*
diff -uNr linux-2.5.3-pre5/net/sunrpc/sched.c linux-work-253p5/net/sunrpc/sched.c
--- linux-2.5.3-pre5/net/sunrpc/sched.c Fri Jan 25 14:52:17 2002
+++ linux-work-253p5/net/sunrpc/sched.c Fri Jan 25 15:01:07 2002
@@ -1109,7 +1109,7 @@
unsigned long flags;

while (all_tasks) {
- current->sigpending = 0;
+ current->work.sigpending = 0;
rpc_killall_tasks(NULL);
__rpc_schedule();
if (all_tasks) {
@@ -1183,7 +1183,7 @@
* Usually rpciod will exit very quickly, so we
* wait briefly before checking the process id.
*/
- current->sigpending = 0;
+ current->work.sigpending = 0;
yield();
/*
* Display a message if we're going to wait longer.
diff -uNr linux-2.5.3-pre5/net/sunrpc/svc.c linux-work-253p5/net/sunrpc/svc.c
--- linux-2.5.3-pre5/net/sunrpc/svc.c Tue Jan 22 09:06:09 2002
+++ linux-work-253p5/net/sunrpc/svc.c Fri Jan 25 15:01:08 2002
@@ -185,7 +185,7 @@
progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port);

if (!port)
- current->sigpending = 0;
+ current->work.sigpending = 0;

for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/