[PATCH] syscall latency improvement #1 (253p6)

David Howells (dhowells@redhat.com)
Tue, 29 Jan 2002 15:25:50 +0000


Hi Linus,

I've re-done the patches against -pre6. This is patch #1 which provides an
extra notification from entry.S for use by task ornaments (patch to
follow). It also consolidates the work-to-be-done part of task_struct into a
single 32-bit word, and improves the syscall latency slightly.

The improvement now appears to be only ~1.05% (as opposed to 2.72% with
-pre5), now that the CLIs have been put back in.

This improvement is actually slightly better than it might otherwise seem:
without the consolidation, supporting the extra notification (which is, for me,
the main aim of this patch) would have required an extra test and failed
conditional branch in the system-call, interrupt and exception return paths.
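
For reference, here's a minimal userspace sketch (not part of the patch, and
only illustrative) of how the four per-task flags pack into one 32-bit word so
that the return path can test them all with a single load and masked test, as
resume_userspace does with "andl $0xffff00ff,%ecx". The union view and main()
are assumptions for the demo only:

/* illustrative only: mirror of the packed task_work layout (i386,
 * little-endian), viewed through a single 32-bit word */
#include <stdio.h>

struct task_work {
	signed char	need_resched;
	unsigned char	syscall_trace;	/* count of syscall interceptors */
	unsigned char	sigpending;
	unsigned char	notify_resume;	/* notify on userspace resumption */
} __attribute__((packed));

union work_word {
	struct task_work w;
	unsigned int	 whole;		/* the single 32-bit view */
};

int main(void)
{
	union work_word u = { .w = { .sigpending = 1 } };

	/* one load plus one masked test covers need_resched, sigpending and
	 * notify_resume; the 0xffff00ff mask skips the syscall_trace byte
	 * (bits 8-15), just as the andl in resume_userspace does */
	if (u.whole & 0xffff00ffU)
		printf("work pending on return to userspace\n");
	return 0;
}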

David

diff -uNr linux-2.5.3-pre6/arch/i386/kernel/entry.S linux-work-253p6/arch/i386/kernel/entry.S
--- linux-2.5.3-pre6/arch/i386/kernel/entry.S Tue Jan 29 08:19:23 2002
+++ linux-work-253p6/arch/i386/kernel/entry.S Tue Jan 29 08:38:49 2002
@@ -72,10 +72,13 @@
*/
state = 0
flags = 4
-sigpending = 8
+work = 8
+need_resched = work+0
+syscall_trace = work+1
+sigpending = work+2
+notify_resume = work+3
addr_limit = 12
exec_domain = 16
-need_resched = 20
tsk_ptrace = 24
cpu = 32

@@ -151,7 +154,7 @@
call *%edx
addl $4, %esp
popl %eax
- jmp ret_from_sys_call
+ jmp resume_userspace

ENTRY(lcall27)
pushfl # We get a different stack layout with call gates,
@@ -172,7 +175,7 @@
call *%edx
addl $4, %esp
popl %eax
- jmp ret_from_sys_call
+ jmp resume_userspace


ENTRY(ret_from_fork)
@@ -180,9 +183,7 @@
call SYMBOL_NAME(schedule_tail)
addl $4, %esp
GET_CURRENT(%ebx)
- testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
- jne tracesys_exit
- jmp ret_from_sys_call
+ jmp syscall_exit

/*
* Return to user mode is not as complex as all this looks,
@@ -191,73 +192,105 @@
* less clear than it otherwise should be.
*/

+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ENTRY(ret_from_intr)
+ GET_CURRENT(%ebx)
+ret_from_exception:
+ movl EFLAGS(%esp),%eax # mix EFLAGS and CS
+ movb CS(%esp),%al
+ testl $(VM_MASK | 3),%eax
+ jz restore_all # returning to kernel-space or vm86-space
+ENTRY(resume_userspace)
+ cli # make sure we don't miss an interrupt setting need_resched
+ # or sigpending between sampling and the iret
+ movl work(%ebx),%ecx
+ andl $0xffff00ff,%ecx # current->work (ignoring syscall_trace)
+ jne work_pending
+ jmp restore_all
+
+ # system call handler stub
+ ALIGN
ENTRY(system_call)
pushl %eax # save orig_eax
SAVE_ALL
GET_CURRENT(%ebx)
- testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
- jne tracesys
cmpl $(NR_syscalls),%eax
- jae badsys
+ jae syscall_badsys
+ testb $0xff,syscall_trace(%ebx) # system call tracing in operation
+ jnz syscall_trace_entry
+syscall_traced:
call *SYMBOL_NAME(sys_call_table)(,%eax,4)
- movl %eax,EAX(%esp) # save the return value
-ENTRY(ret_from_sys_call)
- cli # need_resched and signals atomic test
- cmpl $0,need_resched(%ebx)
- jne reschedule
- cmpl $0,sigpending(%ebx)
- jne signal_return
+ movl %eax,EAX(%esp) # store the return value
+syscall_exit:
+ cli # make sure we don't miss an interrupt setting need_resched
+ # or sigpending between sampling and the iret
+ movl work(%ebx),%ecx
+ testl %ecx,%ecx # current->work
+ jne syscall_exit_work
restore_all:
RESTORE_ALL

+ # perform work that needs to be done immediately before resumption
ALIGN
-signal_return:
- sti # we can get here from an interrupt handler
+work_pending:
+ testb %cl,%cl # current->work.need_resched
+ jz work_notifysig
+work_resched:
+ call SYMBOL_NAME(schedule)
+ cli # make sure we don't miss an interrupt setting need_resched
+ # or sigpending between sampling and the iret
+ movl work(%ebx),%ecx
+ andl $0xffff00ff,%ecx # ignore the syscall trace counter
+ jz restore_all
+ testb %cl,%cl # current->work.need_resched
+ jnz work_resched
+
+work_notifysig: # deal with pending signals and notify-resume requests
testl $(VM_MASK),EFLAGS(%esp)
movl %esp,%eax
- jne v86_signal_return
+ jne work_notifysig_v86 # returning to kernel-space or vm86-space
xorl %edx,%edx
- call SYMBOL_NAME(do_signal)
+ call SYMBOL_NAME(do_notify_resume)
jmp restore_all

ALIGN
-v86_signal_return:
+work_notifysig_v86:
+ pushl %ecx
call SYMBOL_NAME(save_v86_state)
+ popl %ecx
movl %eax,%esp
xorl %edx,%edx
- call SYMBOL_NAME(do_signal)
+ call SYMBOL_NAME(do_notify_resume)
jmp restore_all

+ # perform syscall exit tracing
ALIGN
-tracesys:
+syscall_trace_entry:
movl $-ENOSYS,EAX(%esp)
- call SYMBOL_NAME(syscall_trace)
+ movl %esp,%eax
+ xorl %edx,%edx
+ call SYMBOL_NAME(do_syscall_trace)
movl ORIG_EAX(%esp),%eax
cmpl $(NR_syscalls),%eax
- jae tracesys_exit
- call *SYMBOL_NAME(sys_call_table)(,%eax,4)
- movl %eax,EAX(%esp) # save the return value
-tracesys_exit:
- call SYMBOL_NAME(syscall_trace)
- jmp ret_from_sys_call
-badsys:
- movl $-ENOSYS,EAX(%esp)
- jmp ret_from_sys_call
+ jnae syscall_traced
+ jmp syscall_exit

+ # perform syscall exit tracing
ALIGN
-ENTRY(ret_from_intr)
- GET_CURRENT(%ebx)
-ret_from_exception:
- movl EFLAGS(%esp),%eax # mix EFLAGS and CS
- movb CS(%esp),%al
- testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor?
- jne ret_from_sys_call
- jmp restore_all
+syscall_exit_work:
+ testb %ch,%ch # current->work.syscall_trace
+ jz work_pending
+ sti # could let do_syscall_trace() call schedule() instead
+ movl %esp,%eax
+ movl $1,%edx
+ call SYMBOL_NAME(do_syscall_trace)
+ jmp resume_userspace

ALIGN
-reschedule:
- call SYMBOL_NAME(schedule) # test
- jmp ret_from_sys_call
+syscall_badsys:
+ movl $-ENOSYS,EAX(%esp)
+ jmp resume_userspace

ENTRY(divide_error)
pushl $0 # no error code
diff -uNr linux-2.5.3-pre6/arch/i386/kernel/process.c linux-work-253p6/arch/i386/kernel/process.c
--- linux-2.5.3-pre6/arch/i386/kernel/process.c Tue Jan 29 08:19:23 2002
+++ linux-work-253p6/arch/i386/kernel/process.c Tue Jan 29 08:32:57 2002
@@ -89,7 +89,7 @@

/*
* On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
+ * to poll the ->work.need_resched flag instead of waiting for the
* cross-CPU IPI to arrive. Use this option with caution.
*/
static void poll_idle (void)
@@ -102,15 +102,15 @@
* Deal with another CPU just having chosen a thread to
* run here:
*/
- oldval = xchg(&current->need_resched, -1);
+ oldval = xchg(&current->work.need_resched, -1);

if (!oldval)
asm volatile(
"2:"
- "cmpl $-1, %0;"
+ "cmpb $-1, %0;"
"rep; nop;"
"je 2b;"
- : :"m" (current->need_resched));
+ : :"m" (current->work.need_resched));
}

/*
diff -uNr linux-2.5.3-pre6/arch/i386/kernel/ptrace.c linux-work-253p6/arch/i386/kernel/ptrace.c
--- linux-2.5.3-pre6/arch/i386/kernel/ptrace.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p6/arch/i386/kernel/ptrace.c Tue Jan 29 08:32:57 2002
@@ -277,10 +277,18 @@
ret = -EIO;
if ((unsigned long) data > _NSIG)
break;
- if (request == PTRACE_SYSCALL)
- child->ptrace |= PT_TRACESYS;
- else
- child->ptrace &= ~PT_TRACESYS;
+ if (request == PTRACE_SYSCALL) {
+ if (!(child->ptrace & PT_SYSCALLTRACE)) {
+ child->ptrace |= PT_SYSCALLTRACE;
+ child->work.syscall_trace++;
+ }
+ }
+ else {
+ if (child->ptrace & PT_SYSCALLTRACE) {
+ child->ptrace &= ~PT_SYSCALLTRACE;
+ child->work.syscall_trace--;
+ }
+ }
child->exit_code = data;
/* make sure the single step bit is not set. */
tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
@@ -315,7 +323,10 @@
ret = -EIO;
if ((unsigned long) data > _NSIG)
break;
- child->ptrace &= ~PT_TRACESYS;
+ if (child->ptrace & PT_SYSCALLTRACE) {
+ child->ptrace &= ~PT_SYSCALLTRACE;
+ child->work.syscall_trace--;
+ }
if ((child->ptrace & PT_DTRACE) == 0) {
/* Spurious delayed TF traps may occur */
child->ptrace |= PT_DTRACE;
@@ -439,10 +450,14 @@
return ret;
}

-asmlinkage void syscall_trace(void)
+/* notification of system call entry/exit
+ * - triggered by current->work.syscall_trace
+ */
+__attribute__((regparm(3)))
+void do_syscall_trace(struct pt_regs *regs, int entryexit)
{
- if ((current->ptrace & (PT_PTRACED|PT_TRACESYS)) !=
- (PT_PTRACED|PT_TRACESYS))
+ if ((current->ptrace & (PT_PTRACED|PT_SYSCALLTRACE)) !=
+ (PT_PTRACED|PT_SYSCALLTRACE))
return;
/* the 0x80 provides a way for the tracing parent to distinguish
between a syscall stop and SIGTRAP delivery */
@@ -461,3 +476,15 @@
current->exit_code = 0;
}
}
+
+/* notification of userspace execution resumption
+ * - triggered by current->work.notify_resume
+ */
+__attribute__((regparm(3)))
+void do_notify_resume(struct pt_regs *regs, sigset_t *oldset,
+ struct task_work work_pending)
+{
+ /* deal with pending signal delivery */
+ if (work_pending.sigpending)
+ do_signal(regs,oldset);
+}
diff -uNr linux-2.5.3-pre6/arch/i386/kernel/signal.c linux-work-253p6/arch/i386/kernel/signal.c
--- linux-2.5.3-pre6/arch/i386/kernel/signal.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p6/arch/i386/kernel/signal.c Tue Jan 29 08:32:57 2002
@@ -28,8 +28,6 @@

#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))

-int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
-
int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from)
{
if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
diff -uNr linux-2.5.3-pre6/arch/i386/kernel/vm86.c linux-work-253p6/arch/i386/kernel/vm86.c
--- linux-2.5.3-pre6/arch/i386/kernel/vm86.c Tue Jan 22 09:06:49 2002
+++ linux-work-253p6/arch/i386/kernel/vm86.c Tue Jan 29 08:32:57 2002
@@ -212,7 +212,7 @@
info->regs.__null_ds = 0;
info->regs.__null_es = 0;

-/* we are clearing fs,gs later just before "jmp ret_from_sys_call",
+/* we are clearing fs,gs later just before "jmp resume_userspace",
* because starting with Linux 2.1.x they aren't no longer saved/restored
*/

@@ -255,7 +255,7 @@
__asm__ __volatile__(
"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
"movl %0,%%esp\n\t"
- "jmp ret_from_sys_call"
+ "jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "b" (tsk) : "ax");
/* we never return here */
@@ -268,7 +268,7 @@
regs32 = save_v86_state(regs16);
regs32->eax = retval;
__asm__ __volatile__("movl %0,%%esp\n\t"
- "jmp ret_from_sys_call"
+ "jmp resume_userspace"
: : "r" (regs32), "b" (current));
}

diff -uNr linux-2.5.3-pre6/fs/lockd/svc.c linux-work-253p6/fs/lockd/svc.c
--- linux-2.5.3-pre6/fs/lockd/svc.c Tue Jan 22 09:05:58 2002
+++ linux-work-253p6/fs/lockd/svc.c Tue Jan 29 08:32:57 2002
@@ -304,7 +304,7 @@
* Wait for the lockd process to exit, but since we're holding
* the lockd semaphore, we can't wait around forever ...
*/
- current->sigpending = 0;
+ current->work.sigpending = 0;
interruptible_sleep_on_timeout(&lockd_exit, HZ);
if (nlmsvc_pid) {
printk(KERN_WARNING
diff -uNr linux-2.5.3-pre6/fs/nfsd/export.c linux-work-253p6/fs/nfsd/export.c
--- linux-2.5.3-pre6/fs/nfsd/export.c Tue Jan 22 09:05:58 2002
+++ linux-work-253p6/fs/nfsd/export.c Tue Jan 29 08:32:57 2002
@@ -468,7 +468,7 @@
return 0;
}

- current->sigpending = 0;
+ current->work.sigpending = 0;
want_lock++;
while (hash_count || hash_lock) {
interruptible_sleep_on(&hash_wait);
diff -uNr linux-2.5.3-pre6/include/asm-i386/signal.h linux-work-253p6/include/asm-i386/signal.h
--- linux-2.5.3-pre6/include/asm-i386/signal.h Tue Jan 29 08:37:15 2002
+++ linux-work-253p6/include/asm-i386/signal.h Tue Jan 29 08:49:29 2002
@@ -2,6 +2,7 @@
#define _ASMi386_SIGNAL_H

#include <linux/types.h>
+#include <linux/linkage.h>

/* Avoid too many header ordering problems. */
struct siginfo;
@@ -216,6 +217,8 @@
return word;
}

+extern int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
+
#endif /* __KERNEL__ */

#endif
diff -uNr linux-2.5.3-pre6/include/linux/init_task.h linux-work-253p6/include/linux/init_task.h
--- linux-2.5.3-pre6/include/linux/init_task.h Tue Jan 29 08:43:49 2002
+++ linux-work-253p6/include/linux/init_task.h Tue Jan 29 08:55:49 2002
@@ -35,6 +35,14 @@
siglock: SPIN_LOCK_UNLOCKED \
}

+#define INIT_TASK_WORK \
+{ \
+ need_resched: 0, \
+ syscall_trace: 0, \
+ sigpending: 0, \
+ notify_resume: 0, \
+}
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -43,7 +51,7 @@
{ \
state: 0, \
flags: 0, \
- sigpending: 0, \
+ work: INIT_TASK_WORK, \
addr_limit: KERNEL_DS, \
exec_domain: &default_exec_domain, \
lock_depth: -1, \
diff -uNr linux-2.5.3-pre6/include/linux/sched.h linux-work-253p6/include/linux/sched.h
--- linux-2.5.3-pre6/include/linux/sched.h Tue Jan 29 08:37:15 2002
+++ linux-work-253p6/include/linux/sched.h Tue Jan 29 08:49:29 2002
@@ -229,19 +229,29 @@

typedef struct prio_array prio_array_t;

+/* this struct must occupy one 32-bit chunk so that it can be read in one go */
+struct task_work {
+ __s8 need_resched;
+ __u8 syscall_trace; /* count of syscall interceptors */
+ __u8 sigpending;
+ __u8 notify_resume; /* request for notification on
+ userspace execution resumption */
+} __attribute__((packed));
+
struct task_struct {
/*
* offsets of these are hardcoded elsewhere - touch with care
*/
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
unsigned long flags; /* per process flags, defined below */
- int sigpending;
+ volatile struct task_work work;
+
mm_segment_t addr_limit; /* thread address space:
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
struct exec_domain *exec_domain;
- volatile long need_resched;
+ long __pad;
unsigned long ptrace;

int lock_depth; /* Lock depth */
@@ -381,7 +391,7 @@
*/

#define PT_PTRACED 0x00000001
-#define PT_TRACESYS 0x00000002
+#define PT_SYSCALLTRACE 0x00000002 /* T if syscall_trace is +1 for ptrace() */
#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */
#define PT_TRACESYSGOOD 0x00000008
#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */
@@ -575,12 +585,12 @@

static inline int signal_pending(struct task_struct *p)
{
- return (p->sigpending != 0);
+ return (p->work.sigpending != 0);
}

static inline int need_resched(void)
{
- return unlikely(current->need_resched != 0);
+ return unlikely(current->work.need_resched != 0);
}

static inline void cond_resched(void)
@@ -625,7 +635,7 @@

static inline void recalc_sigpending(struct task_struct *t)
{
- t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+ t->work.sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
}

/* True if we are on the alternate signal stack. */
diff -uNr linux-2.5.3-pre6/kernel/fork.c linux-work-253p6/kernel/fork.c
--- linux-2.5.3-pre6/kernel/fork.c Tue Jan 29 08:19:26 2002
+++ linux-work-253p6/kernel/fork.c Tue Jan 29 08:32:57 2002
@@ -631,7 +631,7 @@
}
spin_lock_init(&p->alloc_lock);

- p->sigpending = 0;
+ p->work.sigpending = 0;
init_sigpending(&p->pending);

p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
@@ -755,7 +755,7 @@
* Let the child process run first, to avoid most of the
* COW overhead when the child exec()s afterwards.
*/
- current->need_resched = 1;
+ current->work.need_resched = 1;

fork_out:
return retval;
diff -uNr linux-2.5.3-pre6/kernel/sched.c linux-work-253p6/kernel/sched.c
--- linux-2.5.3-pre6/kernel/sched.c Tue Jan 29 08:19:26 2002
+++ linux-work-253p6/kernel/sched.c Tue Jan 29 08:35:10 2002
@@ -194,9 +194,9 @@
{
int need_resched;

- need_resched = p->need_resched;
+ need_resched = p->work.need_resched;
wmb();
- p->need_resched = 1;
+ p->work.need_resched = 1;
if (!need_resched && (p->cpu != smp_processor_id()))
smp_send_reschedule(p->cpu);
}
@@ -523,7 +523,7 @@
this_rq->nr_running++;
enqueue_task(next, this_rq->active);
if (next->prio < current->prio)
- current->need_resched = 1;
+ current->work.need_resched = 1;
if (!idle && --imbalance) {
if (array == busiest->expired) {
array = busiest->active;
@@ -572,7 +572,7 @@
#endif
/* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
- p->need_resched = 1;
+ p->work.need_resched = 1;
return;
}
spin_lock(&rq->lock);
@@ -583,7 +583,7 @@
*/
if ((p->policy == SCHED_RR) && !--p->time_slice) {
p->time_slice = NICE_TO_TIMESLICE(p->__nice);
- p->need_resched = 1;
+ p->work.need_resched = 1;

/* put it at the end of the queue: */
dequeue_task(p, rq->active);
@@ -603,7 +603,7 @@
p->sleep_avg--;
if (!--p->time_slice) {
dequeue_task(p, rq->active);
- p->need_resched = 1;
+ p->work.need_resched = 1;
p->prio = effective_prio(p);
p->time_slice = NICE_TO_TIMESLICE(p->__nice);

@@ -684,7 +684,7 @@

switch_tasks:
prefetch(next);
- prev->need_resched = 0;
+ prev->work.need_resched = 0;

if (likely(prev != next)) {
rq->nr_switches++;
@@ -1318,7 +1318,7 @@
idle->state = TASK_RUNNING;
idle->cpu = cpu;
double_rq_unlock(idle_rq, rq);
- idle->need_resched = 1;
+ idle->work.need_resched = 1;
__restore_flags(flags);
}

diff -uNr linux-2.5.3-pre6/kernel/signal.c linux-work-253p6/kernel/signal.c
--- linux-2.5.3-pre6/kernel/signal.c Tue Jan 22 09:06:00 2002
+++ linux-work-253p6/kernel/signal.c Tue Jan 29 08:32:57 2002
@@ -105,7 +105,7 @@
void
flush_signals(struct task_struct *t)
{
- t->sigpending = 0;
+ t->work.sigpending = 0;
flush_sigqueue(&t->pending);
}

@@ -119,7 +119,7 @@
if (atomic_dec_and_test(&sig->count))
kmem_cache_free(sigact_cachep, sig);
}
- tsk->sigpending = 0;
+ tsk->work.sigpending = 0;
flush_sigqueue(&tsk->pending);
spin_unlock_irq(&tsk->sigmask_lock);
}
@@ -246,7 +246,7 @@
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
if (!(current->notifier)(current->notifier_data)) {
- current->sigpending = 0;
+ current->work.sigpending = 0;
return 0;
}
}
@@ -465,7 +465,7 @@
*/
static inline void signal_wake_up(struct task_struct *t)
{
- t->sigpending = 1;
+ t->work.sigpending = 1;

#ifdef CONFIG_SMP
/*
diff -uNr linux-2.5.3-pre6/net/sunrpc/sched.c linux-work-253p6/net/sunrpc/sched.c
--- linux-2.5.3-pre6/net/sunrpc/sched.c Tue Jan 29 08:19:26 2002
+++ linux-work-253p6/net/sunrpc/sched.c Tue Jan 29 08:32:57 2002
@@ -1109,7 +1109,7 @@
unsigned long flags;

while (all_tasks) {
- current->sigpending = 0;
+ current->work.sigpending = 0;
rpc_killall_tasks(NULL);
__rpc_schedule();
if (all_tasks) {
@@ -1183,7 +1183,7 @@
* Usually rpciod will exit very quickly, so we
* wait briefly before checking the process id.
*/
- current->sigpending = 0;
+ current->work.sigpending = 0;
yield();
/*
* Display a message if we're going to wait longer.
diff -uNr linux-2.5.3-pre6/net/sunrpc/svc.c linux-work-253p6/net/sunrpc/svc.c
--- linux-2.5.3-pre6/net/sunrpc/svc.c Tue Jan 22 09:06:09 2002
+++ linux-work-253p6/net/sunrpc/svc.c Tue Jan 29 08:32:57 2002
@@ -185,7 +185,7 @@
progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port);

if (!port)
- current->sigpending = 0;
+ current->work.sigpending = 0;

for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)