question regarding arch/i386/kernel/entry.S and stack/register usage

Dave Peterson (dsp@llnl.gov)
Sat, 28 Jun 2003 21:04:06 -0700


I have been experimenting with some code I wrote that checks for kernel
stack overflows and I have been observing some rather odd behavior. I
am working with a 2.4.18 kernel and my code changes are as follows:

1. I added a field to the end of the task_struct
as follows:

#define STACK_RED_ZONE_SIZE 128

struct task_struct {
...
[ contents omitted for brevity ]
...
uint32_t stack_red_zone[STACK_RED_ZONE_SIZE];
}

2. I added the following function to kernel/sys.c:

/*
 * check_stack_red_zone - verify that the current task's stack red zone
 * is intact.
 *
 * Compares each of the STACK_RED_ZONE_SIZE uint32_t words of
 * current->stack_red_zone against STACK_RED_ZONE_VALUE. If every word
 * matches, the function simply returns. On the first mismatch it jumps to
 * the failure path, which prints the entire red zone, runs SCAN_STACK on
 * the address just past the task_struct, and calls panic().
 *
 * Takes no arguments and returns no value.
 *
 * NOTE(review): STACK_RED_ZONE_VALUE, SCAN_STACK and unlikely() are not
 * defined in the code shown here -- confirm their definitions.
 */
asmlinkage void check_stack_red_zone(void)
{
int i;
uint32_t *red_zone;
char *stack;

red_zone = current->stack_red_zone;

/* Any word that differs from STACK_RED_ZONE_VALUE is treated as corruption. */
for (i = 0; i < STACK_RED_ZONE_SIZE; i++) {
if (unlikely(red_zone[i] != STACK_RED_ZONE_VALUE))
goto fail;
}

return;

fail:
printk("kernel stack red zone corrupt\n");
printk("red zone contents:\n");

/* Dump every red-zone word, not only the one that failed the check. */
for (i = 0; i < STACK_RED_ZONE_SIZE; i++)
printk("0x%x ", red_zone[i]);

printk("\n");
/*
 * Compute the address immediately after the task_struct; the char * cast
 * makes the sizeof() addition byte-granular.
 * NOTE(review): presumably this is where the task's kernel stack area
 * begins -- confirm against the task_struct/stack layout in use.
 */
stack = (char *) current;
stack += sizeof(struct task_struct);
SCAN_STACK((unsigned long *) stack); /* show stack trace */
panic("kernel stack overflow");
}

3. I added some code that initializes the stack_red_zone field of the
initial task at boot time.

4. I modified arch/i386/kernel/entry.S so that check_stack_red_zone()
is called after each syscall or exception handler (but not interrupt
handlers). My code changes are shown by the diff appended to this
message. The only relevant changes are the ones that look like this:

+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif

All other code changes are unrelated.

With my code changes as described above, things work as expected most of the
time. Occasionally when I boot my kernel, I observe the following:

Real Time Clock Driver v1.10e
block: 1024 slots per queue, batch=256
Uniform Multi-Platform E-IDE driver Revision: 6.31
ide: Assuming 33MHz system bus speed for PIO modes; override with idebus=xx
PIIX4: IDE controller on PCI bus 00 dev f9
PCI: No IRQ known for interrupt pin A of device 00:1f.1. Probably buggy MP tab
PIIX4: chipset revision 2
PIIX4: not 100% native mode: will probe irqs later
PIIX4: ATA-66/100 forced bit set (WARNING)!!
ide0: BM-DMA at 0x3c40-0x3c47, BIOS settings: hda:DMA, hdb:pio
ide1: BM-DMA at 0x3c48-0x3c4f, BIOS settings: hdc:pio, hdd:pio
hda: C/H/S=0/0/0 from BIOS ignored
hda: IC35L120AVVA07-0, ATA DISK drive
Allocated IRQ 14 task f75de000
ide0 at 0x1f0-0x1f7,0x3f6 on irq 14
blk: queue c04777e4, I/O limit 4095Mb (mask 0xffffffff)
blk: queue c04777e4, I/O limit 4095Mb (mask 0xffffffff)
hda: lost interrupt
hda: lost interrupt
hda: lost interrupt
hda: 241254720 sectors (123522 MB) w/1863KiB Cache, CHS=239340/16/63, UDMA(100
hda: lost interrupt

[ "lost interrupt" messages continue... ]

I find this surprising because AFAICS my changes to entry.S affect only
syscall and exception handlers, not interrupt handlers. Is there anything that
looks clearly wrong with my code changes? Do my changes to entry.S violate some
assumption I am unaware of regarding the layout of data on the kernel stack or
the contents of registers?

-Dave Peterson
dsp@llnl.gov

***** start of diff *********************************************************
--- entry.S.2_4_18 Sat Jun 28 19:55:26 2003
+++ entry.S.modified Sat Jun 28 19:55:58 2003
@@ -63,7 +63,9 @@
OLDSS = 0x38

CF_MASK = 0x00000001
+TF_MASK = 0x00000100
IF_MASK = 0x00000200
+DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000

@@ -77,7 +79,7 @@
exec_domain = 16
need_resched = 20
tsk_ptrace = 24
-processor = 52
+cpu = 32

ENOSYS = 38

@@ -140,6 +142,9 @@
movl CS(%esp),%edx # this is eip..
movl EFLAGS(%esp),%ecx # and this is cs..
movl %eax,EFLAGS(%esp) #
+ andl $~(NT_MASK|TF_MASK|DF_MASK), %eax
+ pushl %eax
+ popfl
movl %edx,EIP(%esp) # Now we move them to their "normal" places
movl %ecx,CS(%esp) #
movl %esp,%ebx
@@ -149,6 +154,9 @@
movl 4(%edx),%edx # Get the lcall7 handler for the domain
pushl $0x7
call *%edx
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
addl $4, %esp
popl %eax
jmp ret_from_sys_call
@@ -161,6 +169,9 @@
movl CS(%esp),%edx # this is eip..
movl EFLAGS(%esp),%ecx # and this is cs..
movl %eax,EFLAGS(%esp) #
+ andl $~(NT_MASK|TF_MASK|DF_MASK), %eax
+ pushl %eax
+ popfl
movl %edx,EIP(%esp) # Now we move them to their "normal" places
movl %ecx,CS(%esp) #
movl %esp,%ebx
@@ -170,15 +181,19 @@
movl 4(%edx),%edx # Get the lcall7 handler for the domain
pushl $0x27
call *%edx
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
addl $4, %esp
popl %eax
jmp ret_from_sys_call

-
ENTRY(ret_from_fork)
+#if CONFIG_SMP
pushl %ebx
call SYMBOL_NAME(schedule_tail)
addl $4, %esp
+#endif
GET_CURRENT(%ebx)
testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
jne tracesys_exit
@@ -201,6 +216,9 @@
jae badsys
call *SYMBOL_NAME(sys_call_table)(,%eax,4)
movl %eax,EAX(%esp) # save the return value
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
ENTRY(ret_from_sys_call)
cli # need_resched and signals atomic test
cmpl $0,need_resched(%ebx)
@@ -232,13 +250,22 @@
tracesys:
movl $-ENOSYS,EAX(%esp)
call SYMBOL_NAME(syscall_trace)
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
movl ORIG_EAX(%esp),%eax
cmpl $(NR_syscalls),%eax
jae tracesys_exit
call *SYMBOL_NAME(sys_call_table)(,%eax,4)
movl %eax,EAX(%esp) # save the return value
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
tracesys_exit:
call SYMBOL_NAME(syscall_trace)
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
jmp ret_from_sys_call
badsys:
movl $-ENOSYS,EAX(%esp)
@@ -288,6 +315,9 @@
movl %edx,%es
GET_CURRENT(%ebx)
call *%edi
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
addl $8,%esp
jmp ret_from_exception

@@ -309,10 +339,16 @@
testl $0x4,%eax # EM (math emulation bit)
jne device_not_available_emulate
call SYMBOL_NAME(math_state_restore)
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
jmp ret_from_exception
device_not_available_emulate:
pushl $0 # temporary storage for ORIG_EIP
call SYMBOL_NAME(math_emulate)
+#ifdef CONFIG_STACK_RED_ZONE
+ call SYMBOL_NAME(check_stack_red_zone)
+#endif
addl $4,%esp
jmp ret_from_exception

@@ -584,8 +620,8 @@
.long SYMBOL_NAME(sys_capset) /* 185 */
.long SYMBOL_NAME(sys_sigaltstack)
.long SYMBOL_NAME(sys_sendfile)
- .long SYMBOL_NAME(sys_ni_syscall) /* streams1 */
- .long SYMBOL_NAME(sys_ni_syscall) /* streams2 */
+ .long SYMBOL_NAME(sys_getpmsg) /* streams1 */
+ .long SYMBOL_NAME(sys_putpmsg) /* streams2 */
.long SYMBOL_NAME(sys_vfork) /* 190 */
.long SYMBOL_NAME(sys_getrlimit)
.long SYMBOL_NAME(sys_mmap2)
@@ -618,7 +654,15 @@
.long SYMBOL_NAME(sys_madvise)
.long SYMBOL_NAME(sys_getdents64) /* 220 */
.long SYMBOL_NAME(sys_fcntl64)
- .long SYMBOL_NAME(sys_ni_syscall) /* reserved for TUX */
+#ifdef CONFIG_TUX
+ .long SYMBOL_NAME(__sys_tux)
+#else
+# ifdef CONFIG_TUX_MODULE
+ .long SYMBOL_NAME(sys_tux)
+# else
+ .long SYMBOL_NAME(sys_ni_syscall)
+# endif
+#endif
.long SYMBOL_NAME(sys_ni_syscall) /* Reserved for Security */
.long SYMBOL_NAME(sys_gettid)
.long SYMBOL_NAME(sys_readahead) /* 225 */
@@ -634,6 +678,13 @@
.long SYMBOL_NAME(sys_ni_syscall) /* 235 reserved for removexattr */
.long SYMBOL_NAME(sys_ni_syscall) /* reserved for lremovexattr */
.long SYMBOL_NAME(sys_ni_syscall) /* reserved for fremovexattr */
+ .long SYMBOL_NAME(sys_tkill)
+ .long SYMBOL_NAME(sys_sendfile64) /* reserved for sendfile64 */
+ .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */
+ .long SYMBOL_NAME(sys_sched_setaffinity)
+ .long SYMBOL_NAME(sys_sched_getaffinity)
+ .long SYMBOL_NAME(sys_statfs64)
+ .long SYMBOL_NAME(sys_fstatfs64)

.rept NR_syscalls-(.-sys_call_table)/4
.long SYMBOL_NAME(sys_ni_syscall)

***** end of diff ***********************************************************
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/