From 03a8357e84a9a7c478d787c4705dddf70dc99f7d Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Wed, 7 Nov 2018 21:19:47 +0100 Subject: [PATCH] Implement sending signals to blocked-in-kernel processes. This is dirty but pretty cool! If we have a pending, unmasked signal for a process that's blocked inside the kernel, we set up alternate stacks for that process and unblock it to execute the signal handler. A slightly different return trampoline is used here: since we need to get back into the kernel, a dedicated syscall is used (sys$sigreturn.) This restores the TSS contents of the process to the state it was in while we were originally blocking in the kernel. NOTE: There's currently only one "kernel resume TSS" so signal nesting definitely won't work. --- Kernel/Process.cpp | 128 +++++++++++++++++++++--- Kernel/Process.h | 18 +++- Kernel/Syscall.cpp | 6 +- Kernel/Syscall.h | 2 + Userland/sh.cpp | 8 +- Userland/sleep.cpp | 47 ++++++++- VirtualFileSystem/Ext2FileSystem.cpp | 2 +- VirtualFileSystem/FileDescriptor.cpp | 2 +- VirtualFileSystem/FullDevice.cpp | 2 +- VirtualFileSystem/VirtualFileSystem.cpp | 2 +- 10 files changed, 190 insertions(+), 27 deletions(-) diff --git a/Kernel/Process.cpp b/Kernel/Process.cpp index 4071203e676..5fd261ab655 100644 --- a/Kernel/Process.cpp +++ b/Kernel/Process.cpp @@ -15,6 +15,7 @@ #include "ProcFileSystem.h" #include #include +#include "Syscall.h" //#define DEBUG_IO //#define TASK_DEBUG @@ -371,9 +372,9 @@ int Process::exec(const String& path, Vector&& arguments, Vector m_tss.gs = 0x23; m_tss.ss = 0x23; m_tss.cr3 = (dword)m_page_directory; - auto* stack_region = allocate_region(LinearAddress(), defaultStackSize, "stack"); - ASSERT(stack_region); - m_stackTop3 = stack_region->linearAddress.offset(defaultStackSize).get() & 0xfffffff8; + m_stack_region = allocate_region(LinearAddress(), defaultStackSize, "stack"); + ASSERT(m_stack_region); + m_stackTop3 = m_stack_region->linearAddress.offset(defaultStackSize).get() & 0xfffffff8; 
m_tss.esp = m_stackTop3; m_tss.ss0 = 0x10; m_tss.esp0 = old_esp0; @@ -783,20 +784,49 @@ void Process::dispatch_signal(byte signal) return terminate_due_to_signal(signal); } + m_tss_to_resume_kernel = m_tss; +#ifdef SIGNAL_DEBUG + kprintf("resume tss pc: %w:%x\n", m_tss_to_resume_kernel.cs, m_tss_to_resume_kernel.eip); +#endif + + word ret_ss = m_tss.ss; + dword ret_esp = m_tss.esp; word ret_cs = m_tss.cs; dword ret_eip = m_tss.eip; dword ret_eflags = m_tss.eflags; + bool interrupting_in_kernel = (ret_cs & 3) == 0; + if ((ret_cs & 3) == 0) { // FIXME: Handle send_signal to process currently in kernel code. - kprintf("Boo! dispatch_signal in %s(%u) with return to %w:%x\n", name().characters(), pid(), ret_cs, ret_eip); - ASSERT_NOT_REACHED(); + dbgprintf("dispatch_signal to %s(%u) in state=%s with return to %w:%x\n", name().characters(), pid(), toString(state()), ret_cs, ret_eip); + ASSERT(is_blocked()); } ProcessPagingScope pagingScope(*this); + + if (interrupting_in_kernel) { + if (!m_signal_stack_user_region) { + m_signal_stack_user_region = allocate_region(LinearAddress(), defaultStackSize, "signal stack (user)"); + ASSERT(m_signal_stack_user_region); + m_signal_stack_kernel_region = allocate_region(LinearAddress(), defaultStackSize, "signal stack (kernel)"); + ASSERT(m_signal_stack_kernel_region); + } + m_tss.ss = 0x23; + m_tss.esp = m_signal_stack_user_region->linearAddress.offset(defaultStackSize).get() & 0xfffffff8; + m_tss.ss0 = 0x10; + m_tss.esp0 = m_signal_stack_kernel_region->linearAddress.offset(defaultStackSize).get() & 0xfffffff8; + push_value_on_stack(ret_eflags); + push_value_on_stack(ret_cs); + push_value_on_stack(ret_eip); + } else { + push_value_on_stack(ret_cs); + push_value_on_stack(ret_eip); + push_value_on_stack(ret_eflags); + } + + // PUSHA dword old_esp = m_tss.esp; - push_value_on_stack(ret_eip); - push_value_on_stack(ret_eflags); push_value_on_stack(m_tss.eax); push_value_on_stack(m_tss.ecx); push_value_on_stack(m_tss.edx); @@ -805,31 +835,66 
@@ void Process::dispatch_signal(byte signal) push_value_on_stack(m_tss.ebp); push_value_on_stack(m_tss.esi); push_value_on_stack(m_tss.edi); + m_tss.eax = (dword)signal; m_tss.cs = 0x1b; + m_tss.ds = 0x23; + m_tss.es = 0x23; + m_tss.fs = 0x23; + m_tss.gs = 0x23; m_tss.eip = handler_laddr.get(); - if (m_return_from_signal_trampoline.is_null()) { + if (m_return_to_ring3_from_signal_trampoline.is_null()) { // FIXME: This should be a global trampoline shared by all processes, not one created per process! // FIXME: Remap as read-only after setup. auto* region = allocate_region(LinearAddress(), PAGE_SIZE, "signal_trampoline", true, true); - m_return_from_signal_trampoline = region->linearAddress; - byte* code_ptr = m_return_from_signal_trampoline.asPtr(); + m_return_to_ring3_from_signal_trampoline = region->linearAddress; + byte* code_ptr = m_return_to_ring3_from_signal_trampoline.asPtr(); *code_ptr++ = 0x61; // popa *code_ptr++ = 0x9d; // popf *code_ptr++ = 0xc3; // ret *code_ptr++ = 0x0f; // ud2 *code_ptr++ = 0x0b; + + m_return_to_ring0_from_signal_trampoline = LinearAddress((dword)code_ptr); + *code_ptr++ = 0x61; // popa + *code_ptr++ = 0xb8; // mov eax, + *(dword*)code_ptr = Syscall::SC_sigreturn; + code_ptr += sizeof(dword); + *code_ptr++ = 0xcd; // int 0x80 + *code_ptr++ = 0x80; + *code_ptr++ = 0x0f; // ud2 + *code_ptr++ = 0x0b; + // FIXME: For !SA_NODEFER, maybe we could do something like emitting an int 0x80 syscall here that // unmasks the signal so it can be received again? I guess then I would need one trampoline // per signal number if it's hard-coded, but it's just a few bytes per each. 
} - push_value_on_stack(m_return_from_signal_trampoline.get()); + if (interrupting_in_kernel) + push_value_on_stack(m_return_to_ring0_from_signal_trampoline.get()); + else + push_value_on_stack(m_return_to_ring3_from_signal_trampoline.get()); m_pending_signals &= ~(1 << signal); +#ifdef SIGNAL_DEBUG dbgprintf("signal: Okay, %s(%u) has been primed\n", name().characters(), pid()); +#endif +} + +void Process::sys$sigreturn() +{ + InterruptDisabler disabler; + m_tss = m_tss_to_resume_kernel; +#ifdef SIGNAL_DEBUG + dbgprintf("sys$sigreturn in %s(%u)\n", name().characters(), pid()); + dbgprintf(" -> resuming execution at %w:%x\n", m_tss.cs, m_tss.eip); +#endif + loadTaskRegister(s_kernelProcess->selector()); + sched_yield(); + kprintf("sys$sigreturn failed in %s(%u)\n", name().characters(), pid()); + ASSERT_NOT_REACHED(); } void Process::push_value_on_stack(dword value) @@ -871,7 +936,7 @@ void Process::doHouseKeeping() int sched_yield() { if (!current) { - kprintf( "PANIC: yield() with !current" ); + kprintf("PANIC: sched_yield() with !current"); HANG; } @@ -921,6 +986,18 @@ static void for_each_process_not_in_state(Process::State state, Callback callbac } } +template +static void for_each_blocked_process(Callback callback) +{ + ASSERT_INTERRUPTS_DISABLED(); + for (auto* process = s_processes->head(); process;) { + auto* next_process = process->next(); + if (process->is_blocked()) + callback(*process); + process = next_process; + } +} + bool scheduleNewProcess() { ASSERT_INTERRUPTS_DISABLED(); @@ -955,6 +1032,7 @@ bool scheduleNewProcess() if (process->state() == Process::BlockedRead) { ASSERT(process->m_fdBlockedOnRead != -1); + // FIXME: Block until the amount of data wanted is available. 
if (process->m_file_descriptors[process->m_fdBlockedOnRead]->hasDataAvailableForRead()) process->unblock(); continue; @@ -980,7 +1058,19 @@ bool scheduleNewProcess() for_each_process_not_in_state(Process::Dead, [] (auto& process) { if (!process.has_unmasked_pending_signals()) return; + // We know how to interrupt blocked processes, but if they are just executing + // at some random point in the kernel, let them continue. They'll be in userspace + // sooner or later and we can deliver the signal then. + // FIXME: Maybe we could check when returning from a syscall if there's a pending + // signal and dispatch it then and there? Would that be doable without the + // syscall effectively being "interrupted" despite having completed? + if (process.in_kernel() && !process.is_blocked()) + return; process.dispatch_one_pending_signal(); + if (process.is_blocked()) { + process.m_was_interrupted_while_blocked = true; + process.unblock(); + } }); #ifdef SCHEDULER_DEBUG @@ -1000,7 +1090,7 @@ bool scheduleNewProcess() if (process->state() == Process::Runnable || process->state() == Process::Running) { #ifdef SCHEDULER_DEBUG - dbgprintf("switch to %s(%u) (%p vs %p)\n", process->name().characters(), process->pid(), process, current); + dbgprintf("switch to %s(%u)\n", process->name().characters(), process->pid()); #endif return contextSwitch(process); } @@ -1177,6 +1267,8 @@ ssize_t Process::sys$read(int fd, void* outbuf, size_t nread) m_fdBlockedOnRead = fd; block(BlockedRead); sched_yield(); + if (m_was_interrupted_while_blocked) + return -EINTR; } } nread = descriptor->read((byte*)outbuf, nread); @@ -1345,6 +1437,11 @@ int Process::sys$sleep(unsigned seconds) if (!seconds) return 0; sleep(seconds * TICKS_PER_SECOND); + if (m_wakeupTime > system.uptime) { + ASSERT(m_was_interrupted_while_blocked); + dword ticks_left_until_original_wakeup_time = m_wakeupTime - system.uptime; + return ticks_left_until_original_wakeup_time / TICKS_PER_SECOND; + } return 0; } @@ -1407,6 +1504,8 @@ 
pid_t Process::sys$waitpid(pid_t waitee, int* wstatus, int options) m_waitee_status = 0; block(BlockedWait); sched_yield(); + if (m_was_interrupted_while_blocked) + return -EINTR; if (wstatus) *wstatus = m_waitee_status; return m_waitee; @@ -1423,7 +1522,8 @@ void Process::block(Process::State state) { ASSERT(current->state() == Process::Running); system.nblocked++; - current->set_state(state); + m_was_interrupted_while_blocked = false; + set_state(state); } void block(Process::State state) diff --git a/Kernel/Process.h b/Kernel/Process.h index b9c2d0f0108..32c7e803ae2 100644 --- a/Kernel/Process.h +++ b/Kernel/Process.h @@ -51,6 +51,13 @@ public: bool isRing0() const { return m_ring == Ring0; } bool isRing3() const { return m_ring == Ring3; } + bool is_blocked() const + { + return m_state == BlockedSleep || m_state == BlockedWait || m_state == BlockedRead; + } + + bool in_kernel() const { return (m_tss.cs & 0x03) == 0; } + static Process* fromPID(pid_t); static Process* kernelProcess(); @@ -115,6 +122,7 @@ public: int sys$kill(pid_t pid, int sig); int sys$geterror() { return m_error; } void sys$exit(int status); + void sys$sigreturn(); pid_t sys$spawn(const char* path, const char** args, const char** envp); pid_t sys$waitpid(pid_t, int* wstatus, int options); void* sys$mmap(void*, size_t size); @@ -212,6 +220,7 @@ private: State m_state { Invalid }; DWORD m_wakeupTime { 0 }; TSS32 m_tss; + TSS32 m_tss_to_resume_kernel; Vector> m_file_descriptors; RingLevel m_ring { Ring0 }; int m_error { 0 }; @@ -243,16 +252,23 @@ private: // FIXME: Implement some kind of ASLR? 
LinearAddress m_nextRegion; - LinearAddress m_return_from_signal_trampoline; + LinearAddress m_return_to_ring3_from_signal_trampoline; + LinearAddress m_return_to_ring0_from_signal_trampoline; pid_t m_ppid { 0 }; mode_t m_umask { 022 }; + bool m_was_interrupted_while_blocked { false }; + static void notify_waiters(pid_t waitee, int exit_status, int signal); Vector m_arguments; Vector m_initialEnvironment; HashTable m_gids; + + Region* m_stack_region { nullptr }; + Region* m_signal_stack_user_region { nullptr }; + Region* m_signal_stack_kernel_region { nullptr }; }; class ProcessInspectionScope { diff --git a/Kernel/Syscall.cpp b/Kernel/Syscall.cpp index a6695c377f6..3a418edfa4d 100644 --- a/Kernel/Syscall.cpp +++ b/Kernel/Syscall.cpp @@ -54,7 +54,7 @@ static DWORD handle(RegisterDump& regs, DWORD function, DWORD arg1, DWORD arg2, Console::the().putChar(arg1 & 0xff); break; case Syscall::SC_sleep: - return current->sys$sleep(arg1); + return current->sys$sleep((unsigned)arg1); case Syscall::SC_gettimeofday: return current->sys$gettimeofday((timeval*)arg1); case Syscall::SC_spawn: @@ -156,6 +156,10 @@ static DWORD handle(RegisterDump& regs, DWORD function, DWORD arg1, DWORD arg2, return current->sys$getgroups((int)arg1, (gid_t*)arg2); case Syscall::SC_setgroups: return current->sys$setgroups((size_t)arg1, (const gid_t*)arg2); + case Syscall::SC_sigreturn: + current->sys$sigreturn(); + ASSERT_NOT_REACHED(); + return 0; default: kprintf("<%u> int0x80: Unknown function %x requested {%x, %x, %x}\n", current->pid(), function, arg1, arg2, arg3); break; diff --git a/Kernel/Syscall.h b/Kernel/Syscall.h index d60ea052e3f..a9f8187769a 100644 --- a/Kernel/Syscall.h +++ b/Kernel/Syscall.h @@ -54,6 +54,7 @@ __ENUMERATE_SYSCALL(umask) \ __ENUMERATE_SYSCALL(getgroups) \ __ENUMERATE_SYSCALL(setgroups) \ + __ENUMERATE_SYSCALL(sigreturn) \ #define DO_SYSCALL_A0(function) Syscall::invoke((dword)(function)) @@ -78,6 +79,7 @@ inline constexpr const char* toString(Function function) 
ENUMERATE_SYSCALLS #undef __ENUMERATE_SYSCALL } + return "Unknown"; } void initialize(); diff --git a/Userland/sh.cpp b/Userland/sh.cpp index 19870439644..039bada919c 100644 --- a/Userland/sh.cpp +++ b/Userland/sh.cpp @@ -331,8 +331,12 @@ int main(int, char**) char keybuf[16]; ssize_t nread = read(0, keybuf, sizeof(keybuf)); if (nread < 0) { - printf("failed to read :(\n"); - return 2; + if (errno == EINTR) { + // Ignore. :^) + } else { + perror("read failed"); + return 2; + } } for (ssize_t i = 0; i < nread; ++i) { putchar(keybuf[i]); diff --git a/Userland/sleep.cpp b/Userland/sleep.cpp index c0e7c44eea8..79cffbfc1b4 100644 --- a/Userland/sleep.cpp +++ b/Userland/sleep.cpp @@ -1,10 +1,47 @@ -#include -#include +#include +#include +#include +#include -int main(int c, char** v) +static unsigned parseUInt(const String& str, bool& ok) { - unsigned secs = 10; - sleep(secs); + unsigned value = 0; + for (size_t i = 0; i < str.length(); ++i) { + if (str[i] < '0' || str[i] > '9') { + ok = false; + return 0; + } + value = value * 10; + value += str[i] - '0'; + } + ok = true; + return value; +} + +void handle_sigint(int) +{ +} + +int main(int argc, char** argv) +{ + if (argc != 2) { + printf("usage: sleep \n"); + return 1; + } + bool ok; + unsigned secs = parseUInt(argv[1], ok); + if (!ok) { + fprintf(stderr, "Not a valid number of seconds: \"%s\"\n", argv[1]); + return 1; + } + struct sigaction sa; + memset(&sa, 0, sizeof(struct sigaction)); + sa.sa_handler = handle_sigint; + sigaction(SIGINT, &sa, nullptr); + unsigned remaining = sleep(secs); + if (remaining) { + printf("Sleep interrupted with %u seconds remaining.\n", remaining); + } return 0; } diff --git a/VirtualFileSystem/Ext2FileSystem.cpp b/VirtualFileSystem/Ext2FileSystem.cpp index 24be4926702..55515d598c1 100644 --- a/VirtualFileSystem/Ext2FileSystem.cpp +++ b/VirtualFileSystem/Ext2FileSystem.cpp @@ -7,7 +7,7 @@ #include #include #include -#include "sys-errno.h" +#include //#define EXT2_DEBUG diff --git 
a/VirtualFileSystem/FileDescriptor.cpp b/VirtualFileSystem/FileDescriptor.cpp index 93e66f91f3d..fe09f84427a 100644 --- a/VirtualFileSystem/FileDescriptor.cpp +++ b/VirtualFileSystem/FileDescriptor.cpp @@ -1,7 +1,7 @@ #include "FileDescriptor.h" #include "FileSystem.h" #include "CharacterDevice.h" -#include "sys-errno.h" +#include #include "UnixTypes.h" #include diff --git a/VirtualFileSystem/FullDevice.cpp b/VirtualFileSystem/FullDevice.cpp index 0a74e21c02a..f435876d91f 100644 --- a/VirtualFileSystem/FullDevice.cpp +++ b/VirtualFileSystem/FullDevice.cpp @@ -1,6 +1,6 @@ #include "FullDevice.h" #include "Limits.h" -#include "sys-errno.h" +#include #include #include diff --git a/VirtualFileSystem/VirtualFileSystem.cpp b/VirtualFileSystem/VirtualFileSystem.cpp index 8c6e3e330e4..da482bd9fa9 100644 --- a/VirtualFileSystem/VirtualFileSystem.cpp +++ b/VirtualFileSystem/VirtualFileSystem.cpp @@ -6,7 +6,7 @@ #include #include #include "CharacterDevice.h" -#include "sys-errno.h" +#include //#define VFS_DEBUG