ladybird/Kernel/Syscalls/mmap.cpp
Andreas Kling 32aa37d5dc Kernel+LibC: Add msync() system call
This allows userspace to trigger a full (FIXME) flush of a shared file
mapping to disk. We iterate over all the mapped pages in the VMObject
and write them out to the underlying inode, one by one. This is rather
naive, and there's lots of room for improvement.

Note that shared file mappings are currently not possible since mmap()
returns ENOTSUP for PROT_WRITE+MAP_SHARED. That restriction will be
removed in a subsequent commit. :^)
2021-11-17 19:34:15 +01:00
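
For context, a minimal userspace sketch of the flow this enables, assuming the
POSIX-style mmap()/msync()/munmap() declarations in Serenity's LibC
<sys/mman.h> (MS_SYNC is assumed here; the kernel currently ignores the flags
argument) and a PROT_WRITE+MAP_SHARED mapping, which only works once the
follow-up commit lands:

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int fd = open("/tmp/data.bin", O_RDWR); // hypothetical file
    char* p = (char*)mmap(nullptr, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    p[0] = 'x';              // dirty a page of the shared file mapping
    msync(p, 4096, MS_SYNC); // sys$msync() writes the VMObject's pages back to the inode
    munmap(p, 4096);
    close(fd);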


/*
 * Copyright (c) 2018-2021, Andreas Kling <kling@serenityos.org>
 * Copyright (c) 2021, Leon Albrecht <leon2002.la@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <Kernel/Arch/SmapDisabler.h>
#include <Kernel/Arch/x86/MSR.h>
#include <Kernel/FileSystem/OpenFileDescription.h>
#include <Kernel/Memory/AnonymousVMObject.h>
#include <Kernel/Memory/MemoryManager.h>
#include <Kernel/Memory/PageDirectory.h>
#include <Kernel/Memory/PrivateInodeVMObject.h>
#include <Kernel/Memory/Region.h>
#include <Kernel/Memory/SharedInodeVMObject.h>
#include <Kernel/PerformanceEventBuffer.h>
#include <Kernel/PerformanceManager.h>
#include <Kernel/Process.h>
#include <LibC/limits.h>
#include <LibELF/Validation.h>

namespace Kernel {

static bool should_make_executable_exception_for_dynamic_loader(bool make_readable, bool make_writable, bool make_executable, Memory::Region const& region)
{
// Normally we don't allow W -> X transitions, but we have to make an exception
// for the dynamic loader, which needs to do this after performing text relocations.
// FIXME: Investigate whether we could get rid of all text relocations entirely.
// The exception is only made if all of the following criteria are fulfilled:
// The region must be RW
if (!(region.is_readable() && region.is_writable() && !region.is_executable()))
return false;
// The region wants to become RX
if (!(make_readable && !make_writable && make_executable))
return false;
// The region is backed by a file
if (!region.vmobject().is_inode())
return false;
// The file mapping is private, not shared (no relocations in a shared mapping!)
if (!region.vmobject().is_private_inode())
return false;
auto& inode_vm = static_cast<Memory::InodeVMObject const&>(region.vmobject());
auto& inode = inode_vm.inode();
ElfW(Ehdr) header;
auto buffer = UserOrKernelBuffer::for_kernel_buffer((u8*)&header);
auto result = inode.read_bytes(0, sizeof(header), buffer, nullptr);
if (result.is_error() || result.value() != sizeof(header))
return false;
// The file is a valid ELF binary
if (!ELF::validate_elf_header(header, inode.size()))
return false;
// The file is an ELF shared object
if (header.e_type != ET_DYN)
return false;
// FIXME: Are there any additional checks/validations we could do here?
return true;
}
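
// For illustration, a hypothetical (and simplified) dynamic loader sequence that
// relies on the exception above; apply_text_relocations() is a made-up helper:
//
//   void* text = mmap(nullptr, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
//   apply_text_relocations(text);               // write fixups into the mapped .text
//   mprotect(text, len, PROT_READ | PROT_EXEC); // W -> X, permitted only because this
//                                               // is a private mapping of a valid
//                                               // ET_DYN ELF file
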
static bool validate_mmap_prot(int prot, bool map_stack, bool map_anonymous, Memory::Region const* region = nullptr)
{
bool make_readable = prot & PROT_READ;
bool make_writable = prot & PROT_WRITE;
bool make_executable = prot & PROT_EXEC;
if (map_anonymous && make_executable)
return false;
if (make_writable && make_executable)
return false;
if (map_stack && make_executable)
return false;
if (region) {
if (make_writable && region->has_been_executable())
return false;
if (make_executable && region->has_been_writable()) {
if (should_make_executable_exception_for_dynamic_loader(make_readable, make_writable, make_executable, *region))
return true;
return false;
}
}
return true;
}
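
// Example outcomes of the policy above (illustrative only):
//   PROT_READ|PROT_WRITE|PROT_EXEC        -> rejected (writable and executable)
//   PROT_EXEC with MAP_ANONYMOUS          -> rejected (anonymous executable memory)
//   PROT_EXEC on a once-writable region   -> rejected, unless the dynamic loader
//                                            exception above applies
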
static bool validate_inode_mmap_prot(const Process& process, int prot, const Inode& inode, bool map_shared)
{
auto metadata = inode.metadata();
if ((prot & PROT_READ) && !metadata.may_read(process))
return false;
if (map_shared) {
// FIXME: What about readonly filesystem mounts? We cannot make a
// decision here without knowing the mount flags, so we would need to
// keep a Custody or something from mmap time.
if ((prot & PROT_WRITE) && !metadata.may_write(process))
return false;
if (auto shared_vmobject = inode.shared_vmobject()) {
if ((prot & PROT_EXEC) && shared_vmobject->writable_mappings())
return false;
if ((prot & PROT_WRITE) && shared_vmobject->executable_mappings())
return false;
}
}
return true;
}

ErrorOr<FlatPtr> Process::sys$mmap(Userspace<const Syscall::SC_mmap_params*> user_params)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
auto params = TRY(copy_typed_from_user(user_params));
FlatPtr addr = params.addr;
auto size = params.size;
auto alignment = params.alignment;
auto prot = params.prot;
auto flags = params.flags;
auto fd = params.fd;
auto offset = params.offset;
if (prot & PROT_EXEC) {
REQUIRE_PROMISE(prot_exec);
}
if (flags & MAP_FIXED) {
REQUIRE_PROMISE(map_fixed);
}
if (alignment & ~PAGE_MASK)
return EINVAL;
if (Memory::page_round_up_would_wrap(size))
return EINVAL;
if (!Memory::is_user_range(VirtualAddress(addr), Memory::page_round_up(size)))
return EFAULT;
OwnPtr<KString> name;
if (params.name.characters) {
if (params.name.length > PATH_MAX)
return ENAMETOOLONG;
name = TRY(try_copy_kstring_from_user(params.name));
}
if (size == 0)
return EINVAL;
if ((FlatPtr)addr & ~PAGE_MASK)
return EINVAL;
bool map_shared = flags & MAP_SHARED;
bool map_anonymous = flags & MAP_ANONYMOUS;
bool map_private = flags & MAP_PRIVATE;
bool map_stack = flags & MAP_STACK;
bool map_fixed = flags & MAP_FIXED;
bool map_noreserve = flags & MAP_NORESERVE;
bool map_randomized = flags & MAP_RANDOMIZED;
if (map_shared && map_private)
return EINVAL;
if (!map_shared && !map_private)
return EINVAL;
if (map_fixed && map_randomized)
return EINVAL;
if (!validate_mmap_prot(prot, map_stack, map_anonymous))
return EINVAL;
if (map_stack && (!map_private || !map_anonymous))
return EINVAL;
Memory::Region* region = nullptr;
auto range = TRY([&]() -> ErrorOr<Memory::VirtualRange> {
if (map_randomized) {
return address_space().page_directory().range_allocator().try_allocate_randomized(Memory::page_round_up(size), alignment);
}
auto range = address_space().try_allocate_range(VirtualAddress(addr), size, alignment);
if (range.is_error()) {
if (addr && !map_fixed) {
// If there's an address but MAP_FIXED wasn't specified, the address is just a hint.
range = address_space().try_allocate_range({}, size, alignment);
}
}
return range;
}());
if (map_anonymous) {
auto strategy = map_noreserve ? AllocationStrategy::None : AllocationStrategy::Reserve;
RefPtr<Memory::AnonymousVMObject> vmobject;
if (flags & MAP_PURGEABLE) {
vmobject = TRY(Memory::AnonymousVMObject::try_create_purgeable_with_size(Memory::page_round_up(size), strategy));
} else {
vmobject = TRY(Memory::AnonymousVMObject::try_create_with_size(Memory::page_round_up(size), strategy));
}
region = TRY(address_space().allocate_region_with_vmobject(range, vmobject.release_nonnull(), 0, {}, prot, map_shared));
} else {
if (offset < 0)
return EINVAL;
if (static_cast<size_t>(offset) & ~PAGE_MASK)
return EINVAL;
auto description = TRY(fds().open_file_description(fd));
if (description->is_directory())
return ENODEV;
// Require read access even when read protection is not requested.
if (!description->is_readable())
return EACCES;
if (map_shared) {
if ((prot & PROT_WRITE) && !description->is_writable())
return EACCES;
}
if (description->inode()) {
if (!validate_inode_mmap_prot(*this, prot, *description->inode(), map_shared))
return EACCES;
}
region = TRY(description->mmap(*this, range, static_cast<u64>(offset), prot, map_shared));
}
if (!region)
return ENOMEM;
region->set_mmap(true);
if (map_shared)
region->set_shared(true);
if (map_stack)
region->set_stack(true);
region->set_name(move(name));
PerformanceManager::add_mmap_perf_event(*this, *region);
return region->vaddr().get();
}

static ErrorOr<Memory::VirtualRange> expand_range_to_page_boundaries(FlatPtr address, size_t size)
{
if (Memory::page_round_up_would_wrap(size))
return EINVAL;
if ((address + size) < address)
return EINVAL;
if (Memory::page_round_up_would_wrap(address + size))
return EINVAL;
auto base = VirtualAddress { address }.page_base();
auto end = Memory::page_round_up(address + size);
return Memory::VirtualRange { base, end - base.get() };
}
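
// Worked example (4 KiB pages): address 0x10234 with size 0x100 expands to the
// range [0x10000, 0x11000): the base is the page containing 0x10234, and the end
// is page_round_up(0x10234 + 0x100) == 0x11000. Expansion never drops bytes from
// the requested range.
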
ErrorOr<FlatPtr> Process::sys$mprotect(Userspace<void*> addr, size_t size, int prot)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
if (prot & PROT_EXEC) {
REQUIRE_PROMISE(prot_exec);
}
auto range_to_mprotect = TRY(expand_range_to_page_boundaries(addr.ptr(), size));
if (!range_to_mprotect.size())
return EINVAL;
if (!is_user_range(range_to_mprotect))
return EFAULT;
if (auto* whole_region = address_space().find_region_from_range(range_to_mprotect)) {
if (!whole_region->is_mmap())
return EPERM;
if (!validate_mmap_prot(prot, whole_region->is_stack(), whole_region->vmobject().is_anonymous(), whole_region))
return EINVAL;
if (whole_region->access() == Memory::prot_to_region_access_flags(prot))
return 0;
if (whole_region->vmobject().is_inode()
&& !validate_inode_mmap_prot(*this, prot, static_cast<Memory::InodeVMObject const&>(whole_region->vmobject()).inode(), whole_region->is_shared())) {
return EACCES;
}
whole_region->set_readable(prot & PROT_READ);
whole_region->set_writable(prot & PROT_WRITE);
whole_region->set_executable(prot & PROT_EXEC);
whole_region->remap();
return 0;
}
// Check if we can carve out the desired range from an existing region
if (auto* old_region = address_space().find_region_containing(range_to_mprotect)) {
if (!old_region->is_mmap())
return EPERM;
if (!validate_mmap_prot(prot, old_region->is_stack(), old_region->vmobject().is_anonymous(), old_region))
return EINVAL;
if (old_region->access() == Memory::prot_to_region_access_flags(prot))
return 0;
if (old_region->vmobject().is_inode()
&& !validate_inode_mmap_prot(*this, prot, static_cast<Memory::InodeVMObject const&>(old_region->vmobject()).inode(), old_region->is_shared())) {
return EACCES;
}
// Remove the old region from our regions tree, since we're going to add another region
// with the exact same start address, but do not deallocate it yet.
auto region = address_space().take_region(*old_region);
// Unmap the old region here, specifying that we *don't* want the VM deallocated.
region->unmap(Memory::Region::ShouldDeallocateVirtualRange::No);
// This vector is the region(s) adjacent to our range.
// We need to allocate a new region for the range we wanted to change permission bits on.
auto adjacent_regions = TRY(address_space().try_split_region_around_range(*region, range_to_mprotect));
size_t new_range_offset_in_vmobject = region->offset_in_vmobject() + (range_to_mprotect.base().get() - region->range().base().get());
auto new_region = TRY(address_space().try_allocate_split_region(*region, range_to_mprotect, new_range_offset_in_vmobject));
new_region->set_readable(prot & PROT_READ);
new_region->set_writable(prot & PROT_WRITE);
new_region->set_executable(prot & PROT_EXEC);
// Map the new regions using our page directory (they were just allocated and don't have one).
for (auto* adjacent_region : adjacent_regions) {
TRY(adjacent_region->map(address_space().page_directory()));
}
TRY(new_region->map(address_space().page_directory()));
return 0;
}
if (const auto& regions = address_space().find_regions_intersecting(range_to_mprotect); regions.size()) {
size_t full_size_found = 0;
// First pass: validate all intersecting regions before changing anything.
for (const auto* region : regions) {
if (!region->is_mmap())
return EPERM;
if (!validate_mmap_prot(prot, region->is_stack(), region->vmobject().is_anonymous(), region))
return EINVAL;
if (region->access() == Memory::prot_to_region_access_flags(prot))
return 0;
if (region->vmobject().is_inode()
&& !validate_inode_mmap_prot(*this, prot, static_cast<Memory::InodeVMObject const&>(region->vmobject()).inode(), region->is_shared())) {
return EACCES;
}
full_size_found += region->range().intersect(range_to_mprotect).size();
}
if (full_size_found != range_to_mprotect.size())
return ENOMEM;
// Second pass: apply the new protections.
for (auto* old_region : regions) {
const auto intersection_to_mprotect = range_to_mprotect.intersect(old_region->range());
// The intersection covers the whole region, so update it in place.
if (intersection_to_mprotect == old_region->range()) {
old_region->set_readable(prot & PROT_READ);
old_region->set_writable(prot & PROT_WRITE);
old_region->set_executable(prot & PROT_EXEC);
old_region->remap();
continue;
}
// Remove the old region from our regions tree, since we're going to add another region
// with the exact same start address, but don't deallocate it yet.
auto region = address_space().take_region(*old_region);
// Unmap the old region here, specifying that we *don't* want the VM deallocated.
region->unmap(Memory::Region::ShouldDeallocateVirtualRange::No);
// This vector is the region(s) adjacent to our range.
// We need to allocate a new region for the range we wanted to change permission bits on.
auto adjacent_regions = TRY(address_space().try_split_region_around_range(*old_region, intersection_to_mprotect));
// There should be exactly one adjacent region, since the intersection is a strict sub-range touching one end of this region.
VERIFY(adjacent_regions.size() == 1);
size_t new_range_offset_in_vmobject = old_region->offset_in_vmobject() + (intersection_to_mprotect.base().get() - old_region->range().base().get());
auto* new_region = TRY(address_space().try_allocate_split_region(*region, intersection_to_mprotect, new_range_offset_in_vmobject));
new_region->set_readable(prot & PROT_READ);
new_region->set_writable(prot & PROT_WRITE);
new_region->set_executable(prot & PROT_EXEC);
// Map the new region, and the adjacent region if there is one, using our page directory (they were just allocated and don't have one).
if (adjacent_regions.size())
TRY(adjacent_regions[0]->map(address_space().page_directory()));
TRY(new_region->map(address_space().page_directory()));
}
return 0;
}
return EINVAL;
}
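
// Illustrative example of the carve-out path above (hypothetical addresses):
// given a single RW region spanning [0x20000, 0x25000), the call
//
//   mprotect((void*)0x21000, 0x1000, PROT_READ);
//
// leaves [0x20000, 0x21000) and [0x22000, 0x25000) as RW adjacent regions and
// creates a new read-only region for [0x21000, 0x22000), after which all three
// are mapped back into the page directory.
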
ErrorOr<FlatPtr> Process::sys$madvise(Userspace<void*> address, size_t size, int advice)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
auto range_to_madvise = TRY(expand_range_to_page_boundaries(address.ptr(), size));
if (!range_to_madvise.size())
return EINVAL;
if (!is_user_range(range_to_madvise))
return EFAULT;
auto* region = address_space().find_region_from_range(range_to_madvise);
if (!region)
return EINVAL;
if (!region->is_mmap())
return EPERM;
bool set_volatile = advice & MADV_SET_VOLATILE;
bool set_nonvolatile = advice & MADV_SET_NONVOLATILE;
if (set_volatile && set_nonvolatile)
return EINVAL;
if (set_volatile || set_nonvolatile) {
if (!region->vmobject().is_anonymous())
return EINVAL;
auto& vmobject = static_cast<Memory::AnonymousVMObject&>(region->vmobject());
if (!vmobject.is_purgeable())
return EINVAL;
bool was_purged = false;
TRY(vmobject.set_volatile(set_volatile, was_purged));
return was_purged ? 1 : 0;
}
return EINVAL;
}
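
// Illustrative userspace flow for purgeable memory, using the MAP_PURGEABLE and
// MADV_SET_VOLATILE/MADV_SET_NONVOLATILE values handled above:
//
//   void* cache = mmap(nullptr, len, PROT_READ | PROT_WRITE,
//                      MAP_ANONYMOUS | MAP_PRIVATE | MAP_PURGEABLE, -1, 0);
//   madvise(cache, len, MADV_SET_VOLATILE);             // pages may now be purged
//   int rc = madvise(cache, len, MADV_SET_NONVOLATILE); // returns 1 if purged
//   if (rc == 1) { /* contents were lost; regenerate the cache */ }
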
ErrorOr<FlatPtr> Process::sys$set_mmap_name(Userspace<const Syscall::SC_set_mmap_name_params*> user_params)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
auto params = TRY(copy_typed_from_user(user_params));
if (params.name.length > PATH_MAX)
return ENAMETOOLONG;
auto name = TRY(try_copy_kstring_from_user(params.name));
auto range = TRY(expand_range_to_page_boundaries((FlatPtr)params.addr, params.size));
auto* region = address_space().find_region_from_range(range);
if (!region)
return EINVAL;
if (!region->is_mmap())
return EPERM;
region->set_name(move(name));
PerformanceManager::add_mmap_perf_event(*this, *region);
return 0;
}

ErrorOr<FlatPtr> Process::sys$munmap(Userspace<void*> addr, size_t size)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
TRY(address_space().unmap_mmap_range(addr.vaddr(), size));
return 0;
}

ErrorOr<FlatPtr> Process::sys$mremap(Userspace<const Syscall::SC_mremap_params*> user_params)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
auto params = TRY(copy_typed_from_user(user_params));
auto old_range = TRY(expand_range_to_page_boundaries((FlatPtr)params.old_address, params.old_size));
auto* old_region = address_space().find_region_from_range(old_range);
if (!old_region)
return EINVAL;
if (!old_region->is_mmap())
return EPERM;
if (old_region->vmobject().is_shared_inode() && (params.flags & MAP_PRIVATE) && !(params.flags & (MAP_ANONYMOUS | MAP_NORESERVE))) {
auto range = old_region->range();
auto old_prot = region_access_flags_to_prot(old_region->access());
auto old_offset = old_region->offset_in_vmobject();
NonnullRefPtr inode = static_cast<Memory::SharedInodeVMObject&>(old_region->vmobject()).inode();
auto new_vmobject = TRY(Memory::PrivateInodeVMObject::try_create_with_inode(inode));
auto old_name = old_region->take_name();
// Unmap without deallocating the VM range since we're going to reuse it.
old_region->unmap(Memory::Region::ShouldDeallocateVirtualRange::No);
address_space().deallocate_region(*old_region);
auto new_region = TRY(address_space().allocate_region_with_vmobject(range, move(new_vmobject), old_offset, old_name->view(), old_prot, false));
new_region->set_mmap(true);
return new_region->vaddr().get();
}
dbgln("sys$mremap: Unimplemented remap request (flags={})", params.flags);
return ENOTIMPL;
}
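
// The one supported remap converts a shared file mapping into a private
// (copy-on-write) mapping in place. A sketch of the corresponding userspace
// call, assuming LibC's mremap() wrapper packs these SC_mremap_params:
//
//   void* p = mremap(old_address, old_size, old_size, MAP_PRIVATE);
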
ErrorOr<FlatPtr> Process::sys$allocate_tls(Userspace<const char*> initial_data, size_t size)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
if (!size || size % PAGE_SIZE != 0)
return EINVAL;
if (!m_master_tls_region.is_null())
return EEXIST;
if (thread_count() != 1)
return EFAULT;
Thread* main_thread = nullptr;
bool multiple_threads = false;
for_each_thread([&main_thread, &multiple_threads](auto& thread) {
if (main_thread) {
multiple_threads = true;
return IterationDecision::Break;
}
main_thread = &thread;
return IterationDecision::Continue;
});
VERIFY(main_thread);
if (multiple_threads)
return EINVAL;
auto range = TRY(address_space().try_allocate_range({}, size));
auto region = TRY(address_space().allocate_region(range, String("Master TLS"), PROT_READ | PROT_WRITE));
m_master_tls_region = region->make_weak_ptr();
m_master_tls_size = size;
m_master_tls_alignment = PAGE_SIZE;
{
Kernel::SmapDisabler disabler;
void* fault_at;
if (!Kernel::safe_memcpy((char*)m_master_tls_region.unsafe_ptr()->vaddr().as_ptr(), (char*)initial_data.ptr(), size, fault_at))
return EFAULT;
}
TRY(main_thread->make_thread_specific_region({}));
#if ARCH(I386)
auto& tls_descriptor = Processor::current().get_gdt_entry(GDT_SELECTOR_TLS);
tls_descriptor.set_base(main_thread->thread_specific_data());
tls_descriptor.set_limit(main_thread->thread_specific_region_size());
#else
MSR fs_base_msr(MSR_FS_BASE);
fs_base_msr.set(main_thread->thread_specific_data().get());
#endif
return m_master_tls_region.unsafe_ptr()->vaddr().get();
}

ErrorOr<FlatPtr> Process::sys$msyscall(Userspace<void*> address)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
if (address_space().enforces_syscall_regions())
return EPERM;
if (!address) {
address_space().set_enforces_syscall_regions(true);
return 0;
}
if (!Memory::is_user_address(address.vaddr()))
return EFAULT;
auto* region = address_space().find_region_containing(Memory::VirtualRange { address.vaddr(), 1 });
if (!region)
return EINVAL;
if (!region->is_mmap())
return EINVAL;
region->set_syscall_region(true);
return 0;
}
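
// Illustrative startup sequence (e.g. in the dynamic loader): designate each
// region that may legitimately issue syscalls, then lock the set:
//
//   msyscall(text_region_base); // mark an mmap()ed region as syscall-capable
//   msyscall(nullptr);          // enable enforcement; any further msyscall()
//                               // call fails with EPERM
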
ErrorOr<FlatPtr> Process::sys$msync(Userspace<void*> address, size_t size, [[maybe_unused]] int flags)
{
VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this)
REQUIRE_PROMISE(stdio);
// FIXME: We probably want to sync all mappings in the address+size range.
auto* region = address_space().find_region_from_range(Memory::VirtualRange { address.vaddr(), size });
if (!region)
return EINVAL;
auto& vmobject = region->vmobject();
if (!vmobject.is_shared_inode())
return 0;
auto& inode_vmobject = static_cast<Memory::SharedInodeVMObject&>(vmobject);
TRY(inode_vmobject.sync());
return 0;
}

}