From 2b40ea75a072ca2a91b1c52906bdd6dba6378eed Mon Sep 17 00:00:00 2001 From: Dmitrii Kuvaiskii Date: Thu, 14 Mar 2024 07:45:10 -0700 Subject: [PATCH] [LibOS,PAL] Emulate file-backed mmap via PAL read/write APIs Previously, the `chroot` FS (plain host-backed files) used the `PalStreamMap()` PAL API for file-backed mmap. This had three problems: 1) need to implement a non-trivial `map` callback in PALs; 2) discrepancy between `map` implementations in different PALs; 3) hard to debug file-mmap bugs because they only reproduced on SGX (`gramine-sgx`) PAL and not on Linux (`gramine-direct`) PAL. Note that other FSes already used emulated file-backed mmap: `tmpfs` and `encrypted` FSes emulate such mmaps via `PalStreamRead()` and `PalStreamWrite()`. This commit switches `chroot` FS to use emulated file-backed mmap. This way, `chroot` becomes similar in implementation to `tmpfs` and `encrypted` FSes. Only `shm` FS still uses `PalStreamMap()` because devices with shared memory have non-standard semantics of mmaps. Corresponding `file_map()` functions in PAL are removed. In this commit, we also introduce the model of "logical" split within a single VMA: some prefix of the VMA is accessible (has valid pages), while the rest is unmapped (returns SIGBUS). Only file-backed VMAs are split in this way (anonymous-memory VMAs can't be in "unmapped" state). This logical split is achieved via a new `vma->valid_length` field. The switch to emulated mmap uncovered several bugs: - Underlying file may be shorter than the requested mmap size. In this case access beyond the last file-backed page must cause SIGBUS. Previously this semantics worked only on `gramine-direct` and wasn't implemented on `gramine-sgx` (even with EDMM). - As a consequence of the semantics above, file-growing `write()` and `ftruncate()` on already-mmapped file must make newly extended file contents accessible. 
Previously it didn't work on `gramine-sgx` (with EDMM), now it is resolved via `prot_refresh_mmaped_from_file_handle()` call. - `msync()` must update file contents with the mmapped-in-process contents, but only those parts that do not exceed the file size. Previously there was a bug that msync'ed even the exceeding parts. - Applications expect `msync(MS_ASYNC)` to update file contents before the next app access to the file. Gramine instead ignored such requests, leading to accessing stale contents. We fix this bug by treating `MS_ASYNC` the same way as `MS_SYNC`. This bug was detected on LTP test `msync01`. A few more FS tests are enabled on SGX now. Generally, `gramine-sgx` now supports shared file-backed mappings, i.e. `mmap(MAP_SHARED)`. New LibOS test `mmap_file_sigbus` is added; old bad `mmap_file` test is removed. Signed-off-by: Dmitrii Kuvaiskii --- Documentation/pal/host-abi.rst | 6 +- libos/include/libos_fs.h | 27 +- libos/include/libos_handle.h | 2 - libos/include/libos_vma.h | 7 + libos/src/bookkeep/libos_handle.c | 21 -- libos/src/bookkeep/libos_signal.c | 8 +- libos/src/bookkeep/libos_vma.c | 331 +++++++++++++++++------ libos/src/fs/chroot/encrypted.c | 31 ++- libos/src/fs/chroot/fs.c | 33 +-- libos/src/fs/libos_fs_util.c | 54 +++- libos/src/fs/shm/fs.c | 5 +- libos/src/fs/tmpfs/fs.c | 33 ++- libos/src/libos_rtld.c | 16 +- libos/src/sys/libos_mmap.c | 19 +- libos/test/fs/test_enc.py | 39 --- libos/test/fs/test_fs.py | 12 - libos/test/fs/test_tmpfs.py | 14 +- libos/test/ltp/manifest.template | 5 +- libos/test/regression/manifest.template | 3 +- libos/test/regression/meson.build | 2 +- libos/test/regression/mmap_file.c | 101 ------- libos/test/regression/mmap_file_sigbus.c | 215 +++++++++++++++ libos/test/regression/test_libos.py | 48 ++-- libos/test/regression/tests.toml | 2 +- libos/test/regression/tests_musl.toml | 2 +- pal/include/pal/pal.h | 11 +- pal/include/pal_internal.h | 8 +- pal/regression/File.c | 31 --- pal/regression/Symbols.c | 2 +- 
pal/regression/test_pal.py | 10 +- pal/src/host/linux-sgx/pal_files.c | 141 ---------- pal/src/host/linux/pal_files.c | 13 - pal/src/host/skeleton/pal_files.c | 6 - pal/src/pal_streams.c | 6 +- pal/src/pal_symbols | 2 +- 35 files changed, 682 insertions(+), 584 deletions(-) delete mode 100644 libos/test/regression/mmap_file.c create mode 100644 libos/test/regression/mmap_file_sigbus.c diff --git a/Documentation/pal/host-abi.rst b/Documentation/pal/host-abi.rst index d5fb6e7edb..7f3330b66d 100644 --- a/Documentation/pal/host-abi.rst +++ b/Documentation/pal/host-abi.rst @@ -175,9 +175,6 @@ applications. .. doxygenfunction:: PalStreamDelete :project: pal -.. doxygenfunction:: PalStreamMap - :project: pal - .. doxygenfunction:: PalStreamSetLength :project: pal @@ -366,3 +363,6 @@ random bits, to obtain an attestation report and quote, etc. .. doxygenfunction:: PalGetSpecialKey :project: pal + +.. doxygenfunction:: PalDeviceMap + :project: pal diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index 4682a22bdc..917f8052be 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -108,21 +108,28 @@ struct libos_fs_ops { /* * \brief Map file at an address. * - * \param hdl File handle. - * \param addr Address of the memory region. Cannot be NULL. - * \param size Size of the memory region. - * \param prot Permissions for the memory region (`PROT_*`). - * \param flags `mmap` flags (`MAP_*`). - * \param offset Offset in file. - * - * Maps the file at given address. This might involve mapping directly (`PalStreamMap`), or + * \param hdl File handle. + * \param addr Address of the memory region. Cannot be NULL. + * \param size Size of the memory region. + * \param prot Permissions for the memory region (`PROT_*`). + * \param flags `mmap` flags (`MAP_*`). + * \param offset Offset in file. + * \param[out] out_valid_size Valid size (i.e. backed by file). + * + * Maps the file at given address. 
This might involve mapping directly (`PalDeviceMap`), or * mapping anonymous memory (`PalVirtualMemoryAlloc`) and writing data. * + * The contents of the mapping are initialized using `size` bytes starting at `offset` offset in + * the file. For a file size that is not a multiple of the page size, the remaining bytes on the + * last page are zeroed. Pages that are not backed by file contents are inaccessible + * (effectively they have PROT_NONE permissions). This function returns the valid size (i.e. the + * pages backed by file contents) in `out_valid_size`. + * * `addr`, `offset` and `size` must be alloc-aligned (see `IS_ALLOC_ALIGNED*` macros in * `libos_internal.h`). */ int (*mmap)(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, - uint64_t offset); + uint64_t offset, size_t* out_valid_size); /* * \brief Write back mapped memory to file. @@ -968,7 +975,7 @@ file_off_t generic_inode_seek(struct libos_handle* hdl, file_off_t offset, int o int generic_inode_poll(struct libos_handle* hdl, int in_events, int* out_events); int generic_emulated_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, - uint64_t offset); + uint64_t offset, size_t* valid_size); int generic_emulated_msync(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, uint64_t offset); int generic_truncate(struct libos_handle* hdl, file_off_t size); diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h index 4ce281f4e0..d0920cff06 100644 --- a/libos/include/libos_handle.h +++ b/libos/include/libos_handle.h @@ -303,8 +303,6 @@ int init_exec_handle(const char* const* argv, char*** out_new_argv); int open_executable(struct libos_handle* hdl, const char* path); -int get_file_size(struct libos_handle* file, uint64_t* size); - ssize_t do_handle_read(struct libos_handle* hdl, void* buf, size_t count); ssize_t do_handle_write(struct libos_handle* hdl, const void* buf, size_t count); diff --git a/libos/include/libos_vma.h 
b/libos/include/libos_vma.h index 37130174a1..a12463bdd1 100644 --- a/libos/include/libos_vma.h +++ b/libos/include/libos_vma.h @@ -26,6 +26,7 @@ struct libos_vma_info { void* addr; size_t length; + size_t valid_length; // memory accesses beyond valid_length result in SIGBUS/EFAULT int prot; // memory protection flags: PROT_* int flags; // MAP_* and VMA_* struct libos_handle* file; @@ -99,6 +100,9 @@ int bkeep_mmap_any(size_t length, int prot, int flags, struct libos_handle* file int bkeep_mmap_any_aslr(size_t length, int prot, int flags, struct libos_handle* file, uint64_t offset, const char* comment, void** ret_val_ptr); +/* Looks up VMA that starts at `begin_addr` and if found, updates `vma->valid_length`. */ +int bkeep_vma_update_valid_length(void* begin_addr, size_t valid_length); + /* Looking up VMA that contains `addr`. If one is found, returns its description in `vma_info`. * This function increases ref-count of `vma_info->file` by one (if it is not NULL). */ int lookup_vma(void* addr, struct libos_vma_info* vma_info); @@ -133,6 +137,9 @@ int msync_handle(struct libos_handle* hdl); /* Reload file mappings of `hdl` */ int reload_mmaped_from_file_handle(struct libos_handle* hdl); +/* Refresh page protections of file mappings of `hdl` when the file size has changed */ +int prot_refresh_mmaped_from_file_handle(struct libos_handle* hdl, size_t file_size); + void debug_print_all_vmas(void); /* Returns the peak amount of memory usage */ diff --git a/libos/src/bookkeep/libos_handle.c b/libos/src/bookkeep/libos_handle.c index daa7ee8b2d..246eedb076 100644 --- a/libos/src/bookkeep/libos_handle.c +++ b/libos/src/bookkeep/libos_handle.c @@ -548,27 +548,6 @@ void put_handle(struct libos_handle* hdl) { } } -int get_file_size(struct libos_handle* hdl, uint64_t* size) { - if (!hdl->fs || !hdl->fs->fs_ops) - return -EINVAL; - - if (hdl->fs->fs_ops->hstat) { - struct stat stat; - int ret = hdl->fs->fs_ops->hstat(hdl, &stat); - if (ret < 0) { - return ret; - } - if 
(stat.st_size < 0) { - return -EINVAL; - } - *size = (uint64_t)stat.st_size; - return 0; - } - - *size = 0; - return 0; -} - static struct libos_handle_map* get_new_handle_map(uint32_t size) { struct libos_handle_map* handle_map = calloc(1, sizeof(struct libos_handle_map)); diff --git a/libos/src/bookkeep/libos_signal.c b/libos/src/bookkeep/libos_signal.c index cfe4bf5741..52569fd164 100644 --- a/libos/src/bookkeep/libos_signal.c +++ b/libos/src/bookkeep/libos_signal.c @@ -361,12 +361,8 @@ static void memfault_upcall(bool is_in_pal, uintptr_t addr, PAL_CONTEXT* context struct libos_handle* file = vma_info.file; if (file && file->type == TYPE_CHROOT) { /* If the mapping exceeds end of a file then return a SIGBUS. */ - lock(&file->inode->lock); - file_off_t size = file->inode->size; - unlock(&file->inode->lock); - - uintptr_t eof_in_vma = (uintptr_t)vma_info.addr + (size - vma_info.file_offset); - if (addr > eof_in_vma) { + uintptr_t eof_in_vma = (uintptr_t)vma_info.addr + vma_info.valid_length; + if (addr >= eof_in_vma) { info.si_signo = SIGBUS; info.si_code = BUS_ADRERR; } else { diff --git a/libos/src/bookkeep/libos_vma.c b/libos/src/bookkeep/libos_vma.c index c07a627c5f..4e3fb4129c 100644 --- a/libos/src/bookkeep/libos_vma.c +++ b/libos/src/bookkeep/libos_vma.c @@ -43,6 +43,7 @@ static int filter_saved_flags(int flags) { struct libos_vma { uintptr_t begin; uintptr_t end; + uintptr_t valid_end; // memory accesses beyond valid_end result in SIGBUS/EFAULT int prot; int flags; struct libos_handle* file; @@ -64,11 +65,13 @@ static void copy_comment(struct libos_vma* vma, const char* comment) { } static void copy_vma(struct libos_vma* old_vma, struct libos_vma* new_vma) { - new_vma->begin = old_vma->begin; - new_vma->end = old_vma->end; - new_vma->prot = old_vma->prot; - new_vma->flags = old_vma->flags; - new_vma->file = old_vma->file; + new_vma->begin = old_vma->begin; + new_vma->end = old_vma->end; + new_vma->valid_end = old_vma->valid_end; + new_vma->prot = 
old_vma->prot; + new_vma->flags = old_vma->flags; + + new_vma->file = old_vma->file; if (new_vma->file) { if (new_vma->file->inode) (void)__atomic_add_fetch(&new_vma->file->inode->num_mmapped, 1, __ATOMIC_RELAXED); @@ -170,12 +173,24 @@ typedef bool (*traverse_visitor)(struct libos_vma* vma, void* visitor_arg); * `visitor` returns whether to continue iteration. It must be as simple as possible, because * it's called with the VMA lock held. * - * Returns whether the traversed range was continuously covered by VMAs. This is useful for - * emulating errors in memory management syscalls. + * Returns whether the traversed range was continuously covered by VMAs (takes into account + * `vma->valid_end` if asked by the caller). This is useful: + * + * - For emulating errors in memory management syscalls. To avoid memory faults during deep copy + * of user-supplied buffers in syscalls (e.g., in case of SGX OCALLs), callers must set + * `use_only_valid_part = true`. This deviates slightly from Linux behavior: e.g., on + * `write(partially-valid-vma)` Linux does not return -EFAULT but instead uses the buffer until + * the first invalid address. This behavior is too cumbersome to implement in Gramine + SGX, + * thus on `write(partially-valid-vma)` Gramine immediately returns -EFAULT. + * + * - For deciding whether to return ENOMEM in madvise(MADV_DONTNEED). E.g., on + * `madvise(partially-valid-vma, MADV_DONTNEED)` Linux returns success (even though there is a + * part that is invalid). Callers must set `use_only_valid_part = false` to comply with this + * Linux behavior. */ // TODO: Probably other VMA functions could make use of this helper. 
-static bool _traverse_vmas_in_range(uintptr_t begin, uintptr_t end, traverse_visitor visitor, - void* visitor_arg) { +static bool _traverse_vmas_in_range(uintptr_t begin, uintptr_t end, bool use_only_valid_part, + traverse_visitor visitor, void* visitor_arg) { assert(spinlock_is_locked(&vma_tree_lock)); assert(begin <= end); @@ -196,11 +211,13 @@ static bool _traverse_vmas_in_range(uintptr_t begin, uintptr_t end, traverse_vis prev = vma; vma = _get_next_vma(vma); if (!vma || end <= vma->begin) { - is_continuous &= end <= prev->end; + uintptr_t prev_end = use_only_valid_part ? prev->valid_end : prev->end; + is_continuous &= end <= prev_end; break; } - is_continuous &= prev->end == vma->begin; + uintptr_t prev_end = use_only_valid_part ? prev->valid_end : prev->end; + is_continuous &= prev_end == vma->begin; } return is_continuous; @@ -213,9 +230,18 @@ static void split_vma(struct libos_vma* old_vma, struct libos_vma* new_vma, uint new_vma->begin = addr; if (new_vma->file) { new_vma->offset += new_vma->begin - old_vma->begin; + if (new_vma->valid_end < new_vma->begin) { + new_vma->valid_end = new_vma->begin; + } } old_vma->end = addr; + if (old_vma->valid_end > old_vma->end) { + old_vma->valid_end = old_vma->end; + } + + assert(old_vma->begin <= old_vma->valid_end && old_vma->valid_end <= old_vma->end); + assert(new_vma->begin <= new_vma->valid_end && new_vma->valid_end <= new_vma->end); } /* @@ -265,6 +291,9 @@ static int _vma_bkeep_remove(uintptr_t begin, uintptr_t end, bool is_internal, split_vma(vma, new_vma, end); vma->end = begin; + if (vma->valid_end > vma->end) { + vma->valid_end = vma->end; + } avl_tree_insert(&vma_tree, &new_vma->tree_node); total_memory_size_sub(end - begin); @@ -274,6 +303,9 @@ static int _vma_bkeep_remove(uintptr_t begin, uintptr_t end, bool is_internal, total_memory_size_sub(vma->end - begin); vma->end = begin; + if (vma->valid_end > vma->end) { + vma->valid_end = vma->end; + } vma = _get_next_vma(vma); if (!vma) { @@ -304,6 +336,9 @@ 
static int _vma_bkeep_remove(uintptr_t begin, uintptr_t end, bool is_internal, } total_memory_size_sub(end - vma->begin); vma->begin = end; + if (vma->valid_end < vma->begin) { + vma->valid_end = vma->begin; + } } return 0; @@ -582,8 +617,10 @@ int init_vma(void) { continue; } - init_vmas[1 + idx].begin = g_pal_public_state->initial_mem_ranges[i].start; - init_vmas[1 + idx].end = g_pal_public_state->initial_mem_ranges[i].end; + init_vmas[1 + idx].begin = g_pal_public_state->initial_mem_ranges[i].start; + init_vmas[1 + idx].end = g_pal_public_state->initial_mem_ranges[i].end; + init_vmas[1 + idx].valid_end = g_pal_public_state->initial_mem_ranges[i].end; + init_vmas[1 + idx].prot = PAL_PROT_TO_LINUX(g_pal_public_state->initial_mem_ranges[i].prot); init_vmas[1 + idx].flags = MAP_PRIVATE | MAP_ANONYMOUS | VMA_INTERNAL; init_vmas[1 + idx].file = NULL; @@ -704,8 +741,10 @@ int init_vma(void) { static void _add_unmapped_vma(uintptr_t begin, uintptr_t end, struct libos_vma* vma) { assert(spinlock_is_locked(&vma_tree_lock)); - vma->begin = begin; - vma->end = end; + vma->begin = begin; + vma->end = end; + vma->valid_end = end; + vma->prot = PROT_NONE; vma->flags = VMA_INTERNAL | VMA_UNMAPPED; vma->file = NULL; @@ -803,6 +842,11 @@ int bkeep_mmap_fixed(void* addr, size_t length, int prot, int flags, struct libo new_vma->begin = (uintptr_t)addr; new_vma->end = new_vma->begin + length; + + /* valid_end is potentially incorrect now (if there is a file-backed mapping with a part that + * exceeds the file); it should be updated in the mmap syscall (for file-backed mappings) */ + new_vma->valid_end = new_vma->begin + length; + new_vma->prot = prot; new_vma->flags = filter_saved_flags(flags) | ((file && (prot & PROT_WRITE)) ? 
VMA_TAINTED : 0); new_vma->file = file; @@ -1097,6 +1141,10 @@ int bkeep_mmap_any_in_range(void* _bottom_addr, void* _top_addr, size_t length, new_vma->end = max_addr; new_vma->begin = new_vma->end - length; + /* valid_end is potentially incorrect now (if there is a file-backed mapping with a part that + * exceeds the file); it should be updated in the mmap syscall (for file-backed mappings) */ + new_vma->valid_end = max_addr; + avl_tree_insert(&vma_tree, &new_vma->tree_node); total_memory_size_add(new_vma->end - new_vma->begin); @@ -1133,6 +1181,28 @@ int bkeep_mmap_any_aslr(size_t length, int prot, int flags, struct libos_handle* return bkeep_mmap_any(length, prot, flags, file, offset, comment, ret_val_ptr); } +int bkeep_vma_update_valid_length(void* begin_addr, size_t valid_length) { + int ret; + + spinlock_lock(&vma_tree_lock); + struct libos_vma* vma = _lookup_vma((uintptr_t)begin_addr); + if (!vma || !is_addr_in_vma((uintptr_t)begin_addr, vma)) { + ret = -ENOENT; + goto out; + } + + if (vma->begin != (uintptr_t)begin_addr || valid_length > vma->end - vma->begin) { + ret = -EINVAL; + goto out; + } + + vma->valid_end = vma->begin + valid_length; + ret = 0; +out: + spinlock_unlock(&vma_tree_lock); + return ret; +} + static int pal_mem_bkeep_alloc(size_t size, uintptr_t* out_addr) { void* addr; int ret = bkeep_mmap_any(size, PROT_READ | PROT_WRITE, @@ -1157,12 +1227,13 @@ static int pal_mem_bkeep_free(uintptr_t addr, size_t size) { } static void dump_vma(struct libos_vma_info* vma_info, struct libos_vma* vma) { - vma_info->addr = (void*)vma->begin; - vma_info->length = vma->end - vma->begin; - vma_info->prot = vma->prot; - vma_info->flags = vma->flags; - vma_info->file_offset = vma->offset; - vma_info->file = vma->file; + vma_info->addr = (void*)vma->begin; + vma_info->length = vma->end - vma->begin; + vma_info->valid_length = vma->valid_end - vma->begin; + vma_info->prot = vma->prot; + vma_info->flags = vma->flags; + vma_info->file_offset = vma->offset; + 
vma_info->file = vma->file; if (vma_info->file) { get_handle(vma_info->file); } @@ -1212,7 +1283,8 @@ bool is_in_adjacent_user_vmas(const void* addr, size_t length, int prot) { }; spinlock_lock(&vma_tree_lock); - bool is_continuous = _traverse_vmas_in_range(begin, end, adj_visitor, &ctx); + bool is_continuous = _traverse_vmas_in_range(begin, end, /*use_only_valid_part=*/true, + adj_visitor, &ctx); spinlock_unlock(&vma_tree_lock); return is_continuous && ctx.is_ok; @@ -1327,7 +1399,7 @@ static bool madvise_dontneed_visitor(struct libos_vma* vma, void* visitor_arg) { } uintptr_t zero_start = MAX(ctx->begin, vma->begin); - uintptr_t zero_end = MIN(ctx->end, vma->end); + uintptr_t zero_end = MIN(ctx->end, vma->valid_end); pal_prot_flags_t pal_prot = LINUX_PROT_TO_PAL(vma->prot, vma->flags); pal_prot_flags_t pal_prot_writable = pal_prot | PAL_PROT_WRITE; @@ -1363,7 +1435,8 @@ int madvise_dontneed_range(uintptr_t begin, uintptr_t end) { }; spinlock_lock(&vma_tree_lock); - bool is_continuous = _traverse_vmas_in_range(begin, end, madvise_dontneed_visitor, &ctx); + bool is_continuous = _traverse_vmas_in_range(begin, end, /*use_only_valid_part=*/false, + madvise_dontneed_visitor, &ctx); spinlock_unlock(&vma_tree_lock); if (!is_continuous) @@ -1371,27 +1444,9 @@ int madvise_dontneed_range(uintptr_t begin, uintptr_t end) { return ctx.error; } -static bool vma_filter_needs_msync(struct libos_vma* vma, void* arg) { - struct libos_handle* hdl = arg; - - if (vma->flags & (VMA_UNMAPPED | VMA_INTERNAL | MAP_ANONYMOUS | MAP_PRIVATE)) - return false; - - assert(vma->file); - - if (hdl && vma->file != hdl) - return false; - - if (!vma->file->fs || !vma->file->fs->fs_ops || !vma->file->fs->fs_ops->msync) - return false; - - if (!(vma->file->acc_mode & MAY_WRITE)) - return false; - - return true; -} - static bool vma_filter_needs_reload(struct libos_vma* vma, void* arg) { + assert(spinlock_is_locked(&vma_tree_lock)); + struct libos_handle* hdl = arg; assert(hdl && hdl->inode); /* 
guaranteed to have inode because invoked from `write` callback */ @@ -1420,7 +1475,7 @@ static int reload_vma(struct libos_vma_info* vma_info) { /* NOTE: Unfortunately there's a data race here: the memory can be unmapped, or remapped, by * another thread by the time we get to `read`. */ uintptr_t read_begin = (uintptr_t)vma_info->addr; - uintptr_t read_end = (uintptr_t)vma_info->addr + vma_info->length; + uintptr_t read_end = (uintptr_t)vma_info->addr + vma_info->valid_length; assert(IS_ALLOC_ALIGNED(read_begin)); assert(IS_ALLOC_ALIGNED(read_end)); @@ -1495,6 +1550,118 @@ int reload_mmaped_from_file_handle(struct libos_handle* hdl) { return ret; } +struct vma_update_valid_end_args { + struct libos_handle* hdl; + size_t file_size; +}; + +/* returns whether prot_refresh_vma() must be applied on a VMA */ +static bool vma_update_valid_end(struct libos_vma* vma, void* _args) { + assert(spinlock_is_locked(&vma_tree_lock)); + + struct vma_update_valid_end_args* args = _args; + + /* guaranteed to have inode because invoked from `write` or `truncate` callback */ + assert(args->hdl && args->hdl->inode); + + if (vma->flags & (VMA_UNMAPPED | VMA_INTERNAL | MAP_ANONYMOUS)) + return false; + + assert(vma->file); /* check above filtered out non-file-backed mappings */ + + if (!vma->file->inode || vma->file->inode != args->hdl->inode) + return false; + + size_t valid_length; + if (args->file_size >= vma->offset) { + size_t vma_length = vma->end - vma->begin; + if (args->file_size - vma->offset > vma_length) { + /* file size exceeds the mmapped part in VMA, the whole VMA is accessible */ + valid_length = vma_length; + } else { + /* file size is smaller than the mmapped part in VMA, only part of VMA is accessible */ + valid_length = args->file_size - vma->offset; + } + } else { + /* file got smaller than the offset from which VMA is mapped, all VMA is inaccessible */ + valid_length = 0; + } + valid_length = ALLOC_ALIGN_UP(valid_length); + + vma->valid_end = vma->begin + 
valid_length; + assert(vma->valid_end <= vma->end); + + return true; +} + +static int prot_refresh_vma(struct libos_vma_info* vma_info) { + int ret; + + /* NOTE: Unfortunately there's a data race here: the memory can be unmapped, or remapped, by + * another thread by the time we get to `PalVirtualMemoryProtect`. */ + if (vma_info->valid_length) { + ret = PalVirtualMemoryProtect(vma_info->addr, vma_info->valid_length, + LINUX_PROT_TO_PAL(vma_info->prot, vma_info->flags)); + if (ret < 0) + BUG(); + } + if (vma_info->length - vma_info->valid_length) { + ret = PalVirtualMemoryProtect(vma_info->addr + vma_info->valid_length, + vma_info->length - vma_info->valid_length, /*prot=*/0); + if (ret < 0) + BUG(); + } + + return 0; +} + +/* This helper function is to refresh access protections on the VMA pages of a given file handle on + * file-extend operations (`write` and `ftruncate`). */ +int prot_refresh_mmaped_from_file_handle(struct libos_handle* hdl, size_t file_size) { + struct libos_vma_info* vma_infos; + size_t count; + + struct vma_update_valid_end_args args = { .hdl = hdl, .file_size = file_size }; + + int ret = dump_vmas(&vma_infos, &count, /*begin=*/0, /*end=*/UINTPTR_MAX, + vma_update_valid_end, &args); + if (ret < 0) + return ret; + + for (size_t i = 0; i < count; i++) { + ret = prot_refresh_vma(&vma_infos[i]); + if (ret < 0) + goto out; + } + + ret = 0; +out: + free_vma_info_array(vma_infos, count); + return ret; +} + +static bool vma_filter_needs_msync(struct libos_vma* vma, void* arg) { + assert(spinlock_is_locked(&vma_tree_lock)); + + struct libos_handle* hdl = arg; + + if (vma->flags & (VMA_UNMAPPED | VMA_INTERNAL | MAP_ANONYMOUS | MAP_PRIVATE)) + return false; + + assert(vma->file); + + if (hdl && vma->file != hdl) + return false; + + if (!vma->file->fs || !vma->file->fs->fs_ops || !vma->file->fs->fs_ops->msync) + return false; + + if (!(vma->file->acc_mode & MAY_WRITE)) + return false; + + return true; +} + static int msync_all(uintptr_t begin, uintptr_t 
end, struct libos_handle* hdl) { assert(IS_ALLOC_ALIGNED(begin)); assert(end == UINTPTR_MAX || IS_ALLOC_ALIGNED(end)); @@ -1514,8 +1681,11 @@ static int msync_all(uintptr_t begin, uintptr_t end, struct libos_handle* hdl) { /* NOTE: Unfortunately there's a data race here: the memory can be unmapped, or remapped, by * another thread by the time we get to `msync`. */ + if (!vma_info->valid_length) + continue; + uintptr_t msync_begin = MAX(begin, (uintptr_t)vma_info->addr); - uintptr_t msync_end = MIN(end, (uintptr_t)vma_info->addr + vma_info->length); + uintptr_t msync_end = MIN(end, (uintptr_t)vma_info->addr + vma_info->valid_length); assert(IS_ALLOC_ALIGNED(msync_begin)); assert(IS_ALLOC_ALIGNED(msync_end)); @@ -1576,31 +1746,26 @@ BEGIN_CP_FUNC(vma) { if (!vma->file) { /* Send anonymous memory region. */ struct libos_mem_entry* mem; - DO_CP_SIZE(memory, vma->addr, vma->length, &mem); + assert(vma->valid_length == vma->length); + DO_CP_SIZE(memory, vma->addr, vma->valid_length, &mem); mem->prot = LINUX_PROT_TO_PAL(vma->prot, /*map_flags=*/0); } else { - /* Send file-backed memory region. */ - uint64_t file_size = 0; - int ret = get_file_size(vma->file, &file_size); - if (ret < 0) - return ret; - - /* Access beyond the last file-backed page will cause SIGBUS. For reducing fork - * latency, we send only those memory contents of VMA that are backed by the file, - * round up to pages. Rest of VMA memory region will be inaccessible in the child - * process. */ - size_t send_size = vma->length; - if (vma->file_offset + vma->length > file_size) { - send_size = file_size > vma->file_offset ? file_size - vma->file_offset : 0; - send_size = ALLOC_ALIGN_UP(send_size); - } - - /* It may happen that the whole file-backed memory is beyond the file size (e.g., + /* + * Send file-backed memory region. + * + * Access beyond the last file-backed page (reflected via vma->valid_length) should + * cause SIGBUS. 
So we send only those memory contents of VMA that are backed by the + * file, round up to pages. Rest of VMA memory region will be inaccessible in the + * child process. + * + * It may happen that the whole file-backed memory is beyond the file size (e.g., * the file was truncated after the memory was allocated). In this case we consider - * the whole memory region to be inaccessible. */ - if (send_size > 0) { + * the whole memory region to be inaccessible in the child process. + */ + assert(vma->valid_length <= vma->length); + if (vma->valid_length > 0) { struct libos_mem_entry* mem; - DO_CP_SIZE(memory, vma->addr, send_size, &mem); + DO_CP_SIZE(memory, vma->addr, vma->valid_length, &mem); mem->prot = LINUX_PROT_TO_PAL(vma->prot, /*map_flags=*/0); } } @@ -1641,6 +1806,8 @@ BEGIN_RS_FUNC(vma) { if (ret < 0) return ret; + size_t valid_length = vma->valid_length; + if (!(vma->flags & VMA_UNMAPPED) && vma->file) { struct libos_fs* fs = vma->file->fs; get_handle(vma->file); @@ -1650,12 +1817,17 @@ BEGIN_RS_FUNC(vma) { if (!fs || !fs->fs_ops || !fs->fs_ops->mmap) return -EINVAL; - int ret = fs->fs_ops->mmap(vma->file, vma->addr, vma->length, vma->prot, - vma->flags | MAP_FIXED, vma->file_offset); + ret = fs->fs_ops->mmap(vma->file, vma->addr, vma->length, vma->prot, + vma->flags | MAP_FIXED, vma->file_offset, &valid_length); if (ret < 0) return ret; } } + + assert(valid_length <= vma->length); + ret = bkeep_vma_update_valid_length(vma->addr, valid_length); + if (ret < 0) return ret; } END_RS_FUNC(vma) @@ -1688,16 +1860,17 @@ END_CP_FUNC_NO_RS(all_vmas) static void debug_print_vma(struct libos_vma* vma) { - log_always("[0x%lx-0x%lx] prot=0x%x flags=0x%x%s%s file=%p (offset=%ld)%s%s", - vma->begin, vma->end, - vma->prot, - vma->flags & ~(VMA_INTERNAL | VMA_UNMAPPED), - vma->flags & VMA_UNMAPPED ? "UNMAPPED)" : ")", - vma->file, - vma->offset, - vma->comment[0] ? " comment=" : "", - vma->comment[0] ? 
vma->comment : ""); + log_always( + "[all=0x%lx-0x%lx; valid=0x%lx-0x%lx] prot=0x%x flags=0x%x%s%s file=%p (offset=%ld)%s%s", + vma->begin, vma->end, vma->begin, vma->valid_end, + vma->prot, + vma->flags & ~(VMA_INTERNAL | VMA_UNMAPPED), + vma->flags & VMA_INTERNAL ? "(INTERNAL " : "(", + vma->flags & VMA_UNMAPPED ? "UNMAPPED)" : ")", + vma->file, + vma->offset, + vma->comment[0] ? " comment=" : "", + vma->comment[0] ? vma->comment : ""); } void debug_print_all_vmas(void) { diff --git a/libos/src/fs/chroot/encrypted.c b/libos/src/fs/chroot/encrypted.c index 5f6e37113b..7c52991129 100644 --- a/libos/src/fs/chroot/encrypted.c +++ b/libos/src/fs/chroot/encrypted.c @@ -488,10 +488,19 @@ static ssize_t chroot_encrypted_write(struct libos_handle* hdl, const void* buf, if (hdl->inode->size < *pos) hdl->inode->size = *pos; + size_t new_size = hdl->inode->size; unlock(&hdl->inode->lock); - /* If there are any MAP_SHARED mappings for the file, this will read data from `enc`. */ if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + /* There are mappings for the file, refresh their access protections. */ + ret = prot_refresh_mmaped_from_file_handle(hdl, new_size); + if (ret < 0) { + log_error("refreshing page protections of mmapped regions of file failed: %s", + unix_strerror(ret)); + BUG(); + } + + /* There are mappings for the file, read data from `enc` (only for MAP_SHARED mappings). 
*/ ret = reload_mmaped_from_file_handle(hdl); if (ret < 0) { log_error("reload mmapped regions of file failed: %s", unix_strerror(ret)); @@ -515,11 +524,25 @@ static int chroot_encrypted_truncate(struct libos_handle* hdl, file_off_t size) lock(&hdl->inode->lock); ret = encrypted_file_set_size(enc, size); - if (ret == 0) - hdl->inode->size = size; + if (ret < 0) { + unlock(&hdl->inode->lock); + return ret; + } + + hdl->inode->size = size; unlock(&hdl->inode->lock); - return ret; + if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + /* There are mappings for the file, refresh their access protections. */ + ret = prot_refresh_mmaped_from_file_handle(hdl, size); + if (ret < 0) { + log_error("refreshing page protections of mmapped regions of file failed: %s", + unix_strerror(ret)); + BUG(); + } + } + + return 0; } static int chroot_encrypted_stat(struct libos_dentry* dent, struct stat* buf) { diff --git a/libos/src/fs/chroot/fs.c b/libos/src/fs/chroot/fs.c index 3662b270ed..384dc7c877 100644 --- a/libos/src/fs/chroot/fs.c +++ b/libos/src/fs/chroot/fs.c @@ -234,17 +234,28 @@ static ssize_t chroot_write(struct libos_handle* hdl, const void* buf, size_t co return pal_to_unix_errno(ret); } assert(actual_count <= count); + + size_t new_size = 0; if (hdl->inode->type == S_IFREG) { *pos += actual_count; /* Update file size if we just wrote past the end of file */ lock(&hdl->inode->lock); if (hdl->inode->size < *pos) hdl->inode->size = *pos; + new_size = hdl->inode->size; unlock(&hdl->inode->lock); } - /* If there are any MAP_SHARED mappings for the file, this will read data from `hdl`. */ if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + /* There are mappings for the file, refresh their access protections. 
*/ + ret = prot_refresh_mmaped_from_file_handle(hdl, new_size); + if (ret < 0) { + log_error("refreshing page protections of mmapped regions of file failed: %s", + unix_strerror(ret)); + BUG(); + } + + /* There are mappings for the file, read data from `hdl` (only for MAP_SHARED mappings). */ ret = reload_mmaped_from_file_handle(hdl); if (ret < 0) { log_error("reload mmapped regions of file failed: %s", unix_strerror(ret)); @@ -255,23 +266,6 @@ static ssize_t chroot_write(struct libos_handle* hdl, const void* buf, size_t co return (ssize_t)actual_count; } -static int chroot_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, - uint64_t offset) { - assert(hdl->type == TYPE_CHROOT); - assert(addr); - - pal_prot_flags_t pal_prot = LINUX_PROT_TO_PAL(prot, flags); - - if (flags & MAP_ANONYMOUS) - return -EINVAL; - - int ret = PalStreamMap(hdl->pal_handle, addr, pal_prot, offset, size); - if (ret < 0) - return pal_to_unix_errno(ret); - - return 0; -} - int chroot_readdir(struct libos_dentry* dent, readdir_callback_t callback, void* arg) { int ret; PAL_HANDLE palhdl; @@ -419,7 +413,8 @@ struct libos_fs_ops chroot_fs_ops = { .flush = &chroot_flush, .read = &chroot_read, .write = &chroot_write, - .mmap = &chroot_mmap, + .mmap = &generic_emulated_mmap, + .msync = &generic_emulated_msync, /* TODO: this function emulates lseek() completely inside the LibOS, but some device files may * report size == 0 during fstat() and may provide device-specific lseek() logic; this emulation * breaks for such device-specific cases */ diff --git a/libos/src/fs/libos_fs_util.c b/libos/src/fs/libos_fs_util.c index b5c9a22926..e584940239 100644 --- a/libos/src/fs/libos_fs_util.c +++ b/libos/src/fs/libos_fs_util.c @@ -7,6 +7,7 @@ #include "libos_flags_conv.h" #include "libos_fs.h" #include "libos_lock.h" +#include "libos_vma.h" #include "stat.h" int generic_seek(file_off_t pos, file_off_t size, file_off_t offset, int origin, @@ -136,8 +137,9 @@ int 
generic_inode_poll(struct libos_handle* hdl, int in_events, int* out_events) } int generic_emulated_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, - uint64_t offset) { - assert(addr); + uint64_t offset, size_t* out_valid_size) { + assert(addr && IS_ALLOC_ALIGNED_PTR(addr)); + assert(IS_ALLOC_ALIGNED(size)); int ret; @@ -148,11 +150,11 @@ int generic_emulated_mmap(struct libos_handle* hdl, void* addr, size_t size, int if (ret < 0) return pal_to_unix_errno(ret); - size_t read_size = size; + size_t size_to_read = size; char* read_addr = addr; file_off_t pos = offset; - while (read_size > 0) { - ssize_t count = hdl->fs->fs_ops->read(hdl, read_addr, read_size, &pos); + while (size_to_read > 0) { + ssize_t count = hdl->fs->fs_ops->read(hdl, read_addr, size_to_read, &pos); if (count < 0) { if (count == -EINTR) continue; @@ -163,8 +165,8 @@ int generic_emulated_mmap(struct libos_handle* hdl, void* addr, size_t size, int if (count == 0) break; - assert((size_t)count <= read_size); - read_size -= count; + assert((size_t)count <= size_to_read); + size_to_read -= count; read_addr += count; } @@ -176,6 +178,22 @@ int generic_emulated_mmap(struct libos_handle* hdl, void* addr, size_t size, int } } + /* + * Underlying file may be shorter than the requested mmap size. In this case access beyond the + * last file-backed page must cause SIGBUS. Since we allocated all memory above, let's make the + * chunk of memory that is beyond the last file-backed page unavailable. Also see checkpointing + * logic in libos_vma.c for similar emulation in the child process. 
+ */ + assert(size_to_read <= size); + size_t valid_size = ALLOC_ALIGN_UP(size - size_to_read); + if (valid_size < size) { + int valid_ret = PalVirtualMemoryProtect(addr + valid_size, size - valid_size, + /*prot=*/0); + if (valid_ret < 0) + BUG(); + } + + *out_valid_size = valid_size; return 0; err:; @@ -244,11 +262,23 @@ int generic_emulated_msync(struct libos_handle* hdl, void* addr, size_t size, in int generic_truncate(struct libos_handle* hdl, file_off_t size) { lock(&hdl->inode->lock); int ret = PalStreamSetLength(hdl->pal_handle, size); - if (ret == 0) { - hdl->inode->size = size; - } else { - ret = pal_to_unix_errno(ret); + if (ret < 0) { + unlock(&hdl->inode->lock); + return pal_to_unix_errno(ret); } + + hdl->inode->size = size; unlock(&hdl->inode->lock); - return ret; + + if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + /* There are mappings for the file, refresh their access protections. */ + ret = prot_refresh_mmaped_from_file_handle(hdl, size); + if (ret < 0) { + log_error("refreshing page protections of mmapped regions of file failed: %s", + unix_strerror(ret)); + BUG(); + } + } + + return 0; } diff --git a/libos/src/fs/shm/fs.c b/libos/src/fs/shm/fs.c index b696f16b52..b226c2ae5a 100644 --- a/libos/src/fs/shm/fs.c +++ b/libos/src/fs/shm/fs.c @@ -35,7 +35,7 @@ static int shm_mount(struct libos_mount_params* params, void** mount_data) { } static int shm_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, - uint64_t offset) { + uint64_t offset, size_t* out_valid_size) { assert(hdl->type == TYPE_SHM); assert(addr); @@ -44,10 +44,11 @@ static int shm_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, if (flags & MAP_ANONYMOUS) return -EINVAL; - int ret = PalStreamMap(hdl->pal_handle, addr, pal_prot, offset, size); + int ret = PalDeviceMap(hdl->pal_handle, addr, pal_prot, offset, size); if (ret < 0) return pal_to_unix_errno(ret); + *out_valid_size = size; return 0; } diff --git 
a/libos/src/fs/tmpfs/fs.c b/libos/src/fs/tmpfs/fs.c index e9ae4a6761..ef109d407d 100644 --- a/libos/src/fs/tmpfs/fs.c +++ b/libos/src/fs/tmpfs/fs.c @@ -271,10 +271,19 @@ static ssize_t tmpfs_write(struct libos_handle* hdl, const void* buf, size_t siz inode->mtime = time_us / USEC_IN_SEC; /* keep `ret` */ + size_t new_size = inode->size; unlock(&inode->lock); - /* If there are any MAP_SHARED mappings for the file, this will read data from `hdl`. */ if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + /* There are mappings for the file, refresh their access protections. */ + int refresh_ret = prot_refresh_mmaped_from_file_handle(hdl, new_size); + if (refresh_ret < 0) { + log_error("refreshing page protections of mmapped regions of file failed: %s", + unix_strerror(refresh_ret)); + BUG(); + } + + /* There are mappings for the file, read data from `hdl` (only for MAP_SHARED mappings). */ int reload_ret = reload_mmaped_from_file_handle(hdl); if (reload_ret < 0) { log_error("reload mmapped regions of file failed: %s", unix_strerror(reload_ret)); @@ -296,16 +305,26 @@ static int tmpfs_truncate(struct libos_handle* hdl, file_off_t size) { struct libos_mem_file* mem = hdl->inode->data; ret = mem_file_truncate(mem, size); - if (ret < 0) - goto out; + if (ret < 0) { + unlock(&hdl->inode->lock); + return ret; + } hdl->inode->mtime = time_us / USEC_IN_SEC; hdl->inode->size = size; - ret = 0; - -out: unlock(&hdl->inode->lock); - return ret; + + if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + /* There are mappings for the file, refresh their access protections. 
*/ + ret = prot_refresh_mmaped_from_file_handle(hdl, size); + if (ret < 0) { + log_error("refreshing page protections of mmapped regions of file failed: %s", + unix_strerror(ret)); + BUG(); + } + } + + return 0; } struct libos_fs_ops tmp_fs_ops = { diff --git a/libos/src/libos_rtld.c b/libos/src/libos_rtld.c index 5115d44ce5..baa08f9e30 100644 --- a/libos/src/libos_rtld.c +++ b/libos/src/libos_rtld.c @@ -257,17 +257,25 @@ static int execute_loadcmd(const struct loadcmd* c, elf_addr_t base_diff, void* map_start = (void*)(c->start + base_diff); size_t map_size = c->map_end - c->start; - if ((ret = bkeep_mmap_fixed(map_start, map_size, c->prot, map_flags, file, c->map_off, - /*comment=*/NULL)) < 0) { + ret = bkeep_mmap_fixed(map_start, map_size, c->prot, map_flags, file, c->map_off, + /*comment=*/NULL); + if (ret < 0) { log_debug("failed to bookkeep address of segment"); return ret; } - if ((ret = file->fs->fs_ops->mmap(file, map_start, map_size, c->prot, map_flags, - c->map_off)) < 0) { + size_t valid_size; + ret = file->fs->fs_ops->mmap(file, map_start, map_size, c->prot, map_flags, c->map_off, + &valid_size); + if (ret < 0) { log_debug("failed to map segment: %s", unix_strerror(ret)); return ret; } + if (valid_size != map_size) { + log_debug("failed to map segment: expected to map %lu bytes but mapped only %lu bytes", + map_size, valid_size); + return -EACCES; + } } /* Zero out the extra data at the end of mapped area. 
If necessary, temporarily remap the last diff --git a/libos/src/sys/libos_mmap.c b/libos/src/sys/libos_mmap.c index fa04766a63..93b9aa58db 100644 --- a/libos/src/sys/libos_mmap.c +++ b/libos/src/sys/libos_mmap.c @@ -255,7 +255,16 @@ void* libos_syscall_mmap(void* addr, size_t length, int prot, int flags, int fd, } } } else { - ret = hdl->fs->fs_ops->mmap(hdl, addr, length, prot, flags, offset); + size_t valid_length; + ret = hdl->fs->fs_ops->mmap(hdl, addr, length, prot, flags, offset, &valid_length); + if (ret == 0) { + int update_valid_length_ret = bkeep_vma_update_valid_length(addr, valid_length); + if (update_valid_length_ret < 0) { + log_error("[mmap] Failed to update valid length to %lu of bookkeeped memory %p-%p!", + valid_length, addr, (char*)addr + length); + BUG(); + } + } } if (ret < 0) { @@ -539,10 +548,6 @@ long libos_syscall_msync(unsigned long start, size_t len_orig, int flags) { return -ENOSYS; } - if (flags & MS_SYNC) { - return msync_range(start, start + len); - } else { - /* `MS_ASYNC` is a no-op on Linux. */ - return 0; - } + /* `MS_ASYNC` is emulated as `MS_SYNC`; this sacrifices performance for correctness. 
*/ + return msync_range(start, start + len); } diff --git a/libos/test/fs/test_enc.py b/libos/test/fs/test_enc.py index 6eccc02e19..3b83a33879 100644 --- a/libos/test/fs/test_enc.py +++ b/libos/test/fs/test_enc.py @@ -91,33 +91,6 @@ def test_100_open_close(self): self.verify_open_close(stdout, stderr, output_path, 'output') self.assertTrue(os.path.isfile(output_path)) - # overrides TC_00_FileSystem to not skip Gramine-SGX - def test_111_read_write_mmap(self): - file_path = os.path.join(self.OUTPUT_DIR, 'test_111') # new file to be created - stdout, stderr = self.run_binary(['read_write_mmap', file_path]) - size = '1048576' - self.assertNotIn('ERROR: ', stderr) - self.assertTrue(os.path.isfile(file_path)) - - self.assertIn('open(' + file_path + ') RW (mmap) OK', stdout) - self.assertIn('mmap_fd(' + size + ') OK', stdout) - self.assertIn('read(' + file_path + ') 1 RW (mmap) OK', stdout) - self.assertIn('seek(' + file_path + ') 1 RW (mmap) OK', stdout) - self.assertIn('write(' + file_path + ') RW (mmap) OK', stdout) - self.assertIn('seek(' + file_path + ') 2 RW (mmap) OK', stdout) - self.assertIn('read(' + file_path + ') 2 RW (mmap) OK', stdout) - self.assertIn('compare(' + file_path + ') RW (mmap) OK', stdout) - self.assertIn('munmap_fd(' + size + ') OK', stdout) - self.assertIn('close(' + file_path + ') RW (mmap) OK', stdout) - - self.assertIn('open(' + file_path + ') RW fd1 (mmap) OK', stdout) - self.assertIn('open(' + file_path + ') RW fd2 OK', stdout) - self.assertIn('mmap_fd(' + size + ') fd1 OK', stdout) - self.assertIn('write(' + file_path + ') RW fd2 OK', stdout) - self.assertIn('munmap_fd(' + size + ') fd1 OK', stdout) - self.assertIn('close(' + file_path + ') RW fd1 (mmap) OK', stdout) - self.assertIn('close(' + file_path + ') RW fd2 OK', stdout) - # overrides TC_00_FileSystem to change input dir (from plaintext to encrypted) def test_115_seek_tell(self): # the test binary expects a path to read-only (existing) file and two paths to files that @@ -186,18 
+159,6 @@ def do_copy_test(self, executable, timeout): timeout=timeout) self.verify_copy(stdout, stderr, self.ENCRYPTED_DIR, executable) - # overrides TC_00_FileSystem to not skip this on SGX - def test_204_copy_dir_mmap_whole(self): - self.do_copy_test('copy_mmap_whole', 30) - - # overrides TC_00_FileSystem to not skip this on SGX - def test_205_copy_dir_mmap_seq(self): - self.do_copy_test('copy_mmap_seq', 60) - - # overrides TC_00_FileSystem to not skip this on SGX - def test_206_copy_dir_mmap_rev(self): - self.do_copy_test('copy_mmap_rev', 60) - # overrides TC_00_FileSystem to change dirs (from plaintext to encrypted) def test_210_copy_dir_mounted(self): executable = 'copy_whole' diff --git a/libos/test/fs/test_fs.py b/libos/test/fs/test_fs.py index 9aa8e77655..2fb768d252 100644 --- a/libos/test/fs/test_fs.py +++ b/libos/test/fs/test_fs.py @@ -95,11 +95,6 @@ def test_110_read_write(self): self.assertIn('compare(' + file_path + ') RW OK', stdout) self.assertIn('close(' + file_path + ') RW OK', stdout) - # Gramine's implementation of file_map doesn't currently support shared memory-mapped regular - # chroot files with write permission in PAL/Linux-SGX (like mmap(PROT_WRITE, MAP_SHARED, fd)). - # Below test requires it, so skip it. We decided not to implement it as we don't know any - # workload using it. - @unittest.skipIf(HAS_SGX, 'mmap(PROT_WRITE, MAP_SHARED, fd) not implemented in Linux-SGX PAL') def test_111_read_write_mmap(self): file_path = os.path.join(self.OUTPUT_DIR, 'test_111') # new file to be created stdout, stderr = self.run_binary(['read_write_mmap', file_path]) @@ -317,19 +312,12 @@ def test_202_copy_dir_rev(self): def test_203_copy_dir_sendfile(self): self.do_copy_test('copy_sendfile', 60) - # Gramine's implementation of file_map doesn't currently support shared memory-mapped - # files with write permission in PAL/Linux-SGX (like mmap(PROT_WRITE, MAP_SHARED, fd)). - # These tests require it, so skip them. 
We decided not to implement it as we don't - # know any workload using it. - @unittest.skipIf(HAS_SGX, 'mmap(PROT_WRITE, MAP_SHARED, fd) not implemented in Linux-SGX PAL') def test_204_copy_dir_mmap_whole(self): self.do_copy_test('copy_mmap_whole', 30) - @unittest.skipIf(HAS_SGX, 'mmap(PROT_WRITE, MAP_SHARED, fd) not implemented in Linux-SGX PAL') def test_205_copy_dir_mmap_seq(self): self.do_copy_test('copy_mmap_seq', 60) - @unittest.skipIf(HAS_SGX, 'mmap(PROT_WRITE, MAP_SHARED, fd) not implemented in Linux-SGX PAL') def test_206_copy_dir_mmap_rev(self): self.do_copy_test('copy_mmap_rev', 60) diff --git a/libos/test/fs/test_tmpfs.py b/libos/test/fs/test_tmpfs.py index a57d080c40..67edeb1646 100644 --- a/libos/test/fs/test_tmpfs.py +++ b/libos/test/fs/test_tmpfs.py @@ -46,7 +46,7 @@ def test_110_read_write(self): self.assertIn('compare(' + file_path + ') RW OK', stdout) self.assertIn('close(' + file_path + ') RW OK', stdout) - # overrides TC_00_FileSystem to not skip Gramine-SGX and to skip verification of file existence + # overrides TC_00_FileSystem to skip verification of file existence def test_111_read_write_mmap(self): file_path = os.path.join(self.OUTPUT_DIR, 'test_111') # new file to be created stdout, stderr = self.run_binary(['read_write_mmap', file_path]) @@ -92,18 +92,6 @@ def test_140_file_truncate(self): def verify_copy_content(self, input_path, output_path): pass - # This overrides parent class to remove @expectedFailureIf(HAS_SGX) - def test_204_copy_dir_mmap_whole(self): - self.do_copy_test('copy_mmap_whole', 30) - - # This overrides parent class to remove @expectedFailureIf(HAS_SGX) - def test_205_copy_dir_mmap_seq(self): - self.do_copy_test('copy_mmap_seq', 60) - - # This overrides parent class to remove @expectedFailureIf(HAS_SGX) - def test_206_copy_dir_mmap_rev(self): - self.do_copy_test('copy_mmap_rev', 60) - @unittest.skip("not applicable for tmpfs") def test_210_copy_dir_mounted(self): 
test_fs.TC_00_FileSystem.test_210_copy_dir_mounted(self) diff --git a/libos/test/ltp/manifest.template b/libos/test/ltp/manifest.template index c939b0c99f..ea76c87326 100644 --- a/libos/test/ltp/manifest.template +++ b/libos/test/ltp/manifest.template @@ -14,7 +14,8 @@ fs.mounts = [ { path = "{{ arch_libdir }}", uri = "file:{{ arch_libdir }}" }, { path = "/usr", uri = "file:/usr" }, { path = "/tmp", uri = "file:/tmp" }, - # for tests that require POSIX shared memory support + + # many LTP multi-process tests rely on shared-memory IPC via `mmap(MAP_SHARED, )` { type = "untrusted_shm", path = "/dev/shm", uri = "dev:/dev/shm" }, ] @@ -31,7 +32,7 @@ sgx.use_exinfo = true sgx.allowed_files = [ "file:/tmp", - "dev:/dev/shm/", # for tests that require POSIX shared memory support + "dev:/dev/shm/", # for tests that rely on shared-memory IPC, see note above ] sgx.trusted_files = [ diff --git a/libos/test/regression/manifest.template b/libos/test/regression/manifest.template index 25ba7a5602..ab236b1cf2 100644 --- a/libos/test/regression/manifest.template +++ b/libos/test/regression/manifest.template @@ -23,11 +23,12 @@ fs.mounts = [ sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '16' }} sgx.debug = true sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }} +sgx.use_exinfo = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }} sgx.allowed_files = [ "file:tmp/", "file:root", # for getdents test - "file:testfile", # for mmap_file test + "file:testfile", # for large_mmap test "file:scripts/", # for exec_script test ] diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build index d5bfa56b5e..cbd141d51f 100644 --- a/libos/test/regression/meson.build +++ b/libos/test/regression/meson.build @@ -70,9 +70,9 @@ tests = { 'large_mmap': {}, 'madvise': {}, 'mkfifo': {}, - 'mmap_file': {}, 'mmap_file_backed': {}, 'mmap_file_emulated': {}, + 'mmap_file_sigbus': {}, 'mock_syscalls': {}, 'mprotect_file_fork': {}, 
'mprotect_prot_growsdown': {}, diff --git a/libos/test/regression/mmap_file.c b/libos/test/regression/mmap_file.c deleted file mode 100644 index dfe17eb07c..0000000000 --- a/libos/test/regression/mmap_file.c +++ /dev/null @@ -1,101 +0,0 @@ -/* TODO: Hans, get ze flammenwerfer... */ -#define _GNU_SOURCE -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <sys/mman.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <unistd.h> - -static const char* message; - -static void SIGBUS_handler(int sig) { - puts(message); - exit(0); -} - -int main(int argc, const char** argv) { - setbuf(stdout, NULL); - setbuf(stderr, NULL); - int rv; - - FILE* fp = fopen("testfile", "w+"); - if (!fp) { - perror("fopen"); - return 1; - } - - long page_size = sysconf(_SC_PAGESIZE); - if (page_size < 0) { - perror("sysconf"); - return 1; - } - long quarter_page = page_size / 4; - - rv = ftruncate(fileno(fp), quarter_page); - if (rv) { - perror("ftruncate"); - return 1; - } - - volatile unsigned char* a = - mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE, fileno(fp), 0); - if (a == MAP_FAILED) { - perror("mmap"); - return 1; - } - - a[quarter_page - 1] = 0xff; - a[page_size - 1] = 0xff; - - __asm__ volatile("nop" ::: "memory"); - - int pid = fork(); - if (pid == -1) { - perror("fork"); - return 1; - } - if (pid != 0) { - rv = waitpid(pid, NULL, 0); - if (rv == -1) { - perror("waitpid"); - return 1; - } - } - - __asm__ volatile("nop" ::: "memory"); - - a[0] = 0xff; - printf(pid == 0 ? "mmap test 1 passed\n" : "mmap test 6 passed\n"); - a[quarter_page] = 0xff; - printf(pid == 0 ? "mmap test 2 passed\n" : "mmap test 7 passed\n"); - - __asm__ volatile("nop" ::: "memory"); - - if (pid == 0) { - if (a[quarter_page - 1] == 0xff) - printf("mmap test 3 passed\n"); - if (a[page_size - 1] == 0xff) - printf("mmap test 4 passed\n"); - } - - __asm__ volatile("nop" ::: "memory"); - - if (signal(SIGBUS, SIGBUS_handler) == SIG_ERR) { - perror("signal"); - return 1; - } - - message = pid == 0 ? 
"mmap test 5 passed\n" : "mmap test 8 passed\n"; - /* need a barrier to assign message before SIGBUS due to a[page_size] */ - __asm__ volatile("nop" ::: "memory"); - a[page_size] = 0xff; - - if (signal(SIGBUS, SIG_DFL) == SIG_ERR) { - perror("signal"); - return 1; - } - - return 0; -} diff --git a/libos/test/regression/mmap_file_sigbus.c b/libos/test/regression/mmap_file_sigbus.c new file mode 100644 index 0000000000..f3cd3a3f25 --- /dev/null +++ b/libos/test/regression/mmap_file_sigbus.c @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation */ + +/* + * Perform a 2-page file-backed mmap on a file with 1-page size. Verify the following: + * + * - Behavior when accessing the mmapped regions + * - accessing the first 1-page region succeeds + * - accessing the second 1-page region results in SIGBUS + * + * - Behavior when using the mmapped regions as a buffer to a syscall + * - specifying the whole 2-page region succeeds (writes 1 page) (yes, that's how Linux works) + * - specifying the first 1-page region succeeds (writes 1 page) + * - specifying the second 1-page region results in -EFAULT + * + * - Behavior when using the mmapped regions with madvise(MADV_DONTNEED) + * - madvise(MADV_DONTNEED) on the first 1-page region succeeds + * - madvise(MADV_DONTNEED) on the second 1-page region succeeds (yes, that's how Linux works) + * - accessing the first 1-page region after madvise(MADV_DONTNEED) succeeds + * - accessing the second 1-page region after madvise(MADV_DONTNEED) results in SIGBUS + * + * This test can be run as single-process (last argument == "nofork") or as multi-process (last + * argument == "fork"). In the latter case, mmap happens in the parent process and all tests happen + * in the child process, i.e. the test verifies that mmaped region was correctly sent to child. 
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +/* this test can be augmented to run on any arch, but we currently only care about x86-64 */ +#ifndef __x86_64__ +#error Unsupported architecture +#endif + +uint64_t mem_read(void* addr) __attribute__((visibility("internal"))); +void ret(void) __attribute__((visibility("internal"))); +__asm__ ( +".pushsection .text\n" +".type mem_read, @function\n" +".type ret, @function\n" +"mem_read:\n" + "movq (%rdi), %rax\n" +"ret:\n" + "ret\n" +".popsection\n" +); + +static int g_sigbus_triggered = 0; + +static void sigbus_handler(int signum, siginfo_t* si, void* uc) { + if (signum != SIGBUS) { + /* we registered a SIGBUS handler but got another signal?! */ + _Exit(1); + } + + uint64_t rip = ((ucontext_t*)uc)->uc_mcontext.gregs[REG_RIP]; + if (rip != (uint64_t)(mem_read)) + _Exit(1); + + g_sigbus_triggered++; + + ((ucontext_t*)uc)->uc_mcontext.gregs[REG_RAX] = 0xdeadbeef; + ((ucontext_t*)uc)->uc_mcontext.gregs[REG_RIP] = (uint64_t)ret; +} + +static void run_tests(char* m, const char* write_path) { + size_t page_size = getpagesize(); + + void* addr_page1 = &m[0]; + void* addr_page2 = &m[page_size]; + + /* test 1: access memory regions (first page succeeds, second page raises SIGBUS) */ + g_sigbus_triggered = 0; + + uint64_t x; + x = mem_read(addr_page1); + if (x == 0xdeadbeef) + errx(1, "read returned value reserved for invalid accesses: %lx", x); + if (g_sigbus_triggered != 0) + errx(1, "expected no SIGBUS, got %d", g_sigbus_triggered); + x = mem_read(addr_page2); + if (x != 0xdeadbeef) + errx(1, "read did not return value reserved for invalid accesses but instead: %lx", x); + if (g_sigbus_triggered != 1) + errx(1, "expected 1 SIGBUS, got %d", g_sigbus_triggered); + + /* test 2: specify memory regions as buffer to a syscall */ + int write_fd = CHECK(open(write_path, O_WRONLY | O_CREAT | O_TRUNC, 0660)); + 
+ ssize_t ret; +#if 0 + /* + * FIXME: Linux writes until the first memory fault, i.e. until the second page. Gramine + * on SGX (with EDMM) doesn't currently comply with this behavior: this would require + * intercepting memory faults, realizing that we're inside a system call and that a + * user-supplied buffer raised this fault, and instructing the syscall to return a + * partial success. Instead, Gramine returns -EFAULT when a buffer with an invalid memory + * region is detected. + * + * Note that Linux returns -EFAULT if the memory fault is raised before any data was + * written, see write(write_fd, addr_page2, page_size) below. This is similar to Gramine. + * + * Also see https://yarchive.net/comp/linux/partial_reads_writes.html for Linux history. + */ + ret = write(write_fd, addr_page1, page_size * 2); + if (ret != (ssize_t)page_size) + errx(1, "write(2 pages): expected 1-page write, got ret=%ld, errno=%d", ret, errno); +#endif + ret = write(write_fd, addr_page1, page_size); + if (ret != (ssize_t)page_size) + errx(1, "write(valid page): expected 1-page write, got ret=%ld, errno=%d", ret, errno); + ret = write(write_fd, addr_page2, page_size); + if (ret != -1 || errno != EFAULT) + errx(1, "write(invalid page): expected EFAULT, got ret=%ld, errno=%d", ret, errno); + + CHECK(close(write_fd)); + CHECK(unlink(write_path)); + + /* test 3: specify memory regions in madvise(MADV_DONTNEED) and access them */ + ret = madvise(addr_page1, page_size, MADV_DONTNEED); + if (ret != 0) + errx(1, "madvise(valid page): expected success, got ret=%ld, errno=%d", ret, errno); + ret = madvise(addr_page2, page_size, MADV_DONTNEED); + if (ret != 0) + errx(1, "madvise(invalid page): expected success, got ret=%ld, errno=%d", ret, errno); + + g_sigbus_triggered = 0; + x = mem_read(addr_page1); + if (x == 0xdeadbeef) + errx(1, "(after madvise) read returned value reserved for invalid accesses: %lx", x); + if (g_sigbus_triggered != 0) + errx(1, "expected no SIGBUS, got %d", 
g_sigbus_triggered); + x = mem_read(addr_page2); + if (x != 0xdeadbeef) + errx(1, "(after madvise) read did not return value reserved for invalid accesses but " + "instead: %lx", x); + if (g_sigbus_triggered != 1) + errx(1, "expected 1 SIGBUS, got %d", g_sigbus_triggered); +} + +int main(int argc, char** argv) { + size_t page_size = getpagesize(); + + if (argc != 4) { + errx(1, "Usage: %s ", + argv[0]); + } + + const char* path1 = argv[1]; + const char* path2 = argv[2]; + + bool do_fork; + if (strcmp(argv[3], "fork") == 0) { + do_fork = true; + } else if (strcmp(argv[3], "nofork") == 0) { + do_fork = false; + } else { + errx(1, "Did not recognize 3rd argument (can be only fork/nofork, but got %s)", argv[3]); + } + + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_RESTART | SA_SIGINFO, + }; + CHECK(sigaction(SIGBUS, &sa, NULL)); + + /* we assume that Pytest creates the 1-page file before running this test; note that we can't + * create the file and ftruncate it as it would require the file to be writable -- this won't + * allow to test madvise(MADV_DONTNEED) as Gramine doesn't support it on writable files */ + int fd = CHECK(open(path1, O_RDONLY)); + + struct stat st; + CHECK(stat(path1, &st)); + if (st.st_size != (ssize_t)page_size) + errx(1, "stat: got 0x%lx, expected 0x%lx", st.st_size, page_size); + + char* m = (char*)mmap(NULL, page_size * 2, PROT_READ, MAP_PRIVATE, fd, 0); + if (m == MAP_FAILED) + err(1, "mmap()"); + + if (!do_fork) { + /* single-process test: run all tests in this main (and only) process */ + run_tests(m, path2); + } else { + /* multi-process test: run all tests in the child process */ + int pid = CHECK(fork()); + if (pid == 0) { + run_tests(m, path2); + puts("CHILD OK"); + } else { + int status = 0; + CHECK(wait(&status)); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + errx(1, "child wait status: %#x", status); + puts("PARENT OK"); + } + } + + CHECK(close(fd)); + puts("TEST OK"); + return 0; +} diff --git 
a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py index 95aaaa140a..55ed2c56be 100644 --- a/libos/test/regression/test_libos.py +++ b/libos/test/regression/test_libos.py @@ -11,6 +11,7 @@ from graminelibos.regression import ( HAS_AVX, + HAS_EDMM, HAS_SGX, IS_VM, ON_X86, @@ -817,30 +818,31 @@ def test_043_futex_wake_op(self): self.assertIn('Test successful!', stdout) - def test_050_mmap(self): - stdout, _ = self.run_binary(['mmap_file'], timeout=60) - - # Private mmap beyond file range - self.assertIn('mmap test 6 passed', stdout) - self.assertIn('mmap test 7 passed', stdout) - - # Private mmap beyond file range (after fork) - self.assertIn('mmap test 1 passed', stdout) - self.assertIn('mmap test 2 passed', stdout) - self.assertIn('mmap test 3 passed', stdout) - self.assertIn('mmap test 4 passed', stdout) - - # "test 5" and "test 8" are checked below, in test_051_mmap_sgx - - @unittest.skipIf(HAS_SGX, - 'On SGX, SIGBUS isn\'t always implemented correctly, for lack ' - 'of memory protection. 
For now, some of these cases won\'t work.') - def test_051_mmap_sgx(self): - stdout, _ = self.run_binary(['mmap_file'], timeout=60) + def _prepare_mmap_file_sigbus_files(self): + read_path = 'tmp/__mmaptestreadfile__' + if not os.path.exists(read_path): + with open(read_path, "wb") as f: + f.truncate(os.sysconf("SC_PAGE_SIZE")) + write_path = 'tmp/__mmaptestfilewrite__' + if os.path.exists(write_path): + os.unlink(write_path) + return read_path, write_path + + @unittest.skipIf(HAS_SGX and not HAS_EDMM, + 'On SGX without EDMM, SIGBUS cannot be triggered for lack of dynamic memory protection.') + def test_050_mmap_file_sigbus(self): + read_path, write_path = self._prepare_mmap_file_sigbus_files() + stdout, _ = self.run_binary(['mmap_file_sigbus', read_path, write_path, 'nofork']) + self.assertIn('TEST OK', stdout) - # SIGBUS test - self.assertIn('mmap test 5 passed', stdout) - self.assertIn('mmap test 8 passed', stdout) + @unittest.skipIf(HAS_SGX and not HAS_EDMM, + 'On SGX without EDMM, SIGBUS cannot be triggered for lack of dynamic memory protection.') + def test_051_mmap_file_sigbus_child(self): + read_path, write_path = self._prepare_mmap_file_sigbus_files() + stdout, _ = self.run_binary(['mmap_file_sigbus', read_path, write_path, 'fork'], timeout=60) + self.assertIn('PARENT OK', stdout) + self.assertIn('CHILD OK', stdout) + self.assertIn('TEST OK', stdout) @unittest.skipUnless(HAS_SGX, 'Trusted files are only available with SGX') diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml index 00f1f740bc..16033ebc79 100644 --- a/libos/test/regression/tests.toml +++ b/libos/test/regression/tests.toml @@ -73,9 +73,9 @@ manifests = [ "large_mmap", "madvise", "mkfifo", - "mmap_file", "mmap_file_backed", "mmap_file_emulated", + "mmap_file_sigbus", "mock_syscalls", "mprotect_file_fork", "mprotect_prot_growsdown", diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml index e1577fd490..2c5de8d5ee 100644 --- 
a/libos/test/regression/tests_musl.toml +++ b/libos/test/regression/tests_musl.toml @@ -75,9 +75,9 @@ manifests = [ "large_mmap", "madvise", "mkfifo", - "mmap_file", "mmap_file_backed", "mmap_file_emulated", + "mmap_file_sigbus", "mock_syscalls", "mprotect_file_fork", "mprotect_prot_growsdown", diff --git a/pal/include/pal/pal.h b/pal/include/pal/pal.h index 259082fb76..637de0ac15 100644 --- a/pal/include/pal/pal.h +++ b/pal/include/pal/pal.h @@ -410,19 +410,20 @@ enum pal_delete_mode { int PalStreamDelete(PAL_HANDLE handle, enum pal_delete_mode delete_mode); /*! - * \brief Map a file to a virtual memory address in the current process. + * \brief Map a device to a virtual memory address in the current process. * - * \param handle Handle to the stream to be mapped. + * \param handle Handle to the device to be mapped. * \param addr See #PalVirtualMemoryAlloc. * \param prot See #PalVirtualMemoryAlloc. - * \param offset Offset in the stream to be mapped. Must be properly aligned. + * \param offset Offset in the device handle to be mapped. Must be properly aligned. * \param size Size of the requested mapping. Must be non-zero and properly aligned. * * \returns 0 on success, negative error code on failure. * - * Use `PalVirtualMemoryFree` to unmap the file. + * Currently used only by devices (to establish shared memory on device); files emulate this via + * `PalStreamRead` and `PalStreamWrite`. Use `PalVirtualMemoryFree` to unmap this mapping. */ -int PalStreamMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, +int PalDeviceMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, size_t size); /*! 
diff --git a/pal/include/pal_internal.h b/pal/include/pal_internal.h index 8d650629f6..97448703a2 100644 --- a/pal/include/pal_internal.h +++ b/pal/include/pal_internal.h @@ -58,8 +58,7 @@ struct handle_ops { void (*destroy)(PAL_HANDLE handle); /* - * 'map' and 'unmap' will map or unmap the handle into memory space, it's not necessary mapped - * by mmap, so unmap also needs 'handle' to deal with special cases. + * 'map' will map the handle (currently only device handles) into memory space. * * Common PAL code will ensure that address, offset, and size are page-aligned. 'address' * should not be NULL. @@ -175,8 +174,6 @@ int64_t _PalStreamRead(PAL_HANDLE handle, uint64_t offset, uint64_t count, void* int64_t _PalStreamWrite(PAL_HANDLE handle, uint64_t offset, uint64_t count, const void* buf); int _PalStreamAttributesQuery(const char* uri, PAL_STREAM_ATTR* attr); int _PalStreamAttributesQueryByHandle(PAL_HANDLE hdl, PAL_STREAM_ATTR* attr); -int _PalStreamMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, - uint64_t size); int _PalStreamSetLength(PAL_HANDLE handle, uint64_t length); int _PalStreamFlush(PAL_HANDLE handle); int _PalSendHandle(PAL_HANDLE target_process, PAL_HANDLE cargo); @@ -196,6 +193,9 @@ int _PalSocketSend(PAL_HANDLE handle, struct iovec* iov, size_t iov_len, size_t* int _PalSocketRecv(PAL_HANDLE handle, struct iovec* iov, size_t iov_len, size_t* out_total_size, struct pal_socket_addr* addr, bool force_nonblocking); +int _PalDeviceMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, + uint64_t size); + /* PalProcess and PalThread calls */ int _PalThreadCreate(PAL_HANDLE* handle, int (*callback)(void*), void* param); noreturn void _PalThreadExit(int* clear_child_tid); diff --git a/pal/regression/File.c b/pal/regression/File.c index ea3005c2c9..7cfb429c3a 100644 --- a/pal/regression/File.c +++ b/pal/regression/File.c @@ -58,37 +58,6 @@ int main(int argc, char** argv, char** envp) { attr1.pending_size); } - 
/* test file map */ - - uintptr_t mem1_addr; - ret = mem_bkeep_alloc(PAGE_SIZE, &mem1_addr); - if (ret < 0) { - pal_printf("mem_bkeep_alloc failed: %d\n", ret); - return 1; - } - void* mem1 = (void*)mem1_addr; - ret = PalStreamMap(file1, mem1, PAL_PROT_READ | PAL_PROT_WRITECOPY, 0, PAGE_SIZE); - if (ret >= 0 && mem1) { - memcpy(buffer1, mem1, 40); - print_hex("Map Test 1 (0th - 40th): %s\n", buffer1, 40); - - memcpy(buffer2, mem1 + 200, 40); - print_hex("Map Test 2 (200th - 240th): %s\n", buffer2, 40); - - ret = PalVirtualMemoryFree(mem1, PAGE_SIZE); - if (ret < 0) { - pal_printf("PalVirtualMemoryFree failed\n"); - return 1; - } - ret = mem_bkeep_free((uintptr_t)mem1, PAGE_SIZE); - if (ret < 0) { - pal_printf("mem_bkeep_free failed: %d\n", ret); - return 1; - } - } else { - pal_printf("Map Test 1 & 2: Failed to map buffer\n"); - } - PalObjectDestroy(file1); } diff --git a/pal/regression/Symbols.c b/pal/regression/Symbols.c index 5c1e1c4e99..0840af16d1 100644 --- a/pal/regression/Symbols.c +++ b/pal/regression/Symbols.c @@ -24,9 +24,9 @@ int main(int argc, char** argv, char** envp) { PRINT_SYMBOL(PalStreamRead); PRINT_SYMBOL(PalStreamWrite); PRINT_SYMBOL(PalStreamDelete); - PRINT_SYMBOL(PalStreamMap); PRINT_SYMBOL(PalStreamSetLength); PRINT_SYMBOL(PalStreamFlush); + PRINT_SYMBOL(PalDeviceMap); PRINT_SYMBOL(PalSendHandle); PRINT_SYMBOL(PalReceiveHandle); PRINT_SYMBOL(PalStreamAttributesQuery); diff --git a/pal/regression/test_pal.py b/pal/regression/test_pal.py index 0991b52d98..14b74fe347 100644 --- a/pal/regression/test_pal.py +++ b/pal/regression/test_pal.py @@ -149,9 +149,9 @@ class TC_02_Symbols(RegressionTestCase): 'PalStreamRead', 'PalStreamWrite', 'PalStreamDelete', - 'PalStreamMap', 'PalStreamSetLength', 'PalStreamFlush', + 'PalDeviceMap', 'PalSendHandle', 'PalReceiveHandle', 'PalStreamAttributesQuery', @@ -279,14 +279,6 @@ def test_100_file(self): self.assertIn('Query by Handle: type = ', stderr) self.assertIn(', size = {}'.format(len(file_exist)), stderr) 
- # File Mapping - self.assertIn( - 'Map Test 1 (0th - 40th): {}'.format(file_exist[0:40].hex()), - stderr) - self.assertIn( - 'Map Test 2 (200th - 240th): {}'.format(file_exist[200:240].hex()), - stderr) - # Set File Length self.assertEqual( pathlib.Path('file_nonexist.tmp').stat().st_size, diff --git a/pal/src/host/linux-sgx/pal_files.c b/pal/src/host/linux-sgx/pal_files.c index 7f6a01b95b..a12bbbfe30 100644 --- a/pal/src/host/linux-sgx/pal_files.c +++ b/pal/src/host/linux-sgx/pal_files.c @@ -250,146 +250,6 @@ static int file_delete(PAL_HANDLE handle, enum pal_delete_mode delete_mode) { return ret < 0 ? unix_to_pal_error(ret) : 0; } -static int file_map(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, - uint64_t size) { - assert(IS_ALLOC_ALIGNED(offset) && IS_ALLOC_ALIGNED(size)); - int ret; - - uint64_t dummy; - if (__builtin_add_overflow(offset, size, &dummy)) { - return -PAL_ERROR_INVAL; - } - - if (size > SIZE_MAX) { - /* for compatibility with 32-bit systems */ - return -PAL_ERROR_INVAL; - } - - if (!(prot & PAL_PROT_WRITECOPY) && (prot & PAL_PROT_WRITE)) { - log_warning( - "file_map does not currently support writable pass-through mappings on SGX. You " - "may add the PAL_PROT_WRITECOPY (MAP_PRIVATE) flag to your file mapping to keep " - "the writes inside the enclave but they won't be reflected outside of the " - "enclave."); - return -PAL_ERROR_DENIED; - } - - /* Sanity checks. */ - if (!addr || !sgx_is_completely_within_enclave(addr, size)) { - return -PAL_ERROR_INVAL; - } - - if (g_pal_linuxsgx_state.edmm_enabled) { - /* Enclave pages will be written to below, so we must add W permission. 
*/ - ret = sgx_edmm_add_pages((uint64_t)addr, size / PAGE_SIZE, - PAL_TO_SGX_PROT(prot | PAL_PROT_WRITE)); - if (ret < 0) { - return ret; - } - } else { -#ifdef ASAN - asan_unpoison_region((uintptr_t)addr, size); -#endif - } - - if (handle->file.trusted) { - /* case of trusted file: already mmaped in umem, copy from there into enclave memory and - * verify hashes along the way */ - assert(handle->file.chunk_hashes); - - off_t end = MIN(offset + size, handle->file.size); - size_t bytes_filled; - if ((off_t)offset >= end) { - /* file is mmapped at offset beyond file size, there are no trusted-file contents to - * back mmapped enclave pages; this is a legit case, so simply zero out these enclave - * pages and return success */ - bytes_filled = 0; - } else { - off_t aligned_offset = ALIGN_DOWN(offset, TRUSTED_CHUNK_SIZE); - off_t aligned_end = ALIGN_UP(end, TRUSTED_CHUNK_SIZE); - off_t total_size = aligned_end - aligned_offset; - - if ((uint64_t)total_size > SIZE_MAX) { - /* for compatibility with 32-bit systems */ - ret = -PAL_ERROR_INVAL; - goto out; - } - - assert(handle->file.size && handle->file.umem); - ret = copy_and_verify_trusted_file(handle->file.realpath, addr, handle->file.umem, - aligned_offset, aligned_end, offset, end, - handle->file.chunk_hashes, handle->file.size); - if (ret < 0) { - log_error("file_map - copy & verify on trusted file: %s", pal_strerror(ret)); - goto out; - } - - bytes_filled = end - offset; - } - - if (size > bytes_filled) { - /* file ended before all mmapped memory was filled -- remaining memory must be zeroed */ - memset((char*)addr + bytes_filled, 0, size - bytes_filled); - } - } else { - /* case of allowed file: simply read from underlying file descriptor into enclave memory */ - assert(!handle->file.chunk_hashes); - - size_t bytes_read = 0; - while (bytes_read < size) { - size_t read_size = MIN(size - bytes_read, MAX_READ_SIZE); - ssize_t bytes = ocall_pread(handle->file.fd, (char*)addr + bytes_read, read_size, - offset + 
bytes_read); - if (bytes > 0) { - bytes_read += bytes; - } else if (bytes == 0) { - break; /* EOF */ - } else if (bytes == -EINTR || bytes == -EAGAIN) { - continue; - } else { - log_warning("file_map - ocall_pread on allowed file returned %ld", bytes); - ret = unix_to_pal_error(bytes); - goto out; - } - } - - if (size > bytes_read) { - /* file ended before all mmapped memory was filled -- remaining memory must be zeroed */ - memset((char*)addr + bytes_read, 0, size - bytes_read); - } - } - - if (g_pal_linuxsgx_state.edmm_enabled && !(prot & PAL_PROT_WRITE)) { - /* Clear W permission, in case we added it artificially. */ - ret = sgx_edmm_set_page_permissions((uint64_t)addr, size / PAGE_SIZE, - PAL_TO_SGX_PROT(prot)); - if (ret < 0) { - log_error("failed to remove W bit from pages permissions at %p-%p", - (char*)addr, (char*)addr + size); - goto out; - } - } - - ret = 0; - -out: - if (ret < 0) { - if (g_pal_linuxsgx_state.edmm_enabled) { - int tmp_ret = sgx_edmm_remove_pages((uint64_t)addr, size / PAGE_SIZE); - if (tmp_ret < 0) { - log_error("removing previously allocated pages failed: %s (%d)", - pal_strerror(tmp_ret), ret); - die_or_inf_loop(); - } - } else { -#ifdef ASAN - asan_poison_region((uintptr_t)addr, size, ASAN_POISON_USER); -#endif - } - } - return ret; -} - static int file_setlength(PAL_HANDLE handle, uint64_t length) { int ret = ocall_ftruncate(handle->file.fd, length); if (ret < 0) @@ -639,7 +499,6 @@ struct handle_ops g_file_ops = { .write = &file_write, .destroy = &file_destroy, .delete = &file_delete, - .map = &file_map, .setlength = &file_setlength, .flush = &file_flush, .attrquery = &file_attrquery, diff --git a/pal/src/host/linux/pal_files.c b/pal/src/host/linux/pal_files.c index acca5897c9..23860984f9 100644 --- a/pal/src/host/linux/pal_files.c +++ b/pal/src/host/linux/pal_files.c @@ -124,18 +124,6 @@ static int file_delete(PAL_HANDLE handle, enum pal_delete_mode delete_mode) { return 0; } -static int file_map(PAL_HANDLE handle, void* addr, 
pal_prot_flags_t prot, uint64_t offset, - uint64_t size) { - int flags = PAL_MEM_FLAGS_TO_LINUX(prot) | (addr ? MAP_FIXED_NOREPLACE : 0); - int linux_prot = PAL_PROT_TO_LINUX(prot); - - addr = (void*)DO_SYSCALL(mmap, addr, size, linux_prot, flags, handle->file.fd, offset); - if (IS_PTR_ERR(addr)) - return unix_to_pal_error(PTR_TO_ERR(addr)); - - return 0; -} - static int file_setlength(PAL_HANDLE handle, uint64_t length) { int ret = DO_SYSCALL(ftruncate, handle->file.fd, length); return ret < 0 ? unix_to_pal_error(ret) : 0; @@ -370,7 +358,6 @@ struct handle_ops g_file_ops = { .write = &file_write, .destroy = &file_destroy, .delete = &file_delete, - .map = &file_map, .setlength = &file_setlength, .flush = &file_flush, .attrquery = &file_attrquery, diff --git a/pal/src/host/skeleton/pal_files.c b/pal/src/host/skeleton/pal_files.c index edec49bb29..a9666d53cc 100644 --- a/pal/src/host/skeleton/pal_files.c +++ b/pal/src/host/skeleton/pal_files.c @@ -32,11 +32,6 @@ static int file_delete(PAL_HANDLE handle, enum pal_delete_mode delete_mode) { return -PAL_ERROR_NOTIMPLEMENTED; } -static int file_map(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, - uint64_t size) { - return -PAL_ERROR_NOTIMPLEMENTED; -} - static int file_setlength(PAL_HANDLE handle, uint64_t length) { return -PAL_ERROR_NOTIMPLEMENTED; } @@ -89,7 +84,6 @@ struct handle_ops g_file_ops = { .write = &file_write, .destroy = &file_destroy, .delete = &file_delete, - .map = &file_map, .setlength = &file_setlength, .flush = &file_flush, .attrquery = &file_attrquery, diff --git a/pal/src/pal_streams.c b/pal/src/pal_streams.c index 10d342db1f..722a4ff238 100644 --- a/pal/src/pal_streams.c +++ b/pal/src/pal_streams.c @@ -268,7 +268,7 @@ int PalStreamAttributesSetByHandle(PAL_HANDLE handle, PAL_STREAM_ATTR* attr) { return ops->attrsetbyhdl(handle, attr); } -int _PalStreamMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, +int _PalDeviceMap(PAL_HANDLE handle, void* addr, 
pal_prot_flags_t prot, uint64_t offset, uint64_t size) { assert(IS_ALLOC_ALIGNED(offset)); int ret; @@ -289,7 +289,7 @@ int _PalStreamMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t return 0; } -int PalStreamMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, +int PalDeviceMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t offset, size_t size) { if (!handle) { return -PAL_ERROR_INVAL; @@ -307,7 +307,7 @@ int PalStreamMap(PAL_HANDLE handle, void* addr, pal_prot_flags_t prot, uint64_t return -PAL_ERROR_INVAL; } - return _PalStreamMap(handle, addr, prot, offset, size); + return _PalDeviceMap(handle, addr, prot, offset, size); } int _PalStreamSetLength(PAL_HANDLE handle, uint64_t length) { diff --git a/pal/src/pal_symbols b/pal/src/pal_symbols index 92a82cb2cc..eb2d26550d 100644 --- a/pal/src/pal_symbols +++ b/pal/src/pal_symbols @@ -16,10 +16,10 @@ PalStreamsWaitEvents PalStreamOpen PalStreamRead PalStreamWrite -PalStreamMap PalStreamSetLength PalStreamFlush PalStreamDelete +PalDeviceMap PalSocketCreate PalSocketBind PalSocketListen