author     Linus Torvalds <torvalds@linux-foundation.org>  2025-12-06 14:01:20 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-12-06 14:01:20 -0800
commit     509d3f45847627f4c5cdce004c3ec79262b5239c (patch)
tree       3f5d650b393eeb0e560f78958bb20d6645ca55e0 /mm
parent     09670b8c38b37bc2d6fc5d01fa7e02c38f7adf36 (diff)
parent     aa514a297a0c175239f24a2e582ebd37f0727494 (diff)
Merge tag 'mm-nonmm-stable-2025-12-06-11-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull non-MM updates from Andrew Morton:

- "panic: sys_info: Refactor and fix a potential issue" (Andy Shevchenko) fixes a build issue and does some cleanup in lib/sys_info.c

- "Implement mul_u64_u64_div_u64_roundup()" (David Laight) enhances the 64-bit math code on behalf of a PWM driver and beefs up the test module for these library functions (a usage sketch follows the commit list below)

- "scripts/gdb/symbols: make BPF debug info available to GDB" (Ilya Leoshkevich) makes BPF symbol names, sizes, and line numbers available to the GDB debugger

- "Enable hung_task and lockup cases to dump system info on demand" (Feng Tang) adds a sysctl which can be used to cause additional info dumping when the hung-task and lockup detectors fire

- "lib/base64: add generic encoder/decoder, migrate users" (Kuan-Wei Chiu) adds a general base64 encoder/decoder to lib/ and migrates several users away from their private implementations

- "rbtree: inline rb_first() and rb_last()" (Eric Dumazet) makes TCP a little faster

- "liveupdate: Rework KHO for in-kernel users" (Pasha Tatashin) reworks the KEXEC Handover interfaces in preparation for the Live Update Orchestrator (LUO), and possibly for other future clients

- "kho: simplify state machine and enable dynamic updates" (Pasha Tatashin) increases the flexibility of KEXEC Handover. Also preparation for LUO

- "Live Update Orchestrator" (Pasha Tatashin) is a major new feature targeted at cloud environments. Quoting the cover letter:

    This series introduces the Live Update Orchestrator, a kernel
    subsystem designed to facilitate live kernel updates using a
    kexec-based reboot. This capability is critical for cloud
    environments, allowing hypervisors to be updated with minimal
    downtime for running virtual machines.

    LUO achieves this by preserving the state of selected resources,
    such as memory, devices and their dependencies, across the kernel
    transition. As a key feature, this series includes support for
    preserving memfd file descriptors, which allows critical in-memory
    data, such as guest RAM or any other large memory region, to be
    maintained in RAM across the kexec reboot.

  Mike Rapoport merits a mention here, for his extensive review and testing work.

- "kexec: reorganize kexec and kdump sysfs" (Sourabh Jain) moves the kexec and kdump sysfs entries from /sys/kernel/ to /sys/kernel/kexec/ and adds back-compatibility symlinks which can hopefully be removed one day

- "kho: fixes for vmalloc restoration" (Mike Rapoport) fixes a BUG which was being hit during KHO restoration of vmalloc() regions

* tag 'mm-nonmm-stable-2025-12-06-11-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (139 commits)
  calibrate: update header inclusion
  Reinstate "resource: avoid unnecessary lookups in find_next_iomem_res()"
  vmcoreinfo: track and log recoverable hardware errors
  kho: fix restoring of contiguous ranges of order-0 pages
  kho: kho_restore_vmalloc: fix initialization of pages array
  MAINTAINERS: TPM DEVICE DRIVER: update the W-tag
  init: replace simple_strtoul with kstrtoul to improve lpj_setup
  KHO: fix boot failure due to kmemleak access to non-PRESENT pages
  Documentation/ABI: new kexec and kdump sysfs interface
  Documentation/ABI: mark old kexec sysfs deprecated
  kexec: move sysfs entries to /sys/kernel/kexec
  test_kho: always print restore status
  kho: free chunks using free_page() instead of kfree()
  selftests/liveupdate: add kexec test for multiple and empty sessions
  selftests/liveupdate: add simple kexec-based selftest for LUO
  selftests/liveupdate: add userspace API selftests
  docs: add documentation for memfd preservation via LUO
  mm: memfd_luo: allow preserving memfd
  liveupdate: luo_file: add private argument to store runtime state
  mm: shmem: export some functions to internal.h
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile     |   1
-rw-r--r--  mm/internal.h   |   6
-rw-r--r--  mm/memblock.c   |  93
-rw-r--r--  mm/memfd_luo.c  | 516
-rw-r--r--  mm/shmem.c      |  49
5 files changed, 604 insertions(+), 61 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 00ceb2418b64..2d0570a16e5b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
+obj-$(CONFIG_LIVEUPDATE) += memfd_luo.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
diff --git a/mm/internal.h b/mm/internal.h
index 89790def1bae..e430da900430 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1582,6 +1582,12 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid);
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
int priority);
+int shmem_add_to_page_cache(struct folio *folio,
+ struct address_space *mapping,
+ pgoff_t index, void *expected, gfp_t gfp);
+int shmem_inode_acct_blocks(struct inode *inode, long pages);
+bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped);
+
#ifdef CONFIG_SHRINKER_DEBUG
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
struct shrinker *shrinker, const char *fmt, va_list ap)
diff --git a/mm/memblock.c b/mm/memblock.c
index f0f2dc66e9a2..905d06b16348 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2445,60 +2445,59 @@ int reserve_mem_release_by_name(const char *name)
#define MEMBLOCK_KHO_FDT "memblock"
#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1"
#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1"
-static struct page *kho_fdt;
-static int reserve_mem_kho_finalize(struct kho_serialization *ser)
+static int __init reserved_mem_preserve(void)
{
- int err = 0, i;
+ unsigned int nr_preserved = 0;
+ int err;
- for (i = 0; i < reserved_mem_count; i++) {
+ for (unsigned int i = 0; i < reserved_mem_count; i++, nr_preserved++) {
struct reserve_mem_table *map = &reserved_mem_table[i];
struct page *page = phys_to_page(map->start);
unsigned int nr_pages = map->size >> PAGE_SHIFT;
- err |= kho_preserve_pages(page, nr_pages);
+ err = kho_preserve_pages(page, nr_pages);
+ if (err)
+ goto err_unpreserve;
}
- err |= kho_preserve_folio(page_folio(kho_fdt));
- err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt));
+ return 0;
- return notifier_from_errno(err);
-}
+err_unpreserve:
+ for (unsigned int i = 0; i < nr_preserved; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+ struct page *page = phys_to_page(map->start);
+ unsigned int nr_pages = map->size >> PAGE_SHIFT;
-static int reserve_mem_kho_notifier(struct notifier_block *self,
- unsigned long cmd, void *v)
-{
- switch (cmd) {
- case KEXEC_KHO_FINALIZE:
- return reserve_mem_kho_finalize((struct kho_serialization *)v);
- case KEXEC_KHO_ABORT:
- return NOTIFY_DONE;
- default:
- return NOTIFY_BAD;
+ kho_unpreserve_pages(page, nr_pages);
}
-}
-static struct notifier_block reserve_mem_kho_nb = {
- .notifier_call = reserve_mem_kho_notifier,
-};
+ return err;
+}
static int __init prepare_kho_fdt(void)
{
- int err = 0, i;
+ struct page *fdt_page;
void *fdt;
+ int err;
- kho_fdt = alloc_page(GFP_KERNEL);
- if (!kho_fdt)
- return -ENOMEM;
+ fdt_page = alloc_page(GFP_KERNEL);
+ if (!fdt_page) {
+ err = -ENOMEM;
+ goto err_report;
+ }
- fdt = page_to_virt(kho_fdt);
+ fdt = page_to_virt(fdt_page);
+ err = kho_preserve_pages(fdt_page, 1);
+ if (err)
+ goto err_free_fdt;
err |= fdt_create(fdt, PAGE_SIZE);
err |= fdt_finish_reservemap(fdt);
-
err |= fdt_begin_node(fdt, "");
err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
- for (i = 0; i < reserved_mem_count; i++) {
+
+ for (unsigned int i = 0; !err && i < reserved_mem_count; i++) {
struct reserve_mem_table *map = &reserved_mem_table[i];
err |= fdt_begin_node(fdt, map->name);
@@ -2508,14 +2507,29 @@ static int __init prepare_kho_fdt(void)
err |= fdt_end_node(fdt);
}
err |= fdt_end_node(fdt);
-
err |= fdt_finish(fdt);
- if (err) {
- pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
- put_page(kho_fdt);
- kho_fdt = NULL;
- }
+ if (err)
+ goto err_unpreserve_fdt;
+
+ err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt);
+ if (err)
+ goto err_unpreserve_fdt;
+
+ err = reserved_mem_preserve();
+ if (err)
+ goto err_remove_subtree;
+
+ return 0;
+
+err_remove_subtree:
+ kho_remove_subtree(fdt);
+err_unpreserve_fdt:
+ kho_unpreserve_pages(fdt_page, 1);
+err_free_fdt:
+ put_page(fdt_page);
+err_report:
+ pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
return err;
}
@@ -2530,13 +2544,6 @@ static int __init reserve_mem_init(void)
err = prepare_kho_fdt();
if (err)
return err;
-
- err = register_kho_notifier(&reserve_mem_kho_nb);
- if (err) {
- put_page(kho_fdt);
- kho_fdt = NULL;
- }
-
return err;
}
late_initcall(reserve_mem_init);
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
new file mode 100644
index 000000000000..4f6ba63b4310
--- /dev/null
+++ b/mm/memfd_luo.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+/**
+ * DOC: Memfd Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * Memory file descriptors (memfd) can be preserved over a kexec using the Live
+ * Update Orchestrator (LUO) file preservation. This allows userspace to
+ * transfer its memory contents to the next kernel after a kexec.
+ *
+ * The preservation is not intended to be transparent. Only select properties of
+ * the file are preserved. All others are reset to default. The preserved
+ * properties are described below.
+ *
+ * .. note::
+ * The LUO API is not stabilized yet, so the preserved properties of a memfd
+ * are also not stable and are subject to backwards incompatible changes.
+ *
+ * .. note::
+ * Currently a memfd backed by Hugetlb is not supported. Memfds created
+ * with ``MFD_HUGETLB`` will be rejected.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of the memfd are preserved across kexec:
+ *
+ * File Contents
+ * All data stored in the file is preserved.
+ *
+ * File Size
+ * The size of the file is preserved. Holes in the file are filled by
+ * allocating pages for them during preservation.
+ *
+ * File Position
+ * The current file position is preserved, allowing applications to continue
+ * reading/writing from their last position.
+ *
+ * File Status Flags
+ * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
+ * is maintained.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * All properties which are not preserved must be assumed to be reset to
+ * default. This section describes some of those properties which may be more of
+ * note.
+ *
+ * ``FD_CLOEXEC`` flag
+ * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
+ * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
+ * again after restore via ``fcntl()``.
+ *
+ * Seals
+ * File seals are not preserved. The file is unsealed on restore and if
+ * needed, must be sealed again via ``fcntl()``.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bits.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/memfd.h>
+#include <linux/liveupdate.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+
+static int memfd_luo_preserve_folios(struct file *file,
+ struct kho_vmalloc *kho_vmalloc,
+ struct memfd_luo_folio_ser **out_folios_ser,
+ u64 *nr_foliosp)
+{
+ struct inode *inode = file_inode(file);
+ struct memfd_luo_folio_ser *folios_ser;
+ unsigned int max_folios;
+ long i, size, nr_pinned;
+ struct folio **folios;
+ int err = -EINVAL;
+ pgoff_t offset;
+ u64 nr_folios;
+
+ size = i_size_read(inode);
+ /*
+ * If the file has zero size, then the folios and nr_folios properties
+ * are not set.
+ */
+ if (!size) {
+ *nr_foliosp = 0;
+ *out_folios_ser = NULL;
+ memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
+ return 0;
+ }
+
+ /*
+ * Guess the number of folios based on inode size. Real number might end
+ * up being smaller if there are higher order folios.
+ */
+ max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
+ folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
+ if (!folios)
+ return -ENOMEM;
+
+ /*
+ * Pin the folios so they don't move around behind our back. This also
+ * ensures none of the folios are in CMA -- which ensures they don't
+ * fall in KHO scratch memory. It also moves swapped out folios back to
+ * memory.
+ *
+ * A side effect of doing this is that it allocates a folio for all
+ * indices in the file. This might waste memory on sparse memfds. If
+ * that is really a problem in the future, we can have a
+ * memfd_pin_folios() variant that does not allocate a page on empty
+ * slots.
+ */
+ nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
+ &offset);
+ if (nr_pinned < 0) {
+ err = nr_pinned;
+ pr_err("failed to pin folios: %d\n", err);
+ goto err_free_folios;
+ }
+ nr_folios = nr_pinned;
+
+ folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
+ if (!folios_ser) {
+ err = -ENOMEM;
+ goto err_unpin;
+ }
+
+ for (i = 0; i < nr_folios; i++) {
+ struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ struct folio *folio = folios[i];
+ unsigned int flags = 0;
+
+ err = kho_preserve_folio(folio);
+ if (err)
+ goto err_unpreserve;
+
+ if (folio_test_dirty(folio))
+ flags |= MEMFD_LUO_FOLIO_DIRTY;
+ if (folio_test_uptodate(folio))
+ flags |= MEMFD_LUO_FOLIO_UPTODATE;
+
+ pfolio->pfn = folio_pfn(folio);
+ pfolio->flags = flags;
+ pfolio->index = folio->index;
+ }
+
+ err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
+ if (err)
+ goto err_unpreserve;
+
+ kvfree(folios);
+ *nr_foliosp = nr_folios;
+ *out_folios_ser = folios_ser;
+
+ /*
+ * Note: folios_ser is purposely not freed here. It is preserved
+ * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
+ * that is passed via private_data.
+ */
+ return 0;
+
+err_unpreserve:
+ for (i = i - 1; i >= 0; i--)
+ kho_unpreserve_folio(folios[i]);
+ vfree(folios_ser);
+err_unpin:
+ unpin_folios(folios, nr_folios);
+err_free_folios:
+ kvfree(folios);
+
+ return err;
+}
+
+static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
+ struct memfd_luo_folio_ser *folios_ser,
+ u64 nr_folios)
+{
+ long i;
+
+ if (!nr_folios)
+ return;
+
+ kho_unpreserve_vmalloc(kho_vmalloc);
+
+ for (i = 0; i < nr_folios; i++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ struct folio *folio;
+
+ if (!pfolio->pfn)
+ continue;
+
+ folio = pfn_folio(pfolio->pfn);
+
+ kho_unpreserve_folio(folio);
+ unpin_folio(folio);
+ }
+
+ vfree(folios_ser);
+}
+
+static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
+{
+ struct inode *inode = file_inode(args->file);
+ struct memfd_luo_folio_ser *folios_ser;
+ struct memfd_luo_ser *ser;
+ u64 nr_folios;
+ int err = 0;
+
+ inode_lock(inode);
+ shmem_freeze(inode, true);
+
+ /* Allocate the main serialization structure in preserved memory */
+ ser = kho_alloc_preserve(sizeof(*ser));
+ if (IS_ERR(ser)) {
+ err = PTR_ERR(ser);
+ goto err_unlock;
+ }
+
+ ser->pos = args->file->f_pos;
+ ser->size = i_size_read(inode);
+
+ err = memfd_luo_preserve_folios(args->file, &ser->folios,
+ &folios_ser, &nr_folios);
+ if (err)
+ goto err_free_ser;
+
+ ser->nr_folios = nr_folios;
+ inode_unlock(inode);
+
+ args->private_data = folios_ser;
+ args->serialized_data = virt_to_phys(ser);
+
+ return 0;
+
+err_free_ser:
+ kho_unpreserve_free(ser);
+err_unlock:
+ shmem_freeze(inode, false);
+ inode_unlock(inode);
+ return err;
+}
+
+static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
+{
+ struct memfd_luo_ser *ser;
+
+ if (WARN_ON_ONCE(!args->serialized_data))
+ return -EINVAL;
+
+ ser = phys_to_virt(args->serialized_data);
+
+ /*
+ * The pos might have changed since prepare. Everything else stays the
+ * same.
+ */
+ ser->pos = args->file->f_pos;
+
+ return 0;
+}
+
+static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+ struct inode *inode = file_inode(args->file);
+ struct memfd_luo_ser *ser;
+
+ if (WARN_ON_ONCE(!args->serialized_data))
+ return;
+
+ inode_lock(inode);
+ shmem_freeze(inode, false);
+
+ ser = phys_to_virt(args->serialized_data);
+
+ memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
+ ser->nr_folios);
+
+ kho_unpreserve_free(ser);
+ inode_unlock(inode);
+}
+
+static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
+ u64 nr_folios)
+{
+ u64 i;
+
+ for (i = 0; i < nr_folios; i++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ struct folio *folio;
+ phys_addr_t phys;
+
+ if (!pfolio->pfn)
+ continue;
+
+ phys = PFN_PHYS(pfolio->pfn);
+ folio = kho_restore_folio(phys);
+ if (!folio) {
+ pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
+ phys);
+ continue;
+ }
+
+ folio_put(folio);
+ }
+}
+
+static void memfd_luo_finish(struct liveupdate_file_op_args *args)
+{
+ struct memfd_luo_folio_ser *folios_ser;
+ struct memfd_luo_ser *ser;
+
+ if (args->retrieved)
+ return;
+
+ ser = phys_to_virt(args->serialized_data);
+ if (!ser)
+ return;
+
+ if (ser->nr_folios) {
+ folios_ser = kho_restore_vmalloc(&ser->folios);
+ if (!folios_ser)
+ goto out;
+
+ memfd_luo_discard_folios(folios_ser, ser->nr_folios);
+ vfree(folios_ser);
+ }
+
+out:
+ kho_restore_free(ser);
+}
+
+static int memfd_luo_retrieve_folios(struct file *file,
+ struct memfd_luo_folio_ser *folios_ser,
+ u64 nr_folios)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio;
+ int err = -EIO;
+ long i;
+
+ for (i = 0; i < nr_folios; i++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ phys_addr_t phys;
+ u64 index;
+ int flags;
+
+ if (!pfolio->pfn)
+ continue;
+
+ phys = PFN_PHYS(pfolio->pfn);
+ folio = kho_restore_folio(phys);
+ if (!folio) {
+ pr_err("Unable to restore folio at physical address: %llx\n",
+ phys);
+ goto put_folios;
+ }
+ index = pfolio->index;
+ flags = pfolio->flags;
+
+ /* Set up the folio for insertion. */
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
+ if (err) {
+ pr_err("shmem: failed to charge folio index %ld: %d\n",
+ i, err);
+ goto unlock_folio;
+ }
+
+ err = shmem_add_to_page_cache(folio, mapping, index, NULL,
+ mapping_gfp_mask(mapping));
+ if (err) {
+ pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
+ i, err);
+ goto unlock_folio;
+ }
+
+ if (flags & MEMFD_LUO_FOLIO_UPTODATE)
+ folio_mark_uptodate(folio);
+ if (flags & MEMFD_LUO_FOLIO_DIRTY)
+ folio_mark_dirty(folio);
+
+ err = shmem_inode_acct_blocks(inode, 1);
+ if (err) {
+ pr_err("shmem: failed to account folio index %ld: %d\n",
+ i, err);
+ goto unlock_folio;
+ }
+
+ shmem_recalc_inode(inode, 1, 0);
+ folio_add_lru(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return 0;
+
+unlock_folio:
+ folio_unlock(folio);
+ folio_put(folio);
+put_folios:
+ /*
+ * Note: don't free the folios already added to the file. They will be
+ * freed when the file is freed. Free the ones not added yet here.
+ */
+ for (long j = i + 1; j < nr_folios; j++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
+
+ folio = kho_restore_folio(pfolio->pfn);
+ if (folio)
+ folio_put(folio);
+ }
+
+ return err;
+}
+
+static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+ struct memfd_luo_folio_ser *folios_ser;
+ struct memfd_luo_ser *ser;
+ struct file *file;
+ int err;
+
+ ser = phys_to_virt(args->serialized_data);
+ if (!ser)
+ return -EINVAL;
+
+ file = shmem_file_setup("", 0, VM_NORESERVE);
+
+ if (IS_ERR(file)) {
+ pr_err("failed to setup file: %pe\n", file);
+ return PTR_ERR(file);
+ }
+
+ vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
+ file->f_inode->i_size = ser->size;
+
+ if (ser->nr_folios) {
+ folios_ser = kho_restore_vmalloc(&ser->folios);
+ if (!folios_ser) {
+ err = -EINVAL;
+ goto put_file;
+ }
+
+ err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
+ vfree(folios_ser);
+ if (err)
+ goto put_file;
+ }
+
+ args->file = file;
+ kho_restore_free(ser);
+
+ return 0;
+
+put_file:
+ fput(file);
+
+ return err;
+}
+
+static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
+ struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ return shmem_file(file) && !inode->i_nlink;
+}
+
+static const struct liveupdate_file_ops memfd_luo_file_ops = {
+ .freeze = memfd_luo_freeze,
+ .finish = memfd_luo_finish,
+ .retrieve = memfd_luo_retrieve,
+ .preserve = memfd_luo_preserve,
+ .unpreserve = memfd_luo_unpreserve,
+ .can_preserve = memfd_luo_can_preserve,
+ .owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler memfd_luo_handler = {
+ .ops = &memfd_luo_file_ops,
+ .compatible = MEMFD_LUO_FH_COMPATIBLE,
+};
+
+static int __init memfd_luo_init(void)
+{
+ int err = liveupdate_register_file_handler(&memfd_luo_handler);
+
+ if (err && err != -EOPNOTSUPP) {
+ pr_err("Could not register luo filesystem handler: %pe\n",
+ ERR_PTR(err));
+
+ return err;
+ }
+
+ return 0;
+}
+late_initcall(memfd_luo_init);
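[Editor's sketch] The "Non-Preserved Properties" section of the DOC comment in the new mm/memfd_luo.c above notes that FD_CLOEXEC and file seals are reset on restore and must be reapplied via fcntl(). A minimal userspace sketch of that follow-up step is below; how the restored descriptor is obtained from the live-update interface is not covered by this diff and is left as an assumption (restored_fd), while the fcntl() calls themselves are standard Linux APIs.

    #define _GNU_SOURCE
    #include <fcntl.h>

    /*
     * Reapply properties that memfd preservation via LUO does not carry
     * across kexec. 'restored_fd' is assumed to be the memfd handed back
     * by the live-update retrieve step.
     */
    static int reapply_memfd_props(int restored_fd)
    {
            /* MFD_CLOEXEC is not preserved: set FD_CLOEXEC again. */
            if (fcntl(restored_fd, F_SETFD, FD_CLOEXEC) == -1)
                    return -1;

            /* Seals are dropped on restore: reseal if the file was sealed. */
            if (fcntl(restored_fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) == -1)
                    return -1;

            return 0;
    }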
diff --git a/mm/shmem.c b/mm/shmem.c
index d578d8e765d7..3f194c9842a8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -174,20 +174,20 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
*/
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
- return (flags & VM_NORESERVE) ?
+ return (flags & SHMEM_F_NORESERVE) ?
0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
- if (!(flags & VM_NORESERVE))
+ if (!(flags & SHMEM_F_NORESERVE))
vm_unacct_memory(VM_ACCT(size));
}
static inline int shmem_reacct_size(unsigned long flags,
loff_t oldsize, loff_t newsize)
{
- if (!(flags & VM_NORESERVE)) {
+ if (!(flags & SHMEM_F_NORESERVE)) {
if (VM_ACCT(newsize) > VM_ACCT(oldsize))
return security_vm_enough_memory_mm(current->mm,
VM_ACCT(newsize) - VM_ACCT(oldsize));
@@ -205,7 +205,7 @@ static inline int shmem_reacct_size(unsigned long flags,
*/
static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
- if (!(flags & VM_NORESERVE))
+ if (!(flags & SHMEM_F_NORESERVE))
return 0;
return security_vm_enough_memory_mm(current->mm,
@@ -214,11 +214,11 @@ static inline int shmem_acct_blocks(unsigned long flags, long pages)
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
- if (flags & VM_NORESERVE)
+ if (flags & SHMEM_F_NORESERVE)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
-static int shmem_inode_acct_blocks(struct inode *inode, long pages)
+int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -434,7 +434,7 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
*
* Return: true if swapped was incremented from 0, for shmem_writeout().
*/
-static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
+bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
bool first_swapped = false;
@@ -878,9 +878,9 @@ static void shmem_update_stats(struct folio *folio, int nr_pages)
/*
* Somewhat like filemap_add_folio, but error if expected item has gone.
*/
-static int shmem_add_to_page_cache(struct folio *folio,
- struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp)
+int shmem_add_to_page_cache(struct folio *folio,
+ struct address_space *mapping,
+ pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
unsigned long nr = folio_nr_pages(folio);
@@ -1314,6 +1314,8 @@ static int shmem_setattr(struct mnt_idmap *idmap,
return -EPERM;
if (newsize != oldsize) {
+ if (info->flags & SHMEM_F_MAPPING_FROZEN)
+ return -EPERM;
error = shmem_reacct_size(SHMEM_I(inode)->flags,
oldsize, newsize);
if (error)
@@ -1568,7 +1570,7 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
int nr_pages;
bool split = false;
- if ((info->flags & VM_LOCKED) || sbinfo->noswap)
+ if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -2926,15 +2928,15 @@ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
* ipc_lock_object() when called from shmctl_do_lock(),
* no serialization needed when called from shm_destroy().
*/
- if (lock && !(info->flags & VM_LOCKED)) {
+ if (lock && !(info->flags & SHMEM_F_LOCKED)) {
if (!user_shm_lock(inode->i_size, ucounts))
goto out_nomem;
- info->flags |= VM_LOCKED;
+ info->flags |= SHMEM_F_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
- if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+ if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) {
user_shm_unlock(inode->i_size, ucounts);
- info->flags &= ~VM_LOCKED;
+ info->flags &= ~SHMEM_F_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
retval = 0;
@@ -3079,7 +3081,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
spin_lock_init(&info->lock);
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
- info->flags = flags & VM_NORESERVE;
+ info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
info->i_crtime = inode_get_mtime(inode);
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
@@ -3306,6 +3308,10 @@ shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
return -EPERM;
}
+ if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) &&
+ pos + len > inode->i_size))
+ return -EPERM;
+
ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
if (ret)
return ret;
@@ -3679,6 +3685,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
inode_lock(inode);
+ if (info->flags & SHMEM_F_MAPPING_FROZEN) {
+ error = -EPERM;
+ goto out;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
struct address_space *mapping = file->f_mapping;
loff_t unmap_start = round_up(offset, PAGE_SIZE);
@@ -5799,8 +5810,10 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
/* common code */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
- loff_t size, unsigned long flags, unsigned int i_flags)
+ loff_t size, unsigned long vm_flags,
+ unsigned int i_flags)
{
+ unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
struct inode *inode;
struct file *res;
@@ -5817,7 +5830,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
return ERR_PTR(-ENOMEM);
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
- S_IFREG | S_IRWXUGO, 0, flags);
+ S_IFREG | S_IRWXUGO, 0, vm_flags);
if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
return ERR_CAST(inode);