41 files changed, 707 insertions, 252 deletions
diff --git a/.clang-format b/.clang-format index 2ceca764122f..c7060124a47a 100644 --- a/.clang-format +++ b/.clang-format @@ -748,6 +748,7 @@ ForEachMacros: - 'ynl_attr_for_each_nested' - 'ynl_attr_for_each_payload' - 'zorro_for_each_dev' + - 'zpci_bus_for_each' IncludeBlocks: Preserve IncludeCategories: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 74dd29816f36..b6eb7a465ad2 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1004,7 +1004,7 @@ static void __maybe_unused build_bhb_mitigation(struct jit_ctx *ctx) arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) return; - if (capable(CAP_SYS_ADMIN)) + if (ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) return; if (supports_clearbhb(SCOPE_SYSTEM)) { diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index 2cd28af50dd4..3d64a2251699 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -15,6 +15,7 @@ struct ipl_pl_hdr { #define IPL_PL_FLAG_IPLPS 0x80 #define IPL_PL_FLAG_SIPL 0x40 #define IPL_PL_FLAG_IPLSR 0x20 +#define IPL_PL_FLAG_SBP 0x10 /* IPL Parameter Block header */ struct ipl_pb_hdr { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 961a3d60a4dd..dcdc7e274848 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -262,6 +262,24 @@ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) +#define DEFINE_IPL_ATTR_BOOTPROG_RW(_prefix, _name, _fmt_out, _fmt_in, _hdr, _value) \ + IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + unsigned long long value; \ + if (sscanf(buf, _fmt_in, &value) != 1) \ + return -EINVAL; \ + (_value) = value; \ + (_hdr).flags &= ~IPL_PL_FLAG_SBP; \ + return len; \ +} \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, 0644, \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) + #define DEFINE_IPL_ATTR_STR_RW(_prefix, _name, _fmt_out, _fmt_in, _value)\ IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, _value) \ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ @@ -818,12 +836,13 @@ DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", reipl_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", reipl_block_fcp->fcp.lun); -DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", - reipl_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", reipl_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", reipl_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", + reipl_block_fcp->hdr, + reipl_block_fcp->fcp.bootprog); static void reipl_get_ascii_loadparm(char *loadparm, struct ipl_parameter_block *ibp) @@ -942,10 +961,11 @@ DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n", reipl_block_nvme->nvme.fid); DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n", reipl_block_nvme->nvme.nsid); -DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", - reipl_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", reipl_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", + reipl_block_nvme->hdr, + 
reipl_block_nvme->nvme.bootprog); static struct attribute *reipl_nvme_attrs[] = { &sys_reipl_nvme_fid_attr.attr, @@ -1038,8 +1058,9 @@ static const struct bin_attribute *const reipl_eckd_bin_attrs[] = { }; DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); -DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", - reipl_block_eckd->eckd.bootprog); +DEFINE_IPL_ATTR_BOOTPROG_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", + reipl_block_eckd->hdr, + reipl_block_eckd->eckd.bootprog); static struct attribute *reipl_eckd_attrs[] = { &sys_reipl_eckd_device_attr.attr, @@ -1567,12 +1588,13 @@ DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", dump_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", dump_block_fcp->fcp.lun); -DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", - dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", dump_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", dump_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", + dump_block_fcp->hdr, + dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_SCP_DATA_RW(dump_fcp, dump_block_fcp->hdr, dump_block_fcp->fcp, @@ -1604,10 +1626,11 @@ DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n", dump_block_nvme->nvme.fid); DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n", dump_block_nvme->nvme.nsid); -DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", - dump_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", dump_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", + dump_block_nvme->hdr, + dump_block_nvme->nvme.bootprog); DEFINE_IPL_ATTR_SCP_DATA_RW(dump_nvme, dump_block_nvme->hdr, dump_block_nvme->nvme, @@ -1635,8 +1658,9 @@ static const struct attribute_group dump_nvme_attr_group = { /* ECKD dump device attributes */ DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); -DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", - dump_block_eckd->eckd.bootprog); +DEFINE_IPL_ATTR_BOOTPROG_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->hdr, + dump_block_eckd->eckd.bootprog); IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 3aae7f70e6ab..18520d333058 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -104,7 +104,6 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo struct stack_frame_vdso_wrapper __user *sf_vdso; struct stack_frame_user __user *sf; unsigned long ip, sp; - bool first = true; if (!current->mm) return; @@ -133,24 +132,11 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo if (__get_user(ip, &sf->gprs[8])) break; } - /* Sanity check: ABI requires SP to be 8 byte aligned. */ - if (sp & 0x7) + /* Validate SP and RA (ABI requires SP to be 8 byte aligned). */ + if (sp & 0x7 || ip_invalid(ip)) break; - if (ip_invalid(ip)) { - /* - * If the instruction address is invalid, and this - * is the first stack frame, assume r14 has not - * been written to the stack yet. Otherwise exit. 
- */ - if (!first) - break; - ip = regs->gprs[14]; - if (ip_invalid(ip)) - break; - } if (!store_ip(consume_entry, cookie, entry, perf, ip)) break; - first = false; } pagefault_enable(); } diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 5a6ace9d875a..57f3980b98a9 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -961,6 +961,7 @@ void zpci_device_reserved(struct zpci_dev *zdev) } void zpci_release_device(struct kref *kref) + __releases(&zpci_list_lock) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); @@ -1148,6 +1149,7 @@ static void zpci_add_devices(struct list_head *scan_list) int zpci_scan_devices(void) { + struct zpci_bus *zbus; LIST_HEAD(scan_list); int rc; @@ -1156,7 +1158,10 @@ int zpci_scan_devices(void) return rc; zpci_add_devices(&scan_list); - zpci_bus_scan_busses(); + zpci_bus_for_each(zbus) { + zpci_bus_scan_bus(zbus); + cond_resched(); + } return 0; } diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 66c4bd888b29..42a13e451f64 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -153,23 +153,6 @@ int zpci_bus_scan_bus(struct zpci_bus *zbus) return ret; } -/* zpci_bus_scan_busses - Scan all registered busses - * - * Scan all available zbusses - * - */ -void zpci_bus_scan_busses(void) -{ - struct zpci_bus *zbus = NULL; - - mutex_lock(&zbus_list_lock); - list_for_each_entry(zbus, &zbus_list, bus_next) { - zpci_bus_scan_bus(zbus); - cond_resched(); - } - mutex_unlock(&zbus_list_lock); -} - static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev) { return !s390_pci_no_rid && zdev->rid_available && @@ -222,10 +205,29 @@ out_free_domain: return -ENOMEM; } -static void zpci_bus_release(struct kref *kref) +/** + * zpci_bus_release - Un-initialize resources associated with the zbus and + * free memory + * @kref: refcount * that is part of struct zpci_bus + * + * MUST be called with `zbus_list_lock` held, but the lock is released during + * run of the function. + */ +static inline void zpci_bus_release(struct kref *kref) + __releases(&zbus_list_lock) { struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); + lockdep_assert_held(&zbus_list_lock); + + list_del(&zbus->bus_next); + mutex_unlock(&zbus_list_lock); + + /* + * At this point no-one should see this object, or be able to get a new + * reference to it. 
+ */ + if (zbus->bus) { pci_lock_rescan_remove(); pci_stop_root_bus(zbus->bus); @@ -237,16 +239,19 @@ static void zpci_bus_release(struct kref *kref) pci_unlock_rescan_remove(); } - mutex_lock(&zbus_list_lock); - list_del(&zbus->bus_next); - mutex_unlock(&zbus_list_lock); zpci_remove_parent_msi_domain(zbus); kfree(zbus); } -static void zpci_bus_put(struct zpci_bus *zbus) +static inline void __zpci_bus_get(struct zpci_bus *zbus) +{ + lockdep_assert_held(&zbus_list_lock); + kref_get(&zbus->kref); +} + +static inline void zpci_bus_put(struct zpci_bus *zbus) { - kref_put(&zbus->kref, zpci_bus_release); + kref_put_mutex(&zbus->kref, zpci_bus_release, &zbus_list_lock); } static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) @@ -258,7 +263,7 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) if (!zbus->multifunction) continue; if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) { - kref_get(&zbus->kref); + __zpci_bus_get(zbus); goto out_unlock; } } @@ -268,6 +273,44 @@ out_unlock: return zbus; } +/** + * zpci_bus_get_next - get the next zbus object from given position in the list + * @pos: current position/cursor in the global zbus list + * + * Acquires and releases references as the cursor iterates (might also free/ + * release the cursor). Is tolerant of concurrent operations on the list. + * + * To begin the iteration, set *@pos to %NULL before calling the function. + * + * *@pos is set to %NULL in cases where either the list is empty, or *@pos is + * the last element in the list. + * + * Context: Process context. May sleep. + */ +void zpci_bus_get_next(struct zpci_bus **pos) +{ + struct zpci_bus *curp = *pos, *next = NULL; + + mutex_lock(&zbus_list_lock); + if (curp) + next = list_next_entry(curp, bus_next); + else + next = list_first_entry(&zbus_list, typeof(*curp), bus_next); + + if (list_entry_is_head(next, &zbus_list, bus_next)) + next = NULL; + + if (next) + __zpci_bus_get(next); + + *pos = next; + mutex_unlock(&zbus_list_lock); + + /* zpci_bus_put() might drop refcount to 0 and locks zbus_list_lock */ + if (curp) + zpci_bus_put(curp); +} + static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) { struct zpci_bus *zbus; @@ -279,9 +322,6 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) zbus->topo = topo; zbus->topo_is_tid = topo_is_tid; INIT_LIST_HEAD(&zbus->bus_next); - mutex_lock(&zbus_list_lock); - list_add_tail(&zbus->bus_next, &zbus_list); - mutex_unlock(&zbus_list_lock); kref_init(&zbus->kref); INIT_LIST_HEAD(&zbus->resources); @@ -291,6 +331,10 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) zbus->bus_resource.flags = IORESOURCE_BUS; pci_add_resource(&zbus->resources, &zbus->bus_resource); + mutex_lock(&zbus_list_lock); + list_add_tail(&zbus->bus_next, &zbus_list); + mutex_unlock(&zbus_list_lock); + return zbus; } diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h index ae3d7a9159bd..e440742e3145 100644 --- a/arch/s390/pci/pci_bus.h +++ b/arch/s390/pci/pci_bus.h @@ -15,7 +15,20 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); void zpci_bus_device_unregister(struct zpci_dev *zdev); int zpci_bus_scan_bus(struct zpci_bus *zbus); -void zpci_bus_scan_busses(void); +void zpci_bus_get_next(struct zpci_bus **pos); + +/** + * zpci_bus_for_each - iterate over all the registered zbus objects + * @pos: a struct zpci_bus * as cursor + * + * Acquires and releases references as the cursor iterates over the registered + * objects. 
Is tolerant against concurrent removals of objects. + * + * Context: Process context. May sleep. + */ +#define zpci_bus_for_each(pos) \ + for ((pos) = NULL, zpci_bus_get_next(&(pos)); (pos) != NULL; \ + zpci_bus_get_next(&(pos))) int zpci_bus_scan_device(struct zpci_dev *zdev); void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 977ee75e047c..f610fde2d5c4 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -2,6 +2,7 @@ #include <linux/objtool.h> #include <linux/module.h> #include <linux/sort.h> +#include <linux/bpf.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #include <asm/unwind.h> @@ -172,6 +173,25 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip) } #endif +/* Fake frame pointer entry -- used as a fallback for generated code */ +static struct orc_entry orc_fp_entry = { + .type = ORC_TYPE_CALL, + .sp_reg = ORC_REG_BP, + .sp_offset = 16, + .bp_reg = ORC_REG_PREV_SP, + .bp_offset = -16, +}; + +static struct orc_entry *orc_bpf_find(unsigned long ip) +{ +#ifdef CONFIG_BPF_JIT + if (bpf_has_frame_pointer(ip)) + return &orc_fp_entry; +#endif + + return NULL; +} + /* * If we crash with IP==0, the last successfully executed instruction * was probably an indirect function call with a NULL function pointer, @@ -186,15 +206,6 @@ static struct orc_entry null_orc_entry = { .type = ORC_TYPE_CALL }; -/* Fake frame pointer entry -- used as a fallback for generated code */ -static struct orc_entry orc_fp_entry = { - .type = ORC_TYPE_CALL, - .sp_reg = ORC_REG_BP, - .sp_offset = 16, - .bp_reg = ORC_REG_PREV_SP, - .bp_offset = -16, -}; - static struct orc_entry *orc_find(unsigned long ip) { static struct orc_entry *orc; @@ -238,6 +249,11 @@ static struct orc_entry *orc_find(unsigned long ip) if (orc) return orc; + /* BPF lookup: */ + orc = orc_bpf_find(ip); + if (orc) + return orc; + return orc_ftrace_find(ip); } @@ -495,9 +511,8 @@ bool unwind_next_frame(struct unwind_state *state) if (!orc) { /* * As a fallback, try to assume this code uses a frame pointer. - * This is useful for generated code, like BPF, which ORC - * doesn't know about. This is just a guess, so the rest of - * the unwind is no longer considered reliable. + * This is just a guess, so the rest of the unwind is no longer + * considered reliable. */ orc = &orc_fp_entry; state->error = true; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b69dc7194e2c..b0bac2a66eff 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1678,6 +1678,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image emit_prologue(&prog, image, stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); + + bpf_prog->aux->ksym.fp_start = prog - temp; + /* Exception callback will clobber callee regs for its own use, and * restore the original callee regs from main prog's stack frame. 
*/ @@ -2736,6 +2739,8 @@ emit_jmp: pop_r12(&prog); } EMIT1(0xC9); /* leave */ + bpf_prog->aux->ksym.fp_end = prog - temp; + emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; @@ -3325,6 +3330,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + if (im) + im->ksym.fp_start = prog - (u8 *)rw_image; + if (!is_imm8(stack_size)) { /* sub rsp, stack_size */ EMIT3_off32(0x48, 0x81, 0xEC, stack_size); @@ -3462,7 +3470,11 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off); + EMIT1(0xC9); /* leave */ + if (im) + im->ksym.fp_end = prog - (u8 *)rw_image; + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip our return address and return to parent */ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7a501e73d880..1abc7ed2990e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2019,13 +2019,14 @@ out: else btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space, true); - extent_changeset_free(data_reserved); out_noreserve: if (only_release_metadata) btrfs_check_nocow_unlock(inode); sb_end_pagefault(inode->vfs_inode.i_sb); + extent_changeset_free(data_reserved); + if (ret < 0) return vmf_error(ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4bee47829ed..317db7d10a21 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -256,6 +256,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off if (ret < 0) { btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", logical, ret); + btrfs_release_path(&path); return; } eb = path.nodes[0]; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e2b53e90dcb..d9d8d9968a58 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1243,14 +1243,7 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); - - /* - * At this point we either failed at allocating prealloc, or we - * succeeded and passed the ownership to it to add_qgroup_rb(). In any - * case, this needs to be NULL or there is something wrong. - */ - ASSERT(prealloc == NULL); - + kfree(prealloc); return ret; } @@ -1682,12 +1675,7 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); - /* - * At this point we either failed at allocating prealloc, or we - * succeeded and passed the ownership to it to add_qgroup_rb(). In any - * case, this needs to be NULL or there is something wrong. - */ - ASSERT(prealloc == NULL); + kfree(prealloc); return ret; } @@ -3279,7 +3267,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; - struct btrfs_qgroup *prealloc = NULL; + struct btrfs_qgroup *prealloc; struct btrfs_qgroup_list **qlist_prealloc = NULL; bool free_inherit = false; bool need_rescan = false; @@ -3520,14 +3508,7 @@ out: } if (free_inherit) kfree(inherit); - - /* - * At this point we either failed at allocating prealloc, or we - * succeeded and passed the ownership to it to add_qgroup_rb(). In any - * case, this needs to be NULL or there is something wrong. 
- */ - ASSERT(prealloc == NULL); - + kfree(prealloc); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 05cfda8af422..e9124605974b 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -187,7 +187,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { test_err("couldn't find backref %d", ret); - btrfs_free_path(path); return ret; } btrfs_del_item(&trans, root, path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fff37c8d96a4..31edc93a383e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5865,14 +5865,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_inode *curr_inode = start_inode; int ret = 0; - /* - * If we are logging a new name, as part of a link or rename operation, - * don't bother logging new dentries, as we just want to log the names - * of an inode and that any new parents exist. - */ - if (ctx->logging_new_name) - return 0; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6051,6 +6043,33 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, return ret; } +static bool can_log_conflicting_inode(const struct btrfs_trans_handle *trans, + const struct btrfs_inode *inode) +{ + if (!S_ISDIR(inode->vfs_inode.i_mode)) + return true; + + if (inode->last_unlink_trans < trans->transid) + return true; + + /* + * If this is a directory and its unlink_trans is not from a past + * transaction then we must fallback to a transaction commit in order + * to avoid getting a directory with 2 hard links after log replay. + * + * This happens if a directory A is renamed, moved from one parent + * directory to another one, a new file is created in the old parent + * directory with the old name of our directory A, the new file is + * fsynced, then we moved the new file to some other parent directory + * and fsync again the new file. This results in a log tree where we + * logged that directory A existed, with the INODE_REF item for the + * new location but without having logged its old parent inode, so + * that on log replay we add a new link for the new location but the + * old link remains, resulting in a link count of 2. 
+ */ + return false; +} + static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -6154,6 +6173,11 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, return 0; } + if (!can_log_conflicting_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); + return BTRFS_LOG_FORCE_COMMIT; + } + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); @@ -6218,6 +6242,12 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, break; } + if (!can_log_conflicting_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); + ret = BTRFS_LOG_FORCE_COMMIT; + break; + } + /* * Always log the directory, we cannot make this * conditional on need_log_inode() because the directory diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ae1742a35e76..13c514684cfb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7128,6 +7128,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices->seeding = true; fs_devices->opened = 1; + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); return fs_devices; } diff --git a/fs/file_attr.c b/fs/file_attr.c index 4c4916632f11..13cdb31a3e94 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -2,6 +2,7 @@ #include <linux/fs.h> #include <linux/security.h> #include <linux/fscrypt.h> +#include <linux/fsnotify.h> #include <linux/fileattr.h> #include <linux/export.h> #include <linux/syscalls.h> @@ -298,6 +299,7 @@ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, err = inode->i_op->fileattr_set(idmap, dentry, fa); if (err) goto out; + fsnotify_xattr(dentry); } out: diff --git a/fs/libfs.c b/fs/libfs.c index 9264523be85c..591eb649ebba 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -346,22 +346,22 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) * User space expects the directory offset value of the replaced * (new) directory entry to be unchanged after a rename. * - * Returns zero on success, a negative errno value on failure. + * Caller must have grabbed a slot for new_dentry in the maple_tree + * associated with new_dir, even if dentry is negative. 
*/ -int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long new_offset = dentry2offset(new_dentry); - simple_offset_remove(old_ctx, old_dentry); + if (WARN_ON(!new_offset)) + return; - if (new_offset) { - offset_set(new_dentry, 0); - return simple_offset_replace(new_ctx, old_dentry, new_offset); - } - return simple_offset_add(new_ctx, old_dentry); + simple_offset_remove(old_ctx, old_dentry); + offset_set(new_dentry, 0); + WARN_ON(simple_offset_replace(new_ctx, old_dentry, new_offset)); } /** @@ -388,31 +388,23 @@ int simple_offset_rename_exchange(struct inode *old_dir, long new_index = dentry2offset(new_dentry); int ret; - simple_offset_remove(old_ctx, old_dentry); - simple_offset_remove(new_ctx, new_dentry); + if (WARN_ON(!old_index || !new_index)) + return -EINVAL; - ret = simple_offset_replace(new_ctx, old_dentry, new_index); - if (ret) - goto out_restore; + ret = mtree_store(&new_ctx->mt, new_index, old_dentry, GFP_KERNEL); + if (WARN_ON(ret)) + return ret; - ret = simple_offset_replace(old_ctx, new_dentry, old_index); - if (ret) { - simple_offset_remove(new_ctx, old_dentry); - goto out_restore; + ret = mtree_store(&old_ctx->mt, old_index, new_dentry, GFP_KERNEL); + if (WARN_ON(ret)) { + mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); + return ret; } - ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - simple_offset_remove(new_ctx, old_dentry); - simple_offset_remove(old_ctx, new_dentry); - goto out_restore; - } + offset_set(old_dentry, new_index); + offset_set(new_dentry, old_index); + simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); return 0; - -out_restore: - (void)simple_offset_replace(old_ctx, old_dentry, old_index); - (void)simple_offset_replace(new_ctx, new_dentry, new_index); - return ret; } /** diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index d27ff5e5f165..71bd44e5ab6d 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -270,8 +270,15 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. + * The parent interest in ACCESS/MODIFY events does not apply to special + * files, where read/write are not on the filesystem of the parent and + * events can provide an undesirable side-channel for information + * exfiltration. 
*/ - parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; + parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS && + !(data_type == FSNOTIFY_EVENT_PATH && + d_is_special(dentry) && + (mask & (FS_ACCESS | FS_MODIFY))); if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 384b19177e1c..ee4c2726771a 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -133,6 +133,14 @@ struct smbdirect_socket { struct smbdirect_socket_parameters parameters; /* + * The state for connect/negotiation + */ + struct { + spinlock_t lock; + struct work_struct work; + } connect; + + /* * The state for keepalive and timeout handling */ struct { @@ -353,6 +361,10 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work); disable_work_sync(&sc->disconnect_work); + spin_lock_init(&sc->connect.lock); + INIT_WORK(&sc->connect.work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->connect.work); + INIT_WORK(&sc->idle.immediate_work, __smbdirect_socket_disabled_work); disable_work_sync(&sc->idle.immediate_work); INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 1c181ef99929..7d880ff34402 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -325,8 +325,10 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, sess = ksmbd_session_lookup(conn, id); if (!sess && conn->binding) sess = ksmbd_session_lookup_slowpath(id); - if (sess && sess->state != SMB2_SESSION_VALID) + if (sess && sess->state != SMB2_SESSION_VALID) { + ksmbd_user_session_put(sess); sess = NULL; + } return sess; } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 27f87a13f20a..8aa483800014 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2363,7 +2363,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, int rc = 0; unsigned int next = 0; - if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + + if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + 1 + le16_to_cpu(eabuf->EaValueLength)) return -EINVAL; @@ -2440,7 +2440,7 @@ next: break; } - if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + + if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + 1 + le16_to_cpu(eabuf->EaValueLength)) { rc = -EINVAL; break; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 5aa7a66334d9..05598d994a68 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1307,9 +1307,6 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, granted |= le32_to_cpu(ace->access_req); ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size)); } - - if (!pdacl->num_aces) - granted = GENERIC_ALL_FLAGS; } if (!uid) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4e7ab8d9314f..f585359684d4 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -242,6 +242,7 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work) * disable[_delayed]_work_sync() */ disable_work(&sc->disconnect_work); + disable_work(&sc->connect.work); 
disable_work(&sc->recv_io.posted.refill_work); disable_delayed_work(&sc->idle.timer_work); disable_work(&sc->idle.immediate_work); @@ -297,6 +298,7 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) * not queued again but here we don't block and avoid * disable[_delayed]_work_sync() */ + disable_work(&sc->connect.work); disable_work(&sc->recv_io.posted.refill_work); disable_work(&sc->idle.immediate_work); disable_delayed_work(&sc->idle.timer_work); @@ -467,6 +469,7 @@ static void free_transport(struct smb_direct_transport *t) */ smb_direct_disconnect_wake_up_all(sc); + disable_work_sync(&sc->connect.work); disable_work_sync(&sc->recv_io.posted.refill_work); disable_delayed_work_sync(&sc->idle.timer_work); disable_work_sync(&sc->idle.immediate_work); @@ -635,28 +638,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) switch (sc->recv_io.expected) { case SMBDIRECT_EXPECT_NEGOTIATE_REQ: - if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { - put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(sc); - return; - } - sc->recv_io.reassembly.full_packet_received = true; - /* - * Some drivers (at least mlx5_ib) might post a - * recv completion before RDMA_CM_EVENT_ESTABLISHED, - * we need to adjust our expectation in that case. - */ - if (!sc->first_error && sc->status == SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING) - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; - if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) { - put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(sc); - return; - } - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; - enqueue_reassembly(sc, recvmsg, 0); - wake_up(&sc->status_wait); - return; + /* see smb_direct_negotiate_recv_done */ + break; case SMBDIRECT_EXPECT_DATA_TRANSFER: { struct smbdirect_data_transfer *data_transfer = (struct smbdirect_data_transfer *)recvmsg->packet; @@ -742,6 +725,126 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(sc); } +static void smb_direct_negotiate_recv_work(struct work_struct *work); + +static void smb_direct_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbdirect_recv_io *recv_io = + container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); + struct smbdirect_socket *sc = recv_io->socket; + unsigned long flags; + + /* + * reset the common recv_done for later reuse. + */ + recv_io->cqe.done = recv_done; + + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { + put_recvmsg(sc, recv_io); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("Negotiate Recv error. status='%s (%d)' opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, + wc->opcode); + smb_direct_disconnect_rdma_connection(sc); + } + return; + } + + ksmbd_debug(RDMA, "Negotiate Recv completed. status='%s (%d)', opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, + wc->opcode); + + ib_dma_sync_single_for_cpu(sc->ib.dev, + recv_io->sge.addr, + recv_io->sge.length, + DMA_FROM_DEVICE); + + /* + * This is an internal error! + */ + if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) { + put_recvmsg(sc, recv_io); + smb_direct_disconnect_rdma_connection(sc); + return; + } + + /* + * Don't reset timer to the keepalive interval in + * this will be done in smb_direct_negotiate_recv_work. + */ + + /* + * Only remember the recv_io if it has enough bytes, + * this gives smb_direct_negotiate_recv_work enough + * information in order to disconnect if it was not + * valid. 
+ */ + sc->recv_io.reassembly.full_packet_received = true; + if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req)) + enqueue_reassembly(sc, recv_io, 0); + else + put_recvmsg(sc, recv_io); + + /* + * Some drivers (at least mlx5_ib and irdma in roce mode) + * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED, + * we need to adjust our expectation in that case. + * + * So we defer further processing of the negotiation + * to smb_direct_negotiate_recv_work(). + * + * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED + * we queue the work directly otherwise + * smb_direct_cm_handler() will do it, when + * RDMA_CM_EVENT_ESTABLISHED arrived. + */ + spin_lock_irqsave(&sc->connect.lock, flags); + if (!sc->first_error) { + INIT_WORK(&sc->connect.work, smb_direct_negotiate_recv_work); + if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) + queue_work(sc->workqueue, &sc->connect.work); + } + spin_unlock_irqrestore(&sc->connect.lock, flags); +} + +static void smb_direct_negotiate_recv_work(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, connect.work); + const struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_recv_io *recv_io; + + if (sc->first_error) + return; + + ksmbd_debug(RDMA, "Negotiate Recv Work running\n"); + + /* + * Reset timer to the keepalive interval in + * order to trigger our next keepalive message. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + + /* + * If smb_direct_negotiate_recv_done() detected an + * invalid request we want to disconnect. + */ + recv_io = get_first_reassembly(sc); + if (!recv_io) { + smb_direct_disconnect_rdma_connection(sc); + return; + } + + if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) { + smb_direct_disconnect_rdma_connection(sc); + return; + } + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; + wake_up(&sc->status_wait); +} + static int smb_direct_post_recv(struct smbdirect_socket *sc, struct smbdirect_recv_io *recvmsg) { @@ -758,7 +861,6 @@ static int smb_direct_post_recv(struct smbdirect_socket *sc, return ret; recvmsg->sge.length = sp->max_recv_size; recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey; - recvmsg->cqe.done = recv_done; wr.wr_cqe = &recvmsg->cqe; wr.next = NULL; @@ -1732,6 +1834,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct smbdirect_socket *sc = cm_id->context; + unsigned long flags; ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n", cm_id, rdma_event_msg(event->event), event->event); @@ -1739,18 +1842,27 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: { /* - * Some drivers (at least mlx5_ib) might post a - * recv completion before RDMA_CM_EVENT_ESTABLISHED, + * Some drivers (at least mlx5_ib and irdma in roce mode) + * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED, * we need to adjust our expectation in that case. * - * As we already started the negotiation, we just - * ignore RDMA_CM_EVENT_ESTABLISHED here. + * If smb_direct_negotiate_recv_done was called first + * it initialized sc->connect.work only for us to + * start, so that we turned into + * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, before + * smb_direct_negotiate_recv_work() runs. + * + * If smb_direct_negotiate_recv_done didn't happen + * yet. 
sc->connect.work is still be disabled and + * queue_work() is a no-op. */ - if (!sc->first_error && sc->status > SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING) - break; if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)) break; sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; + spin_lock_irqsave(&sc->connect.lock, flags); + if (!sc->first_error) + queue_work(sc->workqueue, &sc->connect.work); + spin_unlock_irqrestore(&sc->connect.lock, flags); wake_up(&sc->status_wait); break; } @@ -1921,6 +2033,7 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) recvmsg = get_free_recvmsg(sc); if (!recvmsg) return -ENOMEM; + recvmsg->cqe.done = smb_direct_negotiate_recv_done; ret = smb_direct_post_recv(sc, recvmsg); if (ret) { @@ -2339,6 +2452,7 @@ respond: static int smb_direct_connect(struct smbdirect_socket *sc) { + struct smbdirect_recv_io *recv_io; int ret; ret = smb_direct_init_params(sc); @@ -2353,6 +2467,9 @@ static int smb_direct_connect(struct smbdirect_socket *sc) return ret; } + list_for_each_entry(recv_io, &sc->recv_io.free.list, list) + recv_io->cqe.done = recv_done; + ret = smb_direct_create_qpair(sc); if (ret) { pr_err("Can't accept RDMA client: %d\n", ret); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 98b0eb966d91..f891344bd76b 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -702,7 +702,7 @@ retry: rd.old_parent = NULL; rd.new_parent = new_path.dentry; rd.flags = flags; - rd.delegated_inode = NULL, + rd.delegated_inode = NULL; err = start_renaming_dentry(&rd, lookup_flags, old_child, &new_last); if (err) goto out_drop_write; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..e5be698256d1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1283,6 +1283,8 @@ struct bpf_ksym { struct list_head lnode; struct latch_tree_node tnode; bool prog; + u32 fp_start; + u32 fp_end; }; enum bpf_tramp_prog_type { @@ -1511,6 +1513,7 @@ void bpf_image_ksym_add(struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); +bool bpf_has_frame_pointer(unsigned long ip); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..f5c9cf28c4dc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3247,7 +3247,7 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); -int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, +void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c8ae6ab31651..1b9b18e5b03c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -760,6 +760,22 @@ struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) NULL; } +bool bpf_has_frame_pointer(unsigned long ip) +{ + struct bpf_ksym *ksym; + unsigned long offset; + + guard(rcu)(); + + ksym = bpf_ksym_find(ip); + if (!ksym || !ksym->fp_start || !ksym->fp_end) + return false; + + offset = ip - ksym->start; + + return offset >= ksym->fp_start && offset < ksym->fp_end; +} + const struct 
exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c index 4dd7ef7c145c..cd500248abd9 100644 --- a/kernel/bpf/dmabuf_iter.c +++ b/kernel/bpf/dmabuf_iter.c @@ -6,10 +6,33 @@ #include <linux/kernel.h> #include <linux/seq_file.h> +struct dmabuf_iter_priv { + /* + * If this pointer is non-NULL, the buffer's refcount is elevated to + * prevent destruction between stop/start. If reading is not resumed and + * start is never called again, then dmabuf_iter_seq_fini drops the + * reference when the iterator is released. + */ + struct dma_buf *dmabuf; +}; + static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos) { - if (*pos) - return NULL; + struct dmabuf_iter_priv *p = seq->private; + + if (*pos) { + struct dma_buf *dmabuf = p->dmabuf; + + if (!dmabuf) + return NULL; + + /* + * Always resume from where we stopped, regardless of the value + * of pos. + */ + p->dmabuf = NULL; + return dmabuf; + } return dma_buf_iter_begin(); } @@ -54,8 +77,11 @@ static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v) { struct dma_buf *dmabuf = v; - if (dmabuf) - dma_buf_put(dmabuf); + if (dmabuf) { + struct dmabuf_iter_priv *p = seq->private; + + p->dmabuf = dmabuf; + } } static const struct seq_operations dmabuf_iter_seq_ops = { @@ -71,11 +97,27 @@ static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux, seq_puts(seq, "dmabuf iter\n"); } +static int dmabuf_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv; + + p->dmabuf = NULL; + return 0; +} + +static void dmabuf_iter_seq_fini(void *priv) +{ + struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv; + + if (p->dmabuf) + dma_buf_put(p->dmabuf); +} + static const struct bpf_iter_seq_info dmabuf_iter_seq_info = { .seq_ops = &dmabuf_iter_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = 0, + .init_seq_private = dmabuf_iter_seq_init, + .fini_seq_private = dmabuf_iter_seq_fini, + .seq_priv_size = sizeof(struct dmabuf_iter_priv), }; static struct bpf_iter_reg bpf_dmabuf_reg_info = { diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a198e40c799b..150e5871e66f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -71,7 +71,6 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; - struct css_rstat_cpu __percpu *rstatc_pcpu; struct llist_node *self; /* @@ -104,18 +103,22 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) /* * This function can be renentered by irqs and nmis for the same cgroup * and may try to insert the same per-cpu lnode into the llist. Note - * that llist_add() does not protect against such scenarios. + * that llist_add() does not protect against such scenarios. In addition + * this same per-cpu lnode can be modified through init_llist_node() + * from css_rstat_flush() running on a different CPU. * * To protect against such stacked contexts of irqs/nmis, we use the * fact that lnode points to itself when not on a list and then use - * this_cpu_cmpxchg() to atomically set to NULL to select the winner + * try_cmpxchg() to atomically set to NULL to select the winner * which will call llist_add(). The losers can assume the insertion is * successful and the winner will eventually add the per-cpu lnode to * the llist. 
+ * + * Please note that we can not use this_cpu_cmpxchg() here as on some + * archs it is not safe against modifications from multiple CPUs. */ self = &rstatc->lnode; - rstatc_pcpu = css->rstat_cpu; - if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) + if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL)) return; lhead = ss_lhead_cpu(css->ss, cpu); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 05f5a49e9649..94164f2dec6d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -41,6 +41,13 @@ static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); +/* + * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass + * depth on enable failure. Will be removed when bypass depth is moved into the + * sched instance. + */ +static bool scx_bypassed_for_enable; + static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); @@ -975,6 +982,30 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } +static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + /* + * If @rq is in balance, the CPU is already vacant and looking for the + * next task to run. No need to preempt or trigger resched after moving + * @p into its local DSQ. + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) + resched_curr(rq); +} + static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { @@ -1086,22 +1117,10 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - if (is_local) { - struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; - - if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && - rq->curr->sched_class == &ext_sched_class) { - rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, - rq->curr->sched_class)) - resched_curr(rq); - } else { + if (is_local) + local_dsq_post_enq(dsq, p, enq_flags); + else raw_spin_unlock(&dsq->lock); - } } static void task_unlink_from_dsq(struct task_struct *p, @@ -1625,6 +1644,8 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_mod_nr(dst_dsq, 1); p->scx.dsq = dst_dsq; + + local_dsq_post_enq(dst_dsq, p, enq_flags); } /** @@ -2402,7 +2423,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * ops.enqueue() that @p is the only one available for this cpu, * which should trigger an explicit follow-up scheduling event. 
*/ - if (sched_class_above(&ext_sched_class, next->sched_class)) { + if (next && sched_class_above(&ext_sched_class, next->sched_class)) { WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); } else { @@ -2425,7 +2446,7 @@ static struct task_struct * do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; - bool keep_prev, kick_idle = false; + bool keep_prev; struct task_struct *p; /* see kick_cpus_irq_workfn() */ @@ -2467,12 +2488,8 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); - if (!p) { - if (kick_idle) - scx_kick_cpu(rcu_dereference_sched(scx_root), - cpu_of(rq), SCX_KICK_IDLE); + if (!p) return NULL; - } if (unlikely(!p->scx.slice)) { struct scx_sched *sch = rcu_dereference_sched(scx_root); @@ -3575,7 +3592,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; irq_work_sync(&sch->error_irq_work); - kthread_stop(sch->helper->task); + kthread_destroy_worker(sch->helper); free_percpu(sch->pcpu); @@ -4318,6 +4335,11 @@ static void scx_disable_workfn(struct kthread_work *work) scx_dsp_max_batch = 0; free_kick_syncs(); + if (scx_bypassed_for_enable) { + scx_bypassed_for_enable = false; + scx_bypass(false); + } + mutex_unlock(&scx_enable_mutex); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); @@ -4786,7 +4808,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) return sch; err_stop_helper: - kthread_stop(sch->helper->task); + kthread_destroy_worker(sch->helper); err_free_pcpu: free_percpu(sch->pcpu); err_free_gdsqs: @@ -4970,6 +4992,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) * Init in bypass mode to guarantee forward progress. */ scx_bypass(true); + scx_bypassed_for_enable = true; for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) @@ -5067,6 +5090,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); + scx_bypassed_for_enable = false; scx_bypass(false); if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d57727abaade..fe28d86f7c35 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -965,7 +965,7 @@ static const struct bpf_func_proto bpf_d_path_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_d_path_btf_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .allowed = bpf_d_path_allowed, }; diff --git a/mm/shmem.c b/mm/shmem.c index b329b5302c48..ec6c01378e9d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4019,22 +4019,10 @@ static int shmem_whiteout(struct mnt_idmap *idmap, whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); if (!whiteout) return -ENOMEM; - error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); - if (error) - return error; - - /* - * Cheat and hash the whiteout while the old dentry is still in - * place, instead of playing games with FS_RENAME_DOES_D_MOVE. - * - * d_lookup() will consistently find one of them at this point, - * not sure which one, but that isn't even important. 
- */ - d_rehash(whiteout); - return 0; + return error; } /* @@ -4050,6 +4038,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + bool had_offset = false; int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) @@ -4062,16 +4051,23 @@ static int shmem_rename2(struct mnt_idmap *idmap, if (!simple_empty(new_dentry)) return -ENOTEMPTY; + error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry); + if (error == -EBUSY) + had_offset = true; + else if (unlikely(error)) + return error; + if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); - if (error) + if (error) { + if (!had_offset) + simple_offset_remove(shmem_get_offset_ctx(new_dir), + new_dentry); return error; + } } - error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - return error; - + simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 325addf83cc6..277ef504bc26 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -22,10 +22,10 @@ config SMC_DIAG config SMC_HS_CTRL_BPF bool "Generic eBPF hook for SMC handshake flow" - depends on SMC && BPF_SYSCALL + depends on SMC && BPF_JIT && BPF_SYSCALL default y help SMC_HS_CTRL_BPF enables support to register generic eBPF hook for SMC handshake flow, which offer much greater flexibility in modifying the behavior of the SMC protocol stack compared to a complete kernel-based approach. Select - this option if you want filtring the handshake process via eBPF programs.
\ No newline at end of file + this option if you want filtring the handshake process via eBPF programs. diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 586d1b2595d1..5442073a2e42 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -224,6 +224,8 @@ endif $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP) $(QUIET_CLANG)$(CLANG) \ + -Wno-microsoft-anon-tag \ + -fms-extensions \ -I$(or $(OUTPUT),.) \ -I$(srctree)/tools/include/uapi/ \ -I$(LIBBPF_BOOTSTRAP_INCLUDE) \ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3dc8a8078815..f4dfd23148a5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8484,7 +8484,7 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; - char *res; + const char *res; res = strstr(sym_name, ".llvm."); if (sym_type == 'd' && res) @@ -11818,7 +11818,8 @@ static int avail_kallsyms_cb(unsigned long long sym_addr, char sym_type, * * [0] fb6a421fb615 ("kallsyms: Match symbols exactly with CONFIG_LTO_CLANG") */ - char sym_trim[256], *psym_trim = sym_trim, *sym_sfx; + char sym_trim[256], *psym_trim = sym_trim; + const char *sym_sfx; if (!(sym_sfx = strstr(sym_name, ".llvm."))) return 0; @@ -12401,7 +12402,7 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) if (!search_paths[i]) continue; for (s = search_paths[i]; s != NULL; s = strchr(s, ':')) { - char *next_path; + const char *next_path; int seg_len; if (s[0] == ':') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b7030a6e2e76..4aa60e83ff19 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -437,6 +437,8 @@ BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(abspath $(OUTPUT)/../usr/include) \ -std=gnu11 \ -fno-strict-aliasing \ + -Wno-microsoft-anon-tag \ + -fms-extensions \ -Wno-compare-distinct-pointer-types \ -Wno-initializer-overrides \ # diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index ccc768592e66..1a2a2f1abf03 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -38,6 +38,14 @@ static int set_pathname(int fd, pid_t pid) return readlink(buf, src.paths[src.cnt++], MAX_PATH_LEN); } +static inline long syscall_close(int fd) +{ + return syscall(__NR_close_range, + (unsigned int)fd, + (unsigned int)fd, + 0u); +} + static int trigger_fstat_events(pid_t pid) { int sockfd = -1, procfd = -1, devfd = -1; @@ -104,18 +112,34 @@ out_close: /* sys_close no longer triggers filp_close, but we can * call sys_close_range instead which still does */ -#define close(fd) syscall(__NR_close_range, fd, fd, 0) + syscall_close(pipefd[0]); + syscall_close(pipefd[1]); + syscall_close(sockfd); + syscall_close(procfd); + syscall_close(devfd); + syscall_close(localfd); + syscall_close(indicatorfd); + return ret; +} - close(pipefd[0]); - close(pipefd[1]); - close(sockfd); - close(procfd); - close(devfd); - close(localfd); - close(indicatorfd); +static void attach_and_load(struct test_d_path **skel) +{ + int err; -#undef close - return ret; + *skel = test_d_path__open_and_load(); + if (CHECK(!*skel, "setup", "d_path skeleton failed\n")) + goto cleanup; + + err = test_d_path__attach(*skel); + if (CHECK(err, "setup", "attach failed: %d\n", err)) + goto cleanup; + + 
(*skel)->bss->my_pid = getpid(); + return; + +cleanup: + test_d_path__destroy(*skel); + *skel = NULL; } static void test_d_path_basic(void) @@ -124,16 +148,11 @@ static void test_d_path_basic(void) struct test_d_path *skel; int err; - skel = test_d_path__open_and_load(); - if (CHECK(!skel, "setup", "d_path skeleton failed\n")) - goto cleanup; - - err = test_d_path__attach(skel); - if (CHECK(err, "setup", "attach failed: %d\n", err)) + attach_and_load(&skel); + if (!skel) goto cleanup; bss = skel->bss; - bss->my_pid = getpid(); err = trigger_fstat_events(bss->my_pid); if (err < 0) @@ -195,6 +214,39 @@ static void test_d_path_check_types(void) test_d_path_check_types__destroy(skel); } +/* Check if the verifier correctly generates code for + * accessing the memory modified by d_path helper. + */ +static void test_d_path_mem_access(void) +{ + int localfd = -1; + char path_template[] = "/dev/shm/d_path_loadgen.XXXXXX"; + struct test_d_path__bss *bss; + struct test_d_path *skel; + + attach_and_load(&skel); + if (!skel) + goto cleanup; + + bss = skel->bss; + + localfd = mkstemp(path_template); + if (CHECK(localfd < 0, "trigger", "mkstemp failed\n")) + goto cleanup; + + if (CHECK(fallocate(localfd, 0, 0, 1024) < 0, "trigger", "fallocate failed\n")) + goto cleanup; + remove(path_template); + + if (CHECK(!bss->path_match_fallocate, "check", + "failed to read fallocate path")) + goto cleanup; + +cleanup: + syscall_close(localfd); + test_d_path__destroy(skel); +} + void test_d_path(void) { if (test__start_subtest("basic")) @@ -205,4 +257,7 @@ void test_d_path(void) if (test__start_subtest("check_alloc_mem")) test_d_path_check_types(); + + if (test__start_subtest("check_mem_access")) + test_d_path_mem_access(); } diff --git a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c index 6c2b0c3dbcd8..e442be9dde7e 100644 --- a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c @@ -73,12 +73,10 @@ close_memfd: return -1; } -static int create_sys_heap_dmabuf(void) +static int create_sys_heap_dmabuf(size_t bytes) { - sysheap_test_buffer_size = 20 * getpagesize(); - struct dma_heap_allocation_data data = { - .len = sysheap_test_buffer_size, + .len = bytes, .fd = 0, .fd_flags = O_RDWR | O_CLOEXEC, .heap_flags = 0, @@ -110,7 +108,9 @@ close_sysheap_dmabuf: static int create_test_buffers(void) { udmabuf = create_udmabuf(); - sysheap_dmabuf = create_sys_heap_dmabuf(); + + sysheap_test_buffer_size = 20 * getpagesize(); + sysheap_dmabuf = create_sys_heap_dmabuf(sysheap_test_buffer_size); if (udmabuf < 0 || sysheap_dmabuf < 0) return -1; @@ -219,6 +219,26 @@ close_iter_fd: close(iter_fd); } +static void subtest_dmabuf_iter_check_lots_of_buffers(struct dmabuf_iter *skel) +{ + int iter_fd; + char buf[1024]; + size_t total_bytes_read = 0; + ssize_t bytes_read; + + iter_fd = bpf_iter_create(bpf_link__fd(skel->links.dmabuf_collector)); + if (!ASSERT_OK_FD(iter_fd, "iter_create")) + return; + + while ((bytes_read = read(iter_fd, buf, sizeof(buf))) > 0) + total_bytes_read += bytes_read; + + ASSERT_GT(total_bytes_read, getpagesize(), "total_bytes_read"); + + close(iter_fd); +} + + static void subtest_dmabuf_iter_check_open_coded(struct dmabuf_iter *skel, int map_fd) { LIBBPF_OPTS(bpf_test_run_opts, topts); @@ -275,6 +295,23 @@ void test_dmabuf_iter(void) subtest_dmabuf_iter_check_no_infinite_reads(skel); if (test__start_subtest("default_iter")) subtest_dmabuf_iter_check_default_iter(skel); + if 
(test__start_subtest("lots_of_buffers")) { + size_t NUM_BUFS = 100; + int buffers[NUM_BUFS]; + int i; + + for (i = 0; i < NUM_BUFS; ++i) { + buffers[i] = create_sys_heap_dmabuf(getpagesize()); + if (!ASSERT_OK_FD(buffers[i], "dmabuf_fd")) + goto cleanup_bufs; + } + + subtest_dmabuf_iter_check_lots_of_buffers(skel); + +cleanup_bufs: + for (--i; i >= 0; --i) + close(buffers[i]); + } if (test__start_subtest("open_coded")) subtest_dmabuf_iter_check_open_coded(skel, map_fd); diff --git a/tools/testing/selftests/bpf/progs/test_d_path.c b/tools/testing/selftests/bpf/progs/test_d_path.c index 84e1f883f97b..561b2f861808 100644 --- a/tools/testing/selftests/bpf/progs/test_d_path.c +++ b/tools/testing/selftests/bpf/progs/test_d_path.c @@ -17,6 +17,7 @@ int rets_close[MAX_FILES] = {}; int called_stat = 0; int called_close = 0; +int path_match_fallocate = 0; SEC("fentry/security_inode_getattr") int BPF_PROG(prog_stat, struct path *path, struct kstat *stat, @@ -62,4 +63,26 @@ int BPF_PROG(prog_close, struct file *file, void *id) return 0; } +SEC("fentry/vfs_fallocate") +int BPF_PROG(prog_fallocate, struct file *file, int mode, loff_t offset, loff_t len) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + int ret = 0; + char path_fallocate[MAX_PATH_LEN] = {}; + + if (pid != my_pid) + return 0; + + ret = bpf_d_path(&file->f_path, + path_fallocate, MAX_PATH_LEN); + if (ret < 0) + return 0; + + if (!path_fallocate[0]) + return 0; + + path_match_fallocate = 1; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c index aa2d7d32dda9..5748d2c69903 100644 --- a/tools/testing/selftests/sched_ext/runner.c +++ b/tools/testing/selftests/sched_ext/runner.c @@ -46,6 +46,14 @@ static void print_test_preamble(const struct scx_test *test, bool quiet) if (!quiet) printf("DESCRIPTION: %s\n", test->description); printf("OUTPUT:\n"); + + /* + * The tests may fork with the preamble buffered + * in the children's stdout. Flush before the test + * to avoid printing the message multiple times. + */ + fflush(stdout); + fflush(stderr); } static const char *status_to_result(enum scx_test_status status)
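
The zpci_bus hunks above replace a lock-the-whole-list scan with a reference-counted cursor (zpci_bus_get_next()/zpci_bus_for_each()) plus kref_put_mutex(), so the final put can unlink the bus under zbus_list_lock without the scan body ever running with the lock held. As a minimal sketch of that pattern -- a userspace translation using pthreads and a plain refcount, with illustrative names that are not the kernel's -- the cursor function takes the lock only to step and pin the next element, and drops the previous element's reference outside the lock, where the last put re-acquires the lock to unlink:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int refcount;	/* protected by list_lock; stands in for kref + kref_put_mutex() */
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *list_head;

/* Drop one reference; on the last put, unlink under the lock, free outside it. */
static void node_put(struct node *n)
{
	struct node **pp;

	pthread_mutex_lock(&list_lock);
	if (--n->refcount > 0) {
		pthread_mutex_unlock(&list_lock);
		return;
	}
	for (pp = &list_head; *pp && *pp != n; pp = &(*pp)->next)
		;
	if (*pp)
		*pp = n->next;
	pthread_mutex_unlock(&list_lock);
	free(n);
}

/* Advance the cursor: start with *pos == NULL; ends with *pos == NULL. */
static void node_get_next(struct node **pos)
{
	struct node *cur = *pos, *next;

	pthread_mutex_lock(&list_lock);
	next = cur ? cur->next : list_head;	/* cur's reference keeps it linked */
	if (next)
		next->refcount++;		/* pin for the caller */
	pthread_mutex_unlock(&list_lock);

	*pos = next;
	if (cur)
		node_put(cur);	/* may be the last put; takes list_lock itself */
}

#define for_each_node(pos) \
	for ((pos) = NULL, node_get_next(&(pos)); (pos); node_get_next(&(pos)))

int main(void)
{
	struct node *pos;
	int i;

	for (i = 0; i < 3; i++) {
		struct node *n = calloc(1, sizeof(*n));

		n->id = i;
		n->refcount = 1;	/* the list's own reference */
		n->next = list_head;
		list_head = n;
	}
	for_each_node(pos)
		printf("visiting node %d\n", pos->id);
	return 0;
}

Because an element is only unlinked when its refcount reaches zero, a held cursor reference both keeps the element alive and keeps its next pointer valid; that is what makes the cond_resched() between scan steps in zpci_scan_devices() safe against concurrent removals.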

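Two other patterns in this series are worth spelling out. The cgroup rstat hunk switches css_rstat_updated() from this_cpu_cmpxchg() to try_cmpxchg() because the per-cpu lnode can now be rewritten from another CPU via init_llist_node(), and, per the patch's own comment, this_cpu_cmpxchg() is not safe against cross-CPU modification on some architectures. The trick is that an off-list llist node points to itself, and whichever context swaps that self-pointer to NULL wins the right to do the llist_add(). A hedged C11 sketch of just that election (types simplified, names not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

struct lnode {
	_Atomic(struct lnode *) next;	/* points to itself when off-list */
};

/*
 * Returns true for exactly one claiming context, even when reentered from
 * irq/nmi on the same CPU or raced from another CPU: only the context that
 * swaps the self-pointer to NULL performs the insertion; the losers may
 * assume the winner will complete it.
 */
static bool lnode_claim(struct lnode *n)
{
	struct lnode *self = n;

	return atomic_compare_exchange_strong(&n->next, &self, NULL);
}

The BPF/ORC hunks record, in each ksym, the image-relative byte offsets between which the JITed code has a valid frame pointer (fp_start is stored right after the prologue's push rbp / mov rbp, rsp; fp_end right after the epilogue's leave), and bpf_has_frame_pointer() then gates the unwinder's RBP-based fallback on a simple range check. Restated as a standalone helper (struct name illustrative, logic mirroring the hunk):

#include <stdbool.h>

struct sym_range {
	unsigned long start;	/* symbol's base address */
	unsigned int fp_start;	/* offset where the frame pointer is set up */
	unsigned int fp_end;	/* offset where it is torn down; 0 if unrecorded */
};

static bool has_frame_pointer(const struct sym_range *sym, unsigned long ip)
{
	unsigned long offset;

	if (!sym || !sym->fp_start || !sym->fp_end)
		return false;

	offset = ip - sym->start;
	return offset >= sym->fp_start && offset < sym->fp_end;
}

With fp_start/fp_end left at zero (a symbol whose frame-pointer window was never recorded), the helper refuses the fallback, which is exactly why the diff moves orc_fp_entry ahead of orc_bpf_find() and marks the generic RBP guess as unreliable.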