summaryrefslogtreecommitdiff
path: root/net/sctp/socket.c
AgeCommit message (Collapse)Author
4 dayssctp: Fetch inet6_sk() after setting ->pinet6 in sctp_clone_sock().Kuniyuki Iwashima
syzbot reported the lockdep splat below. [0] sctp_clone_sock() sets the child socket's ipv6_mc_list to NULL, but somehow sock_release() in an error path finally acquires lock_sock() in ipv6_sock_mc_close(). The root cause is that sctp_clone_sock() fetches inet6_sk(newsk) before setting newinet->pinet6, meaning that the parent's ipv6_mc_list was actually cleared. Also, sctp_v6_copy_ip_options() uses inet6_sk() but is called before newinet->pinet6 is set. Let's use inet6_sk() only after setting newinet->pinet6. [0]: WARNING: possible recursive locking detected syzkaller #0 Not tainted syz.0.17/5996 is trying to acquire lock: ffff888031af4c60 (sk_lock-AF_INET6){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1700 [inline] ffff888031af4c60 (sk_lock-AF_INET6){+.+.}-{0:0}, at: ipv6_sock_mc_close+0xd3/0x140 net/ipv6/mcast.c:348 but task is already holding lock: ffff888031af4320 (sk_lock-AF_INET6){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1700 [inline] ffff888031af4320 (sk_lock-AF_INET6){+.+.}-{0:0}, at: sctp_getsockopt+0x135/0xb60 net/sctp/socket.c:8131 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(sk_lock-AF_INET6); lock(sk_lock-AF_INET6); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by syz.0.17/5996: #0: ffff888031af4320 (sk_lock-AF_INET6){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1700 [inline] #0: ffff888031af4320 (sk_lock-AF_INET6){+.+.}-{0:0}, at: sctp_getsockopt+0x135/0xb60 net/sctp/socket.c:8131 stack backtrace: CPU: 0 UID: 0 PID: 5996 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025 Call Trace: <TASK> dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_deadlock_bug+0x279/0x290 kernel/locking/lockdep.c:3041 check_deadlock kernel/locking/lockdep.c:3093 [inline] validate_chain kernel/locking/lockdep.c:3895 [inline] __lock_acquire+0x2540/0x2cf0 kernel/locking/lockdep.c:5237 lock_acquire+0x117/0x340 kernel/locking/lockdep.c:5868 lock_sock_nested+0x48/0x100 net/core/sock.c:3780 lock_sock include/net/sock.h:1700 [inline] ipv6_sock_mc_close+0xd3/0x140 net/ipv6/mcast.c:348 inet6_release+0x47/0x70 net/ipv6/af_inet6.c:482 __sock_release net/socket.c:653 [inline] sock_release+0x85/0x150 net/socket.c:681 sctp_getsockopt_peeloff_common+0x56b/0x770 net/sctp/socket.c:5732 sctp_getsockopt_peeloff_flags+0x13b/0x230 net/sctp/socket.c:5801 sctp_getsockopt+0x3ab/0xb60 net/sctp/socket.c:8151 do_sock_getsockopt+0x2b4/0x3d0 net/socket.c:2399 __sys_getsockopt net/socket.c:2428 [inline] __do_sys_getsockopt net/socket.c:2435 [inline] __se_sys_getsockopt net/socket.c:2432 [inline] __x64_sys_getsockopt+0x1a5/0x250 net/socket.c:2432 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f8f8c38f749 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffcfdade018 EFLAGS: 00000246 ORIG_RAX: 0000000000000037 RAX: ffffffffffffffda RBX: 00007f8f8c5e5fa0 RCX: 00007f8f8c38f749 RDX: 000000000000007a RSI: 0000000000000084 RDI: 0000000000000003 RBP: 00007f8f8c413f91 R08: 0000200000000040 R09: 0000000000000000 R10: 0000200000000340 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f8f8c5e5fa0 R14: 00007f8f8c5e5fa0 R15: 0000000000000005 </TASK> Fixes: 16942cf4d3e31 ("sctp: Use sk_clone() in sctp_accept().") Reported-by: syzbot+c59e6bb54e7620495725@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6936d112.a70a0220.38f243.00a7.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Link: https://patch.msgid.link/20251210081206.1141086-2-kuniyu@google.com Acked-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-10sctp: Don't inherit do_auto_asconf in sctp_clone_sock().Kuniyuki Iwashima
syzbot reported list_del(&sp->auto_asconf_list) corruption in sctp_destroy_sock(). The repro calls setsockopt(SCTP_AUTO_ASCONF, 1) to a SCTP listener, calls accept(), and close()s the child socket. setsockopt(SCTP_AUTO_ASCONF, 1) sets sp->do_auto_asconf to 1 and links sp->auto_asconf_list to a per-netns list. Both fields are placed after sp->pd_lobby in struct sctp_sock, and sctp_copy_descendant() did not copy the fields before the cited commit. Also, sctp_clone_sock() did not set them explicitly. In addition, sctp_auto_asconf_init() is called from sctp_sock_migrate(), but it initialises the fields only conditionally. The two fields relied on __GFP_ZERO added in sk_alloc(), but sk_clone() does not use it. Let's clear newsp->do_auto_asconf in sctp_clone_sock(). [0]: list_del corruption. prev->next should be ffff8880799e9148, but was ffff8880799e8808. (prev=ffff88803347d9f8) kernel BUG at lib/list_debug.c:64! Oops: invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 0 UID: 0 PID: 6008 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025 RIP: 0010:__list_del_entry_valid_or_report+0x15a/0x190 lib/list_debug.c:62 Code: e8 7b 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 7c ee 92 fd 49 8b 17 48 c7 c7 80 0a bf 8b 48 89 de 4c 89 f9 e8 07 c6 94 fc 90 <0f> 0b 4c 89 f7 e8 4c 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 4d RSP: 0018:ffffc90003067ad8 EFLAGS: 00010246 RAX: 000000000000006d RBX: ffff8880799e9148 RCX: b056988859ee6e00 RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffc90003067807 R09: 1ffff9200060cf00 R10: dffffc0000000000 R11: fffff5200060cf01 R12: 1ffff1100668fb3f R13: dffffc0000000000 R14: ffff88803347d9f8 R15: ffff88803347d9f8 FS: 00005555823e5500(0000) GS:ffff88812613e000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000000480 CR3: 00000000741ce000 CR4: 00000000003526f0 Call Trace: <TASK> __list_del_entry_valid include/linux/list.h:132 [inline] __list_del_entry include/linux/list.h:223 [inline] list_del include/linux/list.h:237 [inline] sctp_destroy_sock+0xb4/0x370 net/sctp/socket.c:5163 sk_common_release+0x75/0x310 net/core/sock.c:3961 sctp_close+0x77e/0x900 net/sctp/socket.c:1550 inet_release+0x144/0x190 net/ipv4/af_inet.c:437 __sock_release net/socket.c:662 [inline] sock_close+0xc3/0x240 net/socket.c:1455 __fput+0x44c/0xa70 fs/file_table.c:468 task_work_run+0x1d4/0x260 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xe9/0x130 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 16942cf4d3e3 ("sctp: Use sk_clone() in sctp_accept().") Reported-by: syzbot+ba535cb417f106327741@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/690d2185.a70a0220.22f260.000e.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20251106223418.1455510-1-kuniyu@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04net: Convert proto callbacks from sockaddr to sockaddr_unsizedKees Cook
Convert struct proto pre_connect(), connect(), bind(), and bind_add() callback function prototypes from struct sockaddr to struct sockaddr_unsized. This does not change per-implementation use of sockaddr for passing around an arbitrarily sized sockaddr struct. Those will be addressed in future patches. Additionally removes the no longer referenced struct sockaddr from include/net/inet_common.h. No binary changes expected. Signed-off-by: Kees Cook <kees@kernel.org> Link: https://patch.msgid.link/20251104002617.2752303-5-kees@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04net: Convert proto_ops connect() callbacks to use sockaddr_unsizedKees Cook
Update all struct proto_ops connect() callback function prototypes from "struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the compiler about object sizes. Calls into struct proto handlers gain casts that will be removed in the struct proto conversion patch. No binary changes expected. Signed-off-by: Kees Cook <kees@kernel.org> Link: https://patch.msgid.link/20251104002617.2752303-3-kees@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27sctp: Remove sctp_copy_sock() and sctp_copy_descendant().Kuniyuki Iwashima
Now, sctp_accept() and sctp_do_peeloff() use sk_clone(), and we no longer need sctp_copy_sock() and sctp_copy_descendant(). Let's remove them. Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20251023231751.4168390-9-kuniyu@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27sctp: Use sctp_clone_sock() in sctp_do_peeloff().Kuniyuki Iwashima
sctp_do_peeloff() calls sock_create() to allocate and initialise struct sock, inet_sock, and sctp_sock, but later sctp_copy_sock() and sctp_sock_migrate() overwrite most fields. What sctp_do_peeloff() does is more like accept(). Let's use sock_create_lite() and sctp_clone_sock(). Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20251023231751.4168390-8-kuniyu@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27sctp: Use sk_clone() in sctp_accept().Kuniyuki Iwashima
sctp_accept() calls sctp_v[46]_create_accept_sk() to allocate a new socket and calls sctp_sock_migrate() to copy fields from the parent socket to the new socket. sctp_v4_create_accept_sk() allocates sk by sk_alloc(), initialises it by sock_init_data(), and copy a bunch of fields from the parent socekt by sctp_copy_sock(). sctp_sock_migrate() calls sctp_copy_descendant() to copy most fields in sctp_sock from the parent socket by memcpy(). These can be simply replaced by sk_clone(). Let's consolidate sctp_v[46]_create_accept_sk() to sctp_clone_sock() with sk_clone(). We will reuse sctp_clone_sock() for sctp_do_peeloff() and then remove sctp_copy_descendant(). Note that sock_reset_flag(newsk, SOCK_ZAPPED) is not copied to sctp_clone_sock() as sctp does not use SOCK_ZAPPED at all. Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20251023231751.4168390-6-kuniyu@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27sctp: Don't call sk->sk_prot->init() in sctp_v[46]_create_accept_sk().Kuniyuki Iwashima
sctp_accept() calls sctp_v[46]_create_accept_sk() to allocate a new socket and calls sctp_sock_migrate() to copy fields from the parent socket to the new socket. sctp_v[46]_create_accept_sk() calls sctp_init_sock() to initialise sctp_sock, but most fields are overwritten by sctp_copy_descendant() called from sctp_sock_migrate(). Things done in sctp_init_sock() but not in sctp_sock_migrate() are the following: 1. Copy sk->sk_gso 2. Copy sk->sk_destruct (sctp_v6_init_sock()) 3. Allocate sctp_sock.ep 4. Initialise sctp_sock.pd_lobby 5. Count sk_sockets_allocated_inc(), sock_prot_inuse_add(), and SCTP_DBG_OBJCNT_INC() Let's do these in sctp_copy_sock() and sctp_sock_migrate() and avoid calling sk->sk_prot->init() in sctp_v[46]_create_accept_sk(). Note that sk->sk_destruct is already copied in sctp_copy_sock(). Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20251023231751.4168390-4-kuniyu@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27sctp: Don't copy sk_sndbuf and sk_rcvbuf in sctp_sock_migrate().Kuniyuki Iwashima
sctp_sock_migrate() is called from 2 places. 1) sctp_accept() calls sp->pf->create_accept_sk() before sctp_sock_migrate(), and sp->pf->create_accept_sk() calls sctp_copy_sock(). 2) sctp_do_peeloff() also calls sctp_copy_sock() before sctp_sock_migrate(). sctp_copy_sock() copies sk_sndbuf and sk_rcvbuf from the parent socket. Let's not copy the two fields in sctp_sock_migrate(). Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20251023231751.4168390-3-kuniyu@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27sctp: Defer SCTP_DBG_OBJCNT_DEC() to sctp_destroy_sock().Kuniyuki Iwashima
SCTP_DBG_OBJCNT_INC() is called only when sctp_init_sock() returns 0 after successfully allocating sctp_sk(sk)->ep. OTOH, SCTP_DBG_OBJCNT_DEC() is called in sctp_close(). The code seems to expect that the socket is always exposed to userspace once SCTP_DBG_OBJCNT_INC() is incremented, but there is a path where the assumption is not true. In sctp_accept(), sctp_sock_migrate() could fail after sctp_init_sock(). Then, sk_common_release() does not call inet_release() nor sctp_close(). Instead, it calls sk->sk_prot->destroy(). Let's move SCTP_DBG_OBJCNT_DEC() from sctp_close() to sctp_destroy_sock(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20251023231751.4168390-2-kuniyu@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-08-19sctp: Convert cookie authentication to use HMAC-SHA256Eric Biggers
Convert SCTP cookies to use HMAC-SHA256, instead of the previous choice of the legacy algorithms HMAC-MD5 and HMAC-SHA1. Simplify and optimize the code by using the HMAC-SHA256 library instead of crypto_shash, and by preparing the HMAC key when it is generated instead of per-operation. This doesn't break compatibility, since the cookie format is an implementation detail, not part of the SCTP protocol itself. Note that the cookie size doesn't change either. The HMAC field was already 32 bytes, even though previously at most 20 bytes were actually compared. 32 bytes exactly fits an untruncated HMAC-SHA256 value. So, although we could safely truncate the MAC to something slightly shorter, for now just keep the cookie size the same. I also considered SipHash, but that would generate only 8-byte MACs. An 8-byte MAC *might* suffice here. However, there's quite a lot of information in the SCTP cookies: more than in TCP SYN cookies. So absent an analysis that occasional forgeries of all that information is okay in SCTP, I errored on the side of caution. Remove HMAC-MD5 and HMAC-SHA1 as options, since the new HMAC-SHA256 option is just better. It's faster as well as more secure. For example, benchmarking on x86_64, cookie authentication is now nearly 3x as fast as the previous default choice and implementation of HMAC-MD5. Also just make the kernel always support cookie authentication if SCTP is supported at all, rather than making it optional in the build. (It was sort of optional before, but it didn't really work properly. E.g., a kernel with CONFIG_SCTP_COOKIE_HMAC_MD5=n still supported HMAC-MD5 cookie authentication if CONFIG_CRYPTO_HMAC and CONFIG_CRYPTO_MD5 happened to be enabled in the kconfig for other reasons.) Acked-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Eric Biggers <ebiggers@kernel.org> Link: https://patch.msgid.link/20250818205426.30222-5-ebiggers@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-08-19sctp: Use HMAC-SHA1 and HMAC-SHA256 library for chunk authenticationEric Biggers
For SCTP chunk authentication, use the HMAC-SHA1 and HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. There's no longer any need to pre-allocate 'crypto_shash' objects; the SCTP code now simply calls into the HMAC code directly. As part of this, make SCTP always support both HMAC-SHA1 and HMAC-SHA256. Previously, it only guaranteed support for HMAC-SHA1. However, HMAC-SHA256 tended to be supported too anyway, as it was supported if CONFIG_CRYPTO_SHA256 was enabled elsewhere in the kconfig. Acked-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Eric Biggers <ebiggers@kernel.org> Link: https://patch.msgid.link/20250818205426.30222-4-ebiggers@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-06-23net: make sk->sk_rcvtimeo locklessEric Dumazet
Followup of commit 285975dd6742 ("net: annotate data-races around sk->sk_{rcv|snd}timeo"). Remove lock_sock()/release_sock() from ksmbd_tcp_rcv_timeout() and add READ_ONCE()/WRITE_ONCE() where it is needed. Also SO_RCVTIMEO_OLD and SO_RCVTIMEO_NEW can call sock_set_timeout() without holding the socket lock. Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://patch.msgid.link/20250620155536.335520-3-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-06-23net: make sk->sk_sndtimeo locklessEric Dumazet
Followup of commit 285975dd6742 ("net: annotate data-races around sk->sk_{rcv|snd}timeo"). Remove lock_sock()/release_sock() from sock_set_sndtimeo(), and add READ_ONCE()/WRITE_ONCE() where it is needed. Also SO_SNDTIMEO_OLD and SO_SNDTIMEO_NEW can call sock_set_timeout() without holding the socket lock. Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://patch.msgid.link/20250620155536.335520-2-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-06-23net: remove sock_i_uid()Eric Dumazet
Difference between sock_i_uid() and sk_uid() is that after sock_orphan(), sock_i_uid() returns GLOBAL_ROOT_UID while sk_uid() returns the last cached sk->sk_uid value. None of sock_i_uid() callers care about this. Use sk_uid() which is much faster and inlined. Note that diag/dump users are calling sock_i_ino() and can not see the full benefit yet. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Lorenzo Colitti <lorenzo@google.com> Reviewed-by: Maciej Żenczykowski <maze@google.com> Link: https://patch.msgid.link/20250620133001.4090592-3-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-05-27sctp: mark sctp_do_peeloff staticChristoph Hellwig
sctp_do_peeloff is only used inside of net/sctp/socket.c, so mark it static. Signed-off-by: Christoph Hellwig <hch@lst.de> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20250526054745.2329201-1-hch@lst.de Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-05-20sctp: Do not wake readers in __sctp_write_space()Petr Malat
Function __sctp_write_space() doesn't set poll key, which leads to ep_poll_callback() waking up all waiters, not only these waiting for the socket being writable. Set the key properly using wake_up_interruptible_poll(), which is preferred over the sync variant, as writers are not woken up before at least half of the queue is available. Also, TCP does the same. Signed-off-by: Petr Malat <oss@malat.biz> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20250516081727.1361451-1-oss@malat.biz Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-05-16net: rfs: add sock_rps_delete_flow() helperEric Dumazet
RFS can exhibit lower performance for workloads using short-lived flows and a small set of 4-tuple. This is often the case for load-testers, using a pair of hosts, if the server has a single listener port. Typical use case : Server : tcp_crr -T128 -F1000 -6 -U -l30 -R 14250 Client : tcp_crr -T128 -F1000 -6 -U -l30 -c -H server | grep local_throughput This is because RFS global hash table contains stale information, when the same RSS key is recycled for another socket and another cpu. Make sure to undo the changes and go back to initial state when a flow is disconnected. Performance of the above test is increased by 22 %, going from 372604 transactions per second to 457773. Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Octavian Purdila <tavip@google.com> Reviewed-by: Neal Cardwell <ncardwell@google.com> Link: https://patch.msgid.link/20250515100354.3339920-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-04-08sctp: detect and prevent references to a freed transport in sendmsgRicardo Cañuelo Navarro
sctp_sendmsg() re-uses associations and transports when possible by doing a lookup based on the socket endpoint and the message destination address, and then sctp_sendmsg_to_asoc() sets the selected transport in all the message chunks to be sent. There's a possible race condition if another thread triggers the removal of that selected transport, for instance, by explicitly unbinding an address with setsockopt(SCTP_SOCKOPT_BINDX_REM), after the chunks have been set up and before the message is sent. This can happen if the send buffer is full, during the period when the sender thread temporarily releases the socket lock in sctp_wait_for_sndbuf(). This causes the access to the transport data in sctp_outq_select_transport(), when the association outqueue is flushed, to result in a use-after-free read. This change avoids this scenario by having sctp_transport_free() signal the freeing of the transport, tagging it as "dead". In order to do this, the patch restores the "dead" bit in struct sctp_transport, which was removed in commit 47faa1e4c50e ("sctp: remove the dead field of sctp_transport"). Then, in the scenario where the sender thread has released the socket lock in sctp_wait_for_sndbuf(), the bit is checked again after re-acquiring the socket lock to detect the deletion. This is done while holding a reference to the transport to prevent it from being freed in the process. If the transport was deleted while the socket lock was relinquished, sctp_sendmsg_to_asoc() will return -EAGAIN to let userspace retry the send. The bug was found by a private syzbot instance (see the error report [1] and the C reproducer that triggers it [2]). Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-free_Read_in_sctp_outq_select_transport.txt [1] Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-free_Read_in_sctp_outq_select_transport__repro.c [2] Cc: stable@vger.kernel.org Fixes: df132eff4638 ("sctp: clear the transport of some out_chunk_list chunks in sctp_assoc_rm_peer") Suggested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Ricardo Cañuelo Navarro <rcn@igalia.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://patch.msgid.link/20250404-kasan_slab-use-after-free_read_in_sctp_outq_select_transport__20250404-v1-1-5ce4a0b78ef2@igalia.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-10-09sctp: ensure sk_state is set to CLOSED if hashing fails in sctp_listen_startXin Long
If hashing fails in sctp_listen_start(), the socket remains in the LISTENING state, even though it was not added to the hash table. This can lead to a scenario where a socket appears to be listening without actually being accessible. This patch ensures that if the hashing operation fails, the sk_state is set back to CLOSED before returning an error. Note that there is no need to undo the autobind operation if hashing fails, as the bind port can still be used for next listen() call on the same socket. Fixes: 76c6d988aeb3 ("sctp: add sock_reuseport for the sock in __sctp_hash_endpoint") Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2024-10-03sctp: set sk_state back to CLOSED if autobind fails in sctp_listen_startXin Long
In sctp_listen_start() invoked by sctp_inet_listen(), it should set the sk_state back to CLOSED if sctp_autobind() fails due to whatever reason. Otherwise, next time when calling sctp_inet_listen(), if sctp_sk(sk)->reuse is already set via setsockopt(SCTP_REUSE_PORT), sctp_sk(sk)->bind_hash will be dereferenced as sk_state is LISTENING, which causes a crash as bind_hash is NULL. KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:sctp_inet_listen+0x7f0/0xa20 net/sctp/socket.c:8617 Call Trace: <TASK> __sys_listen_socket net/socket.c:1883 [inline] __sys_listen+0x1b7/0x230 net/socket.c:1894 __do_sys_listen net/socket.c:1902 [inline] Fixes: 5e8f3f703ae4 ("sctp: simplify sctp listening code") Reported-by: syzbot+f4e0f821e3a3b7cee51d@syzkaller.appspotmail.com Signed-off-by: Xin Long <lucien.xin@gmail.com> Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Link: https://patch.msgid.link/a93e655b3c153dc8945d7a812e6d8ab0d52b7aa0.1727729391.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-07-03sctp: cancel a blocking accept when shutdown a listen socketXin Long
As David Laight noticed, "In a multithreaded program it is reasonable to have a thread blocked in accept(). With TCP a subsequent shutdown(listen_fd, SHUT_RDWR) causes the accept to fail. But nothing happens for SCTP." sctp_disconnect() is eventually called when shutdown a listen socket, but nothing is done in this function. This patch sets RCV_SHUTDOWN flag in sk->sk_shutdown there, and adds the check (sk->sk_shutdown & RCV_SHUTDOWN) to break and return in sctp_accept(). Note that shutdown() is only supported on TCP-style SCTP socket. Reported-by: David Laight <David.Laight@aculab.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2024-05-13net: change proto and proto_ops accept typeJens Axboe
Rather than pass in flags, error pointer, and whether this is a kernel invocation or not, add a struct proto_accept_arg struct as the argument. This then holds all of these arguments, and prepares accept for being able to pass back more information. No functional changes in this patch. Acked-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-04-30sctp: prefer struct_size over open coded arithmeticErick Archer
This is an effort to get rid of all multiplications from allocation functions in order to prevent integer overflows [1][2]. As the "ids" variable is a pointer to "struct sctp_assoc_ids" and this structure ends in a flexible array: struct sctp_assoc_ids { [...] sctp_assoc_t gaids_assoc_id[]; }; the preferred way in the kernel is to use the struct_size() helper to do the arithmetic instead of the calculation "size + size * count" in the kmalloc() function. Also, refactor the code adding the "ids_size" variable to avoid sizing twice. This way, the code is more readable and safer. This code was detected with the help of Coccinelle, and audited and modified manually. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [1] Link: https://github.com/KSPP/linux/issues/160 [2] Signed-off-by: Erick Archer <erick.archer@outlook.com> Acked-by: Xin Long <lucien.xin@gmail.com> Reviewed-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/PAXPR02MB724871DB78375AB06B5171C88B152@PAXPR02MB7248.eurprd02.prod.outlook.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-03-29net: add sk_wake_async_rcu() helperEric Dumazet
While looking at UDP receive performance, I saw sk_wake_async() was no longer inlined. This matters at least on AMD Zen1-4 platforms (see SRSO) This might be because rcu_read_lock() and rcu_read_unlock() are no longer nops in recent kernels ? Add sk_wake_async_rcu() variant, which must be called from contexts already holding rcu lock. As SOCK_FASYNC is deprecated in modern days, use unlikely() to give a hint to the compiler. sk_wake_async_rcu() is properly inlined from __udp_enqueue_schedule_skb() and sock_def_readable(). Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-03-07net: introduce include/net/rps.hEric Dumazet
Move RPS related structures and helpers from include/linux/netdevice.h and include/net/sock.h to a new include file. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20240306160031.874438-18-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-01-04sctp: fix busy pollingEric Dumazet
Busy polling while holding the socket lock makes litle sense, because incoming packets wont reach our receive queue. Fixes: 8465a5fcd1ce ("sctp: add support for busy polling to sctp protocol") Reported-by: Jacob Moroni <jmoroni@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Cc: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-12-13sctp: support MSG_ERRQUEUE flag in recvmsg()Eric Dumazet
For some reason sctp_poll() generates EPOLLERR if sk->sk_error_queue is not empty but recvmsg() can not drain the error queue yet. This is needed to better support timestamping. I had to export inet_recv_error(), since sctp can be compiled as a module. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Cc: Willem de Bruijn <willemb@google.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://lore.kernel.org/r/20231212145550.3872051-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-10-04sctp: update hb timer immediately after users change hb_intervalXin Long
Currently, when hb_interval is changed by users, it won't take effect until the next expiry of hb timer. As the default value is 30s, users have to wait up to 30s to wait its hb_interval update to work. This becomes pretty bad in containers where a much smaller value is usually set on hb_interval. This patch improves it by resetting the hb timer immediately once the value of hb_interval is updated by users. Note that we don't address the already existing 'problem' when sending a heartbeat 'on demand' if one hb has just been sent(from the timer) mentioned in: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg590224.html Signed-off-by: Xin Long <lucien.xin@gmail.com> Reviewed-by: Simon Horman <horms@kernel.org> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/r/75465785f8ee5df2fb3acdca9b8fafdc18984098.1696172660.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-08-31sctp: annotate data-races around sk->sk_wmem_queuedEric Dumazet
sk->sk_wmem_queued can be read locklessly from sctp_poll() Use sk_wmem_queued_add() when the field is changed, and add READ_ONCE() annotations in sctp_writeable() and sctp_assocs_seq_show() syzbot reported: BUG: KCSAN: data-race in sctp_poll / sctp_wfree read-write to 0xffff888149d77810 of 4 bytes by interrupt on cpu 0: sctp_wfree+0x170/0x4a0 net/sctp/socket.c:9147 skb_release_head_state+0xb7/0x1a0 net/core/skbuff.c:988 skb_release_all net/core/skbuff.c:1000 [inline] __kfree_skb+0x16/0x140 net/core/skbuff.c:1016 consume_skb+0x57/0x180 net/core/skbuff.c:1232 sctp_chunk_destroy net/sctp/sm_make_chunk.c:1503 [inline] sctp_chunk_put+0xcd/0x130 net/sctp/sm_make_chunk.c:1530 sctp_datamsg_put+0x29a/0x300 net/sctp/chunk.c:128 sctp_chunk_free+0x34/0x50 net/sctp/sm_make_chunk.c:1515 sctp_outq_sack+0xafa/0xd70 net/sctp/outqueue.c:1381 sctp_cmd_process_sack net/sctp/sm_sideeffect.c:834 [inline] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1366 [inline] sctp_side_effects net/sctp/sm_sideeffect.c:1198 [inline] sctp_do_sm+0x12c7/0x31b0 net/sctp/sm_sideeffect.c:1169 sctp_assoc_bh_rcv+0x2b2/0x430 net/sctp/associola.c:1051 sctp_inq_push+0x108/0x120 net/sctp/inqueue.c:80 sctp_rcv+0x116e/0x1340 net/sctp/input.c:243 sctp6_rcv+0x25/0x40 net/sctp/ipv6.c:1120 ip6_protocol_deliver_rcu+0x92f/0xf30 net/ipv6/ip6_input.c:437 ip6_input_finish net/ipv6/ip6_input.c:482 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] ip6_input+0xbd/0x1b0 net/ipv6/ip6_input.c:491 dst_input include/net/dst.h:468 [inline] ip6_rcv_finish+0x1e2/0x2e0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:303 [inline] ipv6_rcv+0x74/0x150 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5452 [inline] __netif_receive_skb+0x90/0x1b0 net/core/dev.c:5566 process_backlog+0x21f/0x380 net/core/dev.c:5894 __napi_poll+0x60/0x3b0 net/core/dev.c:6460 napi_poll net/core/dev.c:6527 [inline] net_rx_action+0x32b/0x750 net/core/dev.c:6660 __do_softirq+0xc1/0x265 kernel/softirq.c:553 run_ksoftirqd+0x17/0x20 kernel/softirq.c:921 smpboot_thread_fn+0x30a/0x4a0 kernel/smpboot.c:164 kthread+0x1d7/0x210 kernel/kthread.c:389 ret_from_fork+0x2e/0x40 arch/x86/kernel/process.c:145 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 read to 0xffff888149d77810 of 4 bytes by task 17828 on cpu 1: sctp_writeable net/sctp/socket.c:9304 [inline] sctp_poll+0x265/0x410 net/sctp/socket.c:8671 sock_poll+0x253/0x270 net/socket.c:1374 vfs_poll include/linux/poll.h:88 [inline] do_pollfd fs/select.c:873 [inline] do_poll fs/select.c:921 [inline] do_sys_poll+0x636/0xc00 fs/select.c:1015 __do_sys_ppoll fs/select.c:1121 [inline] __se_sys_ppoll+0x1af/0x1f0 fs/select.c:1101 __x64_sys_ppoll+0x67/0x80 fs/select.c:1101 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00019e80 -> 0x0000cc80 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 17828 Comm: syz-executor.1 Not tainted 6.5.0-rc7-syzkaller-00185-g28f20a19294d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://lore.kernel.org/r/20230830094519.950007-1-edumazet@google.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2023-08-24Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/netJakub Kicinski
Cross-merge networking fixes after downstream PR. Conflicts: include/net/inet_sock.h f866fbc842de ("ipv4: fix data-races around inet->inet_id") c274af224269 ("inet: introduce inet->inet_flags") https://lore.kernel.org/all/679ddff6-db6e-4ff6-b177-574e90d0103d@tessares.net/ Adjacent changes: drivers/net/bonding/bond_alb.c e74216b8def3 ("bonding: fix macvlan over alb bond support") f11e5bd159b0 ("bonding: support balance-alb with openvswitch") drivers/net/ethernet/broadcom/bgmac.c d6499f0b7c7c ("net: bgmac: Return PTR_ERR() for fixed_phy_register()") 23a14488ea58 ("net: bgmac: Fix return value check for fixed_phy_register()") drivers/net/ethernet/broadcom/genet/bcmmii.c 32bbe64a1386 ("net: bcmgenet: Fix return value check for fixed_phy_register()") acf50d1adbf4 ("net: bcmgenet: Return PTR_ERR() for fixed_phy_register()") net/sctp/socket.c f866fbc842de ("ipv4: fix data-races around inet->inet_id") b09bde5c3554 ("inet: move inet->mc_loop to inet->inet_frags") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-08-20ipv4: fix data-races around inet->inet_idEric Dumazet
UDP sendmsg() is lockless, so ip_select_ident_segs() can very well be run from multiple cpus [1] Convert inet->inet_id to an atomic_t, but implement a dedicated path for TCP, avoiding cost of a locked instruction (atomic_add_return()) Note that this patch will cause a trivial merge conflict because we added inet->flags in net-next tree. v2: added missing change in drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c (David Ahern) [1] BUG: KCSAN: data-race in __ip_make_skb / __ip_make_skb read-write to 0xffff888145af952a of 2 bytes by task 7803 on cpu 1: ip_select_ident_segs include/net/ip.h:542 [inline] ip_select_ident include/net/ip.h:556 [inline] __ip_make_skb+0x844/0xc70 net/ipv4/ip_output.c:1446 ip_make_skb+0x233/0x2c0 net/ipv4/ip_output.c:1560 udp_sendmsg+0x1199/0x1250 net/ipv4/udp.c:1260 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:830 sock_sendmsg_nosec net/socket.c:725 [inline] sock_sendmsg net/socket.c:748 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2494 ___sys_sendmsg net/socket.c:2548 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2634 __do_sys_sendmmsg net/socket.c:2663 [inline] __se_sys_sendmmsg net/socket.c:2660 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2660 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888145af952a of 2 bytes by task 7804 on cpu 0: ip_select_ident_segs include/net/ip.h:541 [inline] ip_select_ident include/net/ip.h:556 [inline] __ip_make_skb+0x817/0xc70 net/ipv4/ip_output.c:1446 ip_make_skb+0x233/0x2c0 net/ipv4/ip_output.c:1560 udp_sendmsg+0x1199/0x1250 net/ipv4/udp.c:1260 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:830 sock_sendmsg_nosec net/socket.c:725 [inline] sock_sendmsg net/socket.c:748 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2494 ___sys_sendmsg net/socket.c:2548 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2634 __do_sys_sendmmsg net/socket.c:2663 [inline] __se_sys_sendmmsg net/socket.c:2660 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2660 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x184d -> 0x184e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7804 Comm: syz-executor.1 Not tainted 6.5.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 ================================================================== Fixes: 23f57406b82d ("ipv4: avoid using shared IP generator for connected sockets") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Reviewed-by: David Ahern <dsahern@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-08-18sock: annotate data-races around prot->memory_pressureEric Dumazet
*prot->memory_pressure is read/writen locklessly, we need to add proper annotations. A recent commit added a new race, it is time to audit all accesses. Fixes: 2d0c88e84e48 ("sock: Fix misuse of sk_under_memory_pressure()") Fixes: 4d93df0abd50 ("[SCTP]: Rewrite of sctp buffer management code") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Abel Wu <wuyun.abel@bytedance.com> Reviewed-by: Shakeel Butt <shakeelb@google.com> Link: https://lore.kernel.org/r/20230818015132.2699348-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-08-16inet: move inet->mc_loop to inet->inet_fragsEric Dumazet
IP_MULTICAST_LOOP socket option can now be set/read without locking the socket. v3: fix build bot error reported in ipvs set_mcast_loop() Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Reviewed-by: Simon Horman <horms@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-24ipv6: remove hard coded limitation on ipv6_pinfoEric Dumazet
IPv6 inet sockets are supposed to have a "struct ipv6_pinfo" field at the end of their definition, so that inet6_sk_generic() can derive from socket size the offset of the "struct ipv6_pinfo". This is very fragile, and prevents adding bigger alignment in sockets, because inet6_sk_generic() does not work if the compiler adds padding after the ipv6_pinfo component. We are currently working on a patch series to reorganize TCP structures for better data locality and found issues similar to the one fixed in commit f5d547676ca0 ("tcp: fix tcp_inet6_sk() for 32bit kernels") Alternative would be to force an alignment on "struct ipv6_pinfo", greater or equal to __alignof__(any ipv6 sock) to ensure there is no padding. This does not look great. v2: fix typo in mptcp_proto_v6_init() (Paolo) Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Chao Wu <wwchao@google.com> Cc: Wei Wang <weiwan@google.com> Cc: Coco Li <lixiaoyan@google.com> Cc: YiFei Zhu <zhuyifei@google.com> Reviewed-by: Simon Horman <simon.horman@corigine.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-06-29sctp: fix potential deadlock on &net->sctp.addr_wq_lockChengfeng Ye
As &net->sctp.addr_wq_lock is also acquired by the timer sctp_addr_wq_timeout_handler() in protocal.c, the same lock acquisition at sctp_auto_asconf_init() seems should disable irq since it is called from sctp_accept() under process context. Possible deadlock scenario: sctp_accept() -> sctp_sock_migrate() -> sctp_auto_asconf_init() -> spin_lock(&net->sctp.addr_wq_lock) <timer interrupt> -> sctp_addr_wq_timeout_handler() -> spin_lock_bh(&net->sctp.addr_wq_lock); (deadlock here) This flaw was found using an experimental static analysis tool we are developing for irq-related deadlock. The tentative patch fix the potential deadlock by spin_lock_bh(). Signed-off-by: Chengfeng Ye <dg573847474@gmail.com> Fixes: 34e5b0118685 ("sctp: delay auto_asconf init until binding the first addr") Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://lore.kernel.org/r/20230627120340.19432-1-dg573847474@gmail.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2023-06-15net: ioctl: Use kernel memory on protocol ioctl callbacksBreno Leitao
Most of the ioctls to net protocols operates directly on userspace argument (arg). Usually doing get_user()/put_user() directly in the ioctl callback. This is not flexible, because it is hard to reuse these functions without passing userspace buffers. Change the "struct proto" ioctls to avoid touching userspace memory and operate on kernel buffers, i.e., all protocol's ioctl callbacks is adapted to operate on a kernel memory other than on userspace (so, no more {put,get}_user() and friends being called in the ioctl callback). This changes the "struct proto" ioctl format in the following way: int (*ioctl)(struct sock *sk, int cmd, - unsigned long arg); + int *karg); (Important to say that this patch does not touch the "struct proto_ops" protocols) So, the "karg" argument, which is passed to the ioctl callback, is a pointer allocated to kernel space memory (inside a function wrapper). This buffer (karg) may contain input argument (copied from userspace in a prep function) and it might return a value/buffer, which is copied back to userspace if necessary. There is not one-size-fits-all format (that is I am using 'may' above), but basically, there are three type of ioctls: 1) Do not read from userspace, returns a result to userspace 2) Read an input parameter from userspace, and does not return anything to userspace 3) Read an input from userspace, and return a buffer to userspace. The default case (1) (where no input parameter is given, and an "int" is returned to userspace) encompasses more than 90% of the cases, but there are two other exceptions. Here is a list of exceptions: * Protocol RAW: * cmd = SIOCGETVIFCNT: * input and output = struct sioc_vif_req * cmd = SIOCGETSGCNT * input and output = struct sioc_sg_req * Explanation: for the SIOCGETVIFCNT case, userspace passes the input argument, which is struct sioc_vif_req. Then the callback populates the struct, which is copied back to userspace. * Protocol RAW6: * cmd = SIOCGETMIFCNT_IN6 * input and output = struct sioc_mif_req6 * cmd = SIOCGETSGCNT_IN6 * input and output = struct sioc_sg_req6 * Protocol PHONET: * cmd == SIOCPNADDRESOURCE | SIOCPNDELRESOURCE * input int (4 bytes) * Nothing is copied back to userspace. For the exception cases, functions sock_sk_ioctl_inout() will copy the userspace input, and copy it back to kernel space. The wrapper that prepare the buffer and put the buffer back to user is sk_ioctl(), so, instead of calling sk->sk_prot->ioctl(), the callee now calls sk_ioctl(), which will handle all cases. Signed-off-by: Breno Leitao <leitao@debian.org> Reviewed-by: Willem de Bruijn <willemb@google.com> Reviewed-by: David Ahern <dsahern@kernel.org> Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com> Link: https://lore.kernel.org/r/20230609152800.830401-1-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-05-12sctp: add bpf_bypass_getsockopt proto callbackAlexander Mikhalitsyn
Implement ->bpf_bypass_getsockopt proto callback and filter out SCTP_SOCKOPT_PEELOFF, SCTP_SOCKOPT_PEELOFF_FLAGS and SCTP_SOCKOPT_CONNECTX3 socket options from running eBPF hook on them. SCTP_SOCKOPT_PEELOFF and SCTP_SOCKOPT_PEELOFF_FLAGS options do fd_install(), and if BPF_CGROUP_RUN_PROG_GETSOCKOPT hook returns an error after success of the original handler sctp_getsockopt(...), userspace will receive an error from getsockopt syscall and will be not aware that fd was successfully installed into a fdtable. As pointed by Marcelo Ricardo Leitner it seems reasonable to skip bpf getsockopt hook for SCTP_SOCKOPT_CONNECTX3 sockopt too. Because internaly, it triggers connect() and if error is masked then userspace will be confused. This patch was born as a result of discussion around a new SCM_PIDFD interface: https://lore.kernel.org/all/20230413133355.350571-3-aleksandr.mikhalitsyn@canonical.com/ Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Cc: Daniel Borkmann <daniel@iogearbox.net> Cc: Christian Brauner <brauner@kernel.org> Cc: Stanislav Fomichev <sdf@google.com> Cc: Neil Horman <nhorman@tuxdriver.com> Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Cc: Xin Long <lucien.xin@gmail.com> Cc: linux-sctp@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Suggested-by: Stanislav Fomichev <sdf@google.com> Acked-by: Stanislav Fomichev <sdf@google.com> Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com> Acked-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-04-17sctp: add intl_capable and reconf_capable in ss peer_capableXin Long
There are two new peer capables have been added since sctp_diag was introduced into SCTP. When dumping the peer capables, these two new peer capables should also be included. To not break the old capables, reconf_capable takes the old hostname_address bit, and intl_capable uses the higher available bit in sctpi_peer_capable. Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-04-17sctp: delete the obsolete code for the host name address paramXin Long
In the latest RFC9260, the Host Name Address param has been deprecated. For INIT chunk: Note 3: An INIT chunk MUST NOT contain the Host Name Address parameter. The receiver of an INIT chunk containing a Host Name Address parameter MUST send an ABORT chunk and MAY include an "Unresolvable Address" error cause. For Supported Address Types: The value indicating the Host Name Address parameter MUST NOT be used when sending this parameter and MUST be ignored when receiving this parameter. Currently Linux SCTP doesn't really support Host Name Address param, but only saves some flag and print debug info, which actually won't even be triggered due to the verification in sctp_verify_param(). This patch is to delete those dead code. Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-04-02sctp: check send stream number after wait_for_sndbufXin Long
This patch fixes a corner case where the asoc out stream count may change after wait_for_sndbuf. When the main thread in the client starts a connection, if its out stream count is set to N while the in stream count in the server is set to N - 2, another thread in the client keeps sending the msgs with stream number N - 1, and waits for sndbuf before processing INIT_ACK. However, after processing INIT_ACK, the out stream count in the client is shrunk to N - 2, the same to the in stream count in the server. The crash occurs when the thread waiting for sndbuf is awake and sends the msg in a non-existing stream(N - 1), the call trace is as below: KASAN: null-ptr-deref in range [0x0000000000000038-0x000000000000003f] Call Trace: <TASK> sctp_cmd_send_msg net/sctp/sm_sideeffect.c:1114 [inline] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1777 [inline] sctp_side_effects net/sctp/sm_sideeffect.c:1199 [inline] sctp_do_sm+0x197d/0x5310 net/sctp/sm_sideeffect.c:1170 sctp_primitive_SEND+0x9f/0xc0 net/sctp/primitive.c:163 sctp_sendmsg_to_asoc+0x10eb/0x1a30 net/sctp/socket.c:1868 sctp_sendmsg+0x8d4/0x1d90 net/sctp/socket.c:2026 inet_sendmsg+0x9d/0xe0 net/ipv4/af_inet.c:825 sock_sendmsg_nosec net/socket.c:722 [inline] sock_sendmsg+0xde/0x190 net/socket.c:745 The fix is to add an unlikely check for the send stream number after the thread wakes up from the wait_for_sndbuf. Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Reported-by: syzbot+47c24ca20a2fa01f082e@syzkaller.appspotmail.com Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-01-25inet: Add IP_LOCAL_PORT_RANGE socket optionJakub Sitnicki
Users who want to share a single public IP address for outgoing connections between several hosts traditionally reach for SNAT. However, SNAT requires state keeping on the node(s) performing the NAT. A stateless alternative exists, where a single IP address used for egress can be shared between several hosts by partitioning the available ephemeral port range. In such a setup: 1. Each host gets assigned a disjoint range of ephemeral ports. 2. Applications open connections from the host-assigned port range. 3. Return traffic gets routed to the host based on both, the destination IP and the destination port. An application which wants to open an outgoing connection (connect) from a given port range today can choose between two solutions: 1. Manually pick the source port by bind()'ing to it before connect()'ing the socket. This approach has a couple of downsides: a) Search for a free port has to be implemented in the user-space. If the chosen 4-tuple happens to be busy, the application needs to retry from a different local port number. Detecting if 4-tuple is busy can be either easy (TCP) or hard (UDP). In TCP case, the application simply has to check if connect() returned an error (EADDRNOTAVAIL). That is assuming that the local port sharing was enabled (REUSEADDR) by all the sockets. # Assume desired local port range is 60_000-60_511 s = socket(AF_INET, SOCK_STREAM) s.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) s.bind(("192.0.2.1", 60_000)) s.connect(("1.1.1.1", 53)) # Fails only if 192.0.2.1:60000 -> 1.1.1.1:53 is busy # Application must retry with another local port In case of UDP, the network stack allows binding more than one socket to the same 4-tuple, when local port sharing is enabled (REUSEADDR). Hence detecting the conflict is much harder and involves querying sock_diag and toggling the REUSEADDR flag [1]. b) For TCP, bind()-ing to a port within the ephemeral port range means that no connecting sockets, that is those which leave it to the network stack to find a free local port at connect() time, can use the this port. IOW, the bind hash bucket tb->fastreuse will be 0 or 1, and the port will be skipped during the free port search at connect() time. 2. Isolate the app in a dedicated netns and use the use the per-netns ip_local_port_range sysctl to adjust the ephemeral port range bounds. The per-netns setting affects all sockets, so this approach can be used only if: - there is just one egress IP address, or - the desired egress port range is the same for all egress IP addresses used by the application. For TCP, this approach avoids the downsides of (1). Free port search and 4-tuple conflict detection is done by the network stack: system("sysctl -w net.ipv4.ip_local_port_range='60000 60511'") s = socket(AF_INET, SOCK_STREAM) s.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1) s.bind(("192.0.2.1", 0)) s.connect(("1.1.1.1", 53)) # Fails if all 4-tuples 192.0.2.1:60000-60511 -> 1.1.1.1:53 are busy For UDP this approach has limited applicability. Setting the IP_BIND_ADDRESS_NO_PORT socket option does not result in local source port being shared with other connected UDP sockets. Hence relying on the network stack to find a free source port, limits the number of outgoing UDP flows from a single IP address down to the number of available ephemeral ports. To put it another way, partitioning the ephemeral port range between hosts using the existing Linux networking API is cumbersome. To address this use case, add a new socket option at the SOL_IP level, named IP_LOCAL_PORT_RANGE. The new option can be used to clamp down the ephemeral port range for each socket individually. The option can be used only to narrow down the per-netns local port range. If the per-socket range lies outside of the per-netns range, the latter takes precedence. UAPI-wise, the low and high range bounds are passed to the kernel as a pair of u16 values in host byte order packed into a u32. This avoids pointer passing. PORT_LO = 40_000 PORT_HI = 40_511 s = socket(AF_INET, SOCK_STREAM) v = struct.pack("I", PORT_HI << 16 | PORT_LO) s.setsockopt(SOL_IP, IP_LOCAL_PORT_RANGE, v) s.bind(("127.0.0.1", 0)) s.getsockname() # Local address between ("127.0.0.1", 40_000) and ("127.0.0.1", 40_511), # if there is a free port. EADDRINUSE otherwise. [1] https://github.com/cloudflare/cloudflare-blog/blob/232b432c1d57/2022-02-connectx/connectx.py#L116 Reviewed-by: Marek Majkowski <marek@cloudflare.com> Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-01-23net/sock: Introduce trace_sk_data_ready()Peilin Ye
As suggested by Cong, introduce a tracepoint for all ->sk_data_ready() callback implementations. For example: <...> iperf-609 [002] ..... 70.660425: sk_data_ready: family=2 protocol=6 func=sock_def_readable iperf-609 [002] ..... 70.660436: sk_data_ready: family=2 protocol=6 func=sock_def_readable <...> Suggested-by: Cong Wang <cong.wang@bytedance.com> Signed-off-by: Peilin Ye <peilin.ye@bytedance.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-12-13Merge tag 'net-next-6.2' of ↵Linus Torvalds
git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next Pull networking updates from Paolo Abeni: "Core: - Allow live renaming when an interface is up - Add retpoline wrappers for tc, improving considerably the performances of complex queue discipline configurations - Add inet drop monitor support - A few GRO performance improvements - Add infrastructure for atomic dev stats, addressing long standing data races - De-duplicate common code between OVS and conntrack offloading infrastructure - A bunch of UBSAN_BOUNDS/FORTIFY_SOURCE improvements - Netfilter: introduce packet parser for tunneled packets - Replace IPVS timer-based estimators with kthreads to scale up the workload with the number of available CPUs - Add the helper support for connection-tracking OVS offload BPF: - Support for user defined BPF objects: the use case is to allocate own objects, build own object hierarchies and use the building blocks to build own data structures flexibly, for example, linked lists in BPF - Make cgroup local storage available to non-cgroup attached BPF programs - Avoid unnecessary deadlock detection and failures wrt BPF task storage helpers - A relevant bunch of BPF verifier fixes and improvements - Veristat tool improvements to support custom filtering, sorting, and replay of results - Add LLVM disassembler as default library for dumping JITed code - Lots of new BPF documentation for various BPF maps - Add bpf_rcu_read_{,un}lock() support for sleepable programs - Add RCU grace period chaining to BPF to wait for the completion of access from both sleepable and non-sleepable BPF programs - Add support storing struct task_struct objects as kptrs in maps - Improve helper UAPI by explicitly defining BPF_FUNC_xxx integer values - Add libbpf *_opts API-variants for bpf_*_get_fd_by_id() functions Protocols: - TCP: implement Protective Load Balancing across switch links - TCP: allow dynamically disabling TCP-MD5 static key, reverting back to fast[er]-path - UDP: Introduce optional per-netns hash lookup table - IPv6: simplify and cleanup sockets disposal - Netlink: support different type policies for each generic netlink operation - MPTCP: add MSG_FASTOPEN and FastOpen listener side support - MPTCP: add netlink notification support for listener sockets events - SCTP: add VRF support, allowing sctp sockets binding to VRF devices - Add bridging MAC Authentication Bypass (MAB) support - Extensions for Ethernet VPN bridging implementation to better support multicast scenarios - More work for Wi-Fi 7 support, comprising conversion of all the existing drivers to internal TX queue usage - IPSec: introduce a new offload type (packet offload) allowing complete header processing and crypto offloading - IPSec: extended ack support for more descriptive XFRM error reporting - RXRPC: increase SACK table size and move processing into a per-local endpoint kernel thread, reducing considerably the required locking - IEEE 802154: synchronous send frame and extended filtering support, initial support for scanning available 15.4 networks - Tun: bump the link speed from 10Mbps to 10Gbps - Tun/VirtioNet: implement UDP segmentation offload support Driver API: - PHY/SFP: improve power level switching between standard level 1 and the higher power levels - New API for netdev <-> devlink_port linkage - PTP: convert existing drivers to new frequency adjustment implementation - DSA: add support for rx offloading - Autoload DSA tagging driver when dynamically changing protocol - Add new PCP and APPTRUST attributes to Data Center Bridging - Add configuration support for 800Gbps link speed - Add devlink port function attribute to enable/disable RoCE and migratable - Extend devlink-rate to support strict prioriry and weighted fair queuing - Add devlink support to directly reading from region memory - New device tree helper to fetch MAC address from nvmem - New big TCP helper to simplify temporary header stripping New hardware / drivers: - Ethernet: - Marvel Octeon CNF95N and CN10KB Ethernet Switches - Marvel Prestera AC5X Ethernet Switch - WangXun 10 Gigabit NIC - Motorcomm yt8521 Gigabit Ethernet - Microchip ksz9563 Gigabit Ethernet Switch - Microsoft Azure Network Adapter - Linux Automation 10Base-T1L adapter - PHY: - Aquantia AQR112 and AQR412 - Motorcomm YT8531S - PTP: - Orolia ART-CARD - WiFi: - MediaTek Wi-Fi 7 (802.11be) devices - RealTek rtw8821cu, rtw8822bu, rtw8822cu and rtw8723du USB devices - Bluetooth: - Broadcom BCM4377/4378/4387 Bluetooth chipsets - Realtek RTL8852BE and RTL8723DS - Cypress.CYW4373A0 WiFi + Bluetooth combo device Drivers: - CAN: - gs_usb: bus error reporting support - kvaser_usb: listen only and bus error reporting support - Ethernet NICs: - Intel (100G): - extend action skbedit to RX queue mapping - implement devlink-rate support - support direct read from memory - nVidia/Mellanox (mlx5): - SW steering improvements, increasing rules update rate - Support for enhanced events compression - extend H/W offload packet manipulation capabilities - implement IPSec packet offload mode - nVidia/Mellanox (mlx4): - better big TCP support - Netronome Ethernet NICs (nfp): - IPsec offload support - add support for multicast filter - Broadcom: - RSS and PTP support improvements - AMD/SolarFlare: - netlink extened ack improvements - add basic flower matches to offload, and related stats - Virtual NICs: - ibmvnic: introduce affinity hint support - small / embedded: - FreeScale fec: add initial XDP support - Marvel mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood - TI am65-cpsw: add suspend/resume support - Mediatek MT7986: add RX wireless wthernet dispatch support - Realtek 8169: enable GRO software interrupt coalescing per default - Ethernet high-speed switches: - Microchip (sparx5): - add support for Sparx5 TC/flower H/W offload via VCAP - Mellanox mlxsw: - add 802.1X and MAC Authentication Bypass offload support - add ip6gre support - Embedded Ethernet switches: - Mediatek (mtk_eth_soc): - improve PCS implementation, add DSA untag support - enable flow offload support - Renesas: - add rswitch R-Car Gen4 gPTP support - Microchip (lan966x): - add full XDP support - add TC H/W offload via VCAP - enable PTP on bridge interfaces - Microchip (ksz8): - add MTU support for KSZ8 series - Qualcomm 802.11ax WiFi (ath11k): - support configuring channel dwell time during scan - MediaTek WiFi (mt76): - enable Wireless Ethernet Dispatch (WED) offload support - add ack signal support - enable coredump support - remain_on_channel support - Intel WiFi (iwlwifi): - enable Wi-Fi 7 Extremely High Throughput (EHT) PHY capabilities - 320 MHz channels support - RealTek WiFi (rtw89): - new dynamic header firmware format support - wake-over-WLAN support" * tag 'net-next-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2002 commits) ipvs: fix type warning in do_div() on 32 bit net: lan966x: Remove a useless test in lan966x_ptp_add_trap() net: ipa: add IPA v4.7 support dt-bindings: net: qcom,ipa: Add SM6350 compatible bnxt: Use generic HBH removal helper in tx path IPv6/GRO: generic helper to remove temporary HBH/jumbo header in driver selftests: forwarding: Add bridge MDB test selftests: forwarding: Rename bridge_mdb test bridge: mcast: Support replacement of MDB port group entries bridge: mcast: Allow user space to specify MDB entry routing protocol bridge: mcast: Allow user space to add (*, G) with a source list and filter mode bridge: mcast: Add support for (*, G) with a source list and filter mode bridge: mcast: Avoid arming group timer when (S, G) corresponds to a source bridge: mcast: Add a flag for user installed source entries bridge: mcast: Expose __br_multicast_del_group_src() bridge: mcast: Expose br_multicast_new_group_src() bridge: mcast: Add a centralized error path bridge: mcast: Place netlink policy before validation functions bridge: mcast: Split (*, G) and (S, G) addition into different functions bridge: mcast: Do not derive entry type from its filter mode ...
2022-11-18sctp: add dif and sdif check in asoc and ep lookupXin Long
This patch at first adds a pernet global l3mdev_accept to decide if it accepts the packets from a l3mdev when a SCTP socket doesn't bind to any interface. It's set to 1 to avoid any possible incompatible issue, and in next patch, a sysctl will be introduced to allow to change it. Then similar to inet/udp_sk_bound_dev_eq(), sctp_sk_bound_dev_eq() is added to check either dif or sdif is equal to sk_bound_dev_if, and to check sid is 0 or l3mdev_accept is 1 if sk_bound_dev_if is not set. This function is used to match a association or a endpoint, namely called by sctp_addrs_lookup_transport() and sctp_endpoint_is_match(). All functions that needs updating are: sctp_rcv(): asoc: __sctp_rcv_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_lookup_harder() __sctp_rcv_init_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_walk_lookup() __sctp_rcv_asconf_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() ep: __sctp_rcv_lookup_endpoint() -> sctp_endpoint_is_match() sctp_connect(): sctp_endpoint_is_peeled_off() __sctp_lookup_association() sctp_has_association() sctp_lookup_association() __sctp_lookup_association() -> sctp_addrs_lookup_transport() sctp_diag_dump_one(): sctp_transport_lookup_process() -> sctp_addrs_lookup_transport() Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-11-18sctp: check sk_bound_dev_if when matching ep in get_portXin Long
In sctp_get_port_local(), when binding to IP and PORT, it should also check sk_bound_dev_if to match listening sk if it's set by SO_BINDTOIFINDEX, so that multiple sockets with the same IP and PORT, but different sk_bound_dev_if can be listened at the same time. Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-11-18treewide: use get_random_u32_below() instead of deprecated functionJason A. Donenfeld
This is a simple mechanical transformation done by: @@ expression E; @@ - prandom_u32_max + get_random_u32_below (E) Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Acked-by: Darrick J. Wong <djwong@kernel.org> # for xfs Reviewed-by: SeongJae Park <sj@kernel.org> # for damon Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> # for infiniband Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> # for arm Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # for mmc Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2022-10-24sctp: Call inet6_destroy_sock() via sk->sk_destruct().Kuniyuki Iwashima
After commit d38afeec26ed ("tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct()."), we call inet6_destroy_sock() in sk->sk_destruct() by setting inet6_sock_destruct() to it to make sure we do not leak inet6-specific resources. SCTP sets its own sk->sk_destruct() in the sctp_init_sock(), and SCTPv6 socket reuses it as the init function. To call inet6_sock_destruct() from SCTPv6 sk->sk_destruct(), we set sctp_v6_destruct_sock() in a new init function. Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-10-11treewide: use get_random_{u8,u16}() when possible, part 1Jason A. Donenfeld
Rather than truncate a 32-bit value to a 16-bit value or an 8-bit value, simply use the get_random_{u8,u16}() functions, which are faster than wasting the additional bytes from a 32-bit value. This was done mechanically with this coccinelle script: @@ expression E; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u16; typedef __be16; typedef __le16; typedef u8; @@ ( - (get_random_u32() & 0xffff) + get_random_u16() | - (get_random_u32() & 0xff) + get_random_u8() | - (get_random_u32() % 65536) + get_random_u16() | - (get_random_u32() % 256) + get_random_u8() | - (get_random_u32() >> 16) + get_random_u16() | - (get_random_u32() >> 24) + get_random_u8() | - (u16)get_random_u32() + get_random_u16() | - (u8)get_random_u32() + get_random_u8() | - (__be16)get_random_u32() + (__be16)get_random_u16() | - (__le16)get_random_u32() + (__le16)get_random_u16() | - prandom_u32_max(65536) + get_random_u16() | - prandom_u32_max(256) + get_random_u8() | - E->inet_id = get_random_u32() + E->inet_id = get_random_u16() ) @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u16; identifier v; @@ - u16 v = get_random_u32(); + u16 v = get_random_u16(); @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u8; identifier v; @@ - u8 v = get_random_u32(); + u8 v = get_random_u8(); @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u16; u16 v; @@ - v = get_random_u32(); + v = get_random_u16(); @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u8; u8 v; @@ - v = get_random_u32(); + v = get_random_u8(); // Find a potential literal @literal_mask@ expression LITERAL; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; position p; @@ ((T)get_random_u32()@p & (LITERAL)) // Examine limits @script:python add_one@ literal << literal_mask.LITERAL; RESULT; @@ value = None if literal.startswith('0x'): value = int(literal, 16) elif literal[0] in '123456789': value = int(literal, 10) if value is None: print("I don't know how to handle %s" % (literal)) cocci.include_match(False) elif value < 256: coccinelle.RESULT = cocci.make_ident("get_random_u8") elif value < 65536: coccinelle.RESULT = cocci.make_ident("get_random_u16") else: print("Skipping large mask of %s" % (literal)) cocci.include_match(False) // Replace the literal mask with the calculated result. @plus_one@ expression literal_mask.LITERAL; position literal_mask.p; identifier add_one.RESULT; identifier FUNC; @@ - (FUNC()@p & (LITERAL)) + (RESULT() & LITERAL) Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Toke Høiland-Jørgensen <toke@toke.dk> # for sch_cake Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2022-10-11treewide: use prandom_u32_max() when possible, part 1Jason A. Donenfeld
Rather than incurring a division or requesting too many random bytes for the given range, use the prandom_u32_max() function, which only takes the minimum required bytes from the RNG and avoids divisions. This was done mechanically with this coccinelle script: @basic@ expression E; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u64; @@ ( - ((T)get_random_u32() % (E)) + prandom_u32_max(E) | - ((T)get_random_u32() & ((E) - 1)) + prandom_u32_max(E * XXX_MAKE_SURE_E_IS_POW2) | - ((u64)(E) * get_random_u32() >> 32) + prandom_u32_max(E) | - ((T)get_random_u32() & ~PAGE_MASK) + prandom_u32_max(PAGE_SIZE) ) @multi_line@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; identifier RAND; expression E; @@ - RAND = get_random_u32(); ... when != RAND - RAND %= (E); + RAND = prandom_u32_max(E); // Find a potential literal @literal_mask@ expression LITERAL; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; position p; @@ ((T)get_random_u32()@p & (LITERAL)) // Add one to the literal. @script:python add_one@ literal << literal_mask.LITERAL; RESULT; @@ value = None if literal.startswith('0x'): value = int(literal, 16) elif literal[0] in '123456789': value = int(literal, 10) if value is None: print("I don't know how to handle %s" % (literal)) cocci.include_match(False) elif value == 2**32 - 1 or value == 2**31 - 1 or value == 2**24 - 1 or value == 2**16 - 1 or value == 2**8 - 1: print("Skipping 0x%x for cleanup elsewhere" % (value)) cocci.include_match(False) elif value & (value + 1) != 0: print("Skipping 0x%x because it's not a power of two minus one" % (value)) cocci.include_match(False) elif literal.startswith('0x'): coccinelle.RESULT = cocci.make_expr("0x%x" % (value + 1)) else: coccinelle.RESULT = cocci.make_expr("%d" % (value + 1)) // Replace the literal mask with the calculated result. @plus_one@ expression literal_mask.LITERAL; position literal_mask.p; expression add_one.RESULT; identifier FUNC; @@ - (FUNC()@p & (LITERAL)) + prandom_u32_max(RESULT) @collapse_ret@ type T; identifier VAR; expression E; @@ { - T VAR; - VAR = (E); - return VAR; + return E; } @drop_var@ type T; identifier VAR; @@ { - T VAR; ... when != VAR } Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: KP Singh <kpsingh@kernel.org> Reviewed-by: Jan Kara <jack@suse.cz> # for ext4 and sbitmap Reviewed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com> # for drbd Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Heiko Carstens <hca@linux.ibm.com> # for s390 Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # for mmc Acked-by: Darrick J. Wong <djwong@kernel.org> # for xfs Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>