1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60#include <linux/sched.h>
61
62#include <net/net_namespace.h>
63#include <net/icmp.h>
64#include <net/inet_hashtables.h>
65#include <net/tcp.h>
66#include <net/transp_v6.h>
67#include <net/ipv6.h>
68#include <net/inet_common.h>
69#include <net/timewait_sock.h>
70#include <net/xfrm.h>
71#include <net/secure_seq.h>
72#include <net/busy_poll.h>
73
74#include <linux/inet.h>
75#include <linux/ipv6.h>
76#include <linux/stddef.h>
77#include <linux/proc_fs.h>
78#include <linux/seq_file.h>
79#include <linux/inetdevice.h>
80#include <linux/btf_ids.h>
81
82#include <crypto/hash.h>
83#include <linux/scatterlist.h>
84
85#include <trace/events/tcp.h>
86
87#ifdef CONFIG_TCP_MD5SIG
88static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 __be32 daddr, __be32 saddr, const struct tcphdr *th);
90#endif
91
92struct inet_hashinfo tcp_hashinfo;
93EXPORT_SYMBOL(tcp_hashinfo);
94
95static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96
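/* Initial sequence number and timestamp offset for a passive connection are
 * derived from the 4-tuple of the incoming SYN via the keyed hashes in
 * net/core/secure_seq.c.  Note that saddr/daddr (and the ports) are swapped:
 * the values we generate are for the reply direction.
 */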
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
109
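/* Decide whether connect() may reuse a 4-tuple that is still held by a
 * TIME-WAIT socket.  sysctl_tcp_tw_reuse: 0 - never reuse, 1 - reuse when
 * the new connection's timestamp is strictly newer than the one recorded in
 * the TIME-WAIT bucket, 2 - reuse for loopback traffic only (checked below).
 * Returning 1 tells the caller it may hash the new socket in place of the
 * TIME-WAIT one.
 */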
110int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111{
112 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
116
117 if (reuse == 2) {
118 /* Still does not detect *everything* that goes through
119 * lo, since we require a loopback src or dst address
120 * or direct binding to 'lo' interface.
121 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
139 if (!loopback)
140 reuse = 0;
141 }
142
143 /* With PAWS, it is safe from the viewpoint
144 of data integrity. Even without PAWS it is safe provided sequence
145 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146
147 Actually, the idea is close to VJ's one, only timestamp cache is
148 held not per host, but per port pair and TW bucket is used as state
149 holder.
150
151 If TW bucket has been already destroyed we fall back to VJ's scheme
152 and use initial timestamp retrieved from peer table.
153 */
154 if (tcptw->tw_ts_recent_stamp &&
155 (!twp || (reuse && time_after32(ktime_get_seconds(),
156 tcptw->tw_ts_recent_stamp)))) {
157 /* In case of repair and re-using TIME-WAIT sockets we still
158 * want to be sure that it is safe as above but honor the
159 * sequence numbers and time stamps set as part of the repair
160 * process.
161 *
162 * Without this check re-using a TIME-WAIT socket with TCP
163 * repair would accumulate a -1 on the repair assigned
164 * sequence number. The first time it is reused the sequence
165 * is -1, the second time -2, etc. This fixes that issue
166 * without appearing to create any others.
167 */
168 if (likely(!tp->repair)) {
169 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170
171 if (!seq)
172 seq = 1;
173 WRITE_ONCE(tp->write_seq, seq);
174 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
175 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176 }
		sock_hold(sktw);
178 return 1;
179 }
180
181 return 0;
182}
183EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184
185static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 int addr_len)
187{
188 /* This check is replicated from tcp_v4_connect() and intended to
189 * prevent BPF program called below from accessing bytes that are out
190 * of the bound specified by user in addr_len.
191 */
192 if (addr_len < sizeof(struct sockaddr_in))
193 return -EINVAL;
194
195 sock_owned_by_me(sk);
196
197 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
198}
199
200/* This will initiate an outgoing connection. */
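/* Roughly: validate the address, resolve a route, pick/bind the source
 * address, move to SYN-SENT, let inet_hash_connect() choose an ephemeral
 * port and insert the socket into the hash tables, re-check the route with
 * the final ports, choose the ISN and timestamp offset, then (unless Fast
 * Open defers it) build and send the SYN via tcp_connect().
 */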
201int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202{
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_timewait_death_row *tcp_death_row;
205 struct inet_sock *inet = inet_sk(sk);
206 struct tcp_sock *tp = tcp_sk(sk);
207 struct ip_options_rcu *inet_opt;
208 struct net *net = sock_net(sk);
209 __be16 orig_sport, orig_dport;
210 __be32 daddr, nexthop;
211 struct flowi4 *fl4;
212 struct rtable *rt;
213 int err;
214
215 if (addr_len < sizeof(struct sockaddr_in))
216 return -EINVAL;
217
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
220
221 nexthop = daddr = usin->sin_addr.s_addr;
222 inet_opt = rcu_dereference_protected(inet->inet_opt,
223 lockdep_sock_is_held(sk));
224 if (inet_opt && inet_opt->opt.srr) {
225 if (!daddr)
226 return -EINVAL;
227 nexthop = inet_opt->opt.faddr;
228 }
229
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
232 fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240 return err;
241 }
242
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 ip_rt_put(rt);
245 return -ENETUNREACH;
246 }
247
248 if (!inet_opt || !inet_opt->opt.srr)
249 daddr = fl4->daddr;
250
251 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252
253 if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
261 }
262
263 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264 /* Reset inherited state */
265 tp->rx_opt.ts_recent = 0;
266 tp->rx_opt.ts_recent_stamp = 0;
267 if (likely(!tp->repair))
268 WRITE_ONCE(tp->write_seq, 0);
269 }
270
271 inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);
273
274 inet_csk(sk)->icsk_ext_hdr_len = 0;
275 if (inet_opt)
276 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277
278 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and do not release the socket
	 * lock while we select a source port, enter ourselves into the hash
	 * tables and complete the initialization.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
287 if (err)
288 goto failure;
289
290 sk_set_txhash(sk);
291
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
303 rt = NULL;
304
305 if (likely(!tp->repair)) {
306 if (!tp->write_seq)
307 WRITE_ONCE(tp->write_seq,
308 secure_tcp_seq(inet->inet_saddr,
309 inet->inet_daddr,
310 inet->inet_sport,
311 usin->sin_port));
312 WRITE_ONCE(tp->tsoffset,
313 secure_tcp_ts_off(net, inet->inet_saddr,
314 inet->inet_daddr));
315 }
316
	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
320 return err;
321 if (err)
322 goto failure;
323
324 err = tcp_connect(sk);
325
326 if (err)
327 goto failure;
328
329 return 0;
330
331failure:
332 /*
333 * This unhashes the socket and releases the local port,
334 * if necessary.
335 */
	tcp_set_state(sk, TCP_CLOSE);
337 inet_bhash2_reset_saddr(sk);
338 ip_rt_put(rt);
339 sk->sk_route_caps = 0;
340 inet->inet_dport = 0;
341 return err;
342}
343EXPORT_SYMBOL(tcp_v4_connect);
344
345/*
346 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
347 * It can be called through tcp_release_cb() if socket was owned by user
348 * at the time tcp_v4_err() was called to handle ICMP message.
349 */
350void tcp_v4_mtu_reduced(struct sock *sk)
351{
352 struct inet_sock *inet = inet_sk(sk);
353 struct dst_entry *dst;
354 u32 mtu;
355
356 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
357 return;
358 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
359 dst = inet_csk_update_pmtu(sk, mtu);
360 if (!dst)
361 return;
362
363 /* Something is about to be wrong... Remember soft error
364 * for the case, if this connection will not able to recover.
365 */
366 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
367 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
368
369 mtu = dst_mtu(dst);
370
371 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
372 ip_sk_accept_pmtu(sk) &&
373 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);
375
376 /* Resend the TCP packet because it's
377 * clear that the old packet has been
378 * dropped. This is the new "fast" path mtu
379 * discovery.
380 */
381 tcp_simple_retransmit(sk);
382 } /* else let the usual retransmit timer handle it */
383}
384EXPORT_SYMBOL(tcp_v4_mtu_reduced);
385
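/* Handle an ICMP redirect: if we still have a cached route for this socket,
 * let the dst's ->redirect() operation update it to the new next hop.
 */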
386static void do_redirect(struct sk_buff *skb, struct sock *sk)
387{
	struct dst_entry *dst = __sk_dst_check(sk, 0);
389
390 if (dst)
391 dst->ops->redirect(dst, sk, skb);
392}
393
394
395/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
396void tcp_req_err(struct sock *sk, u32 seq, bool abort)
397{
398 struct request_sock *req = inet_reqsk(sk);
399 struct net *net = sock_net(sk);
400
401 /* ICMPs are not backlogged, hence we cannot get
402 * an established socket here.
403 */
404 if (seq != tcp_rsk(req)->snt_isn) {
405 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
406 } else if (abort) {
407 /*
408 * Still in SYN_RECV, just remove it silently.
409 * There is no good way to pass the error to the newly
410 * created socket, and POSIX does not want network
411 * errors returned from accept().
412 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
415 }
416 reqsk_put(req);
417}
418EXPORT_SYMBOL(tcp_req_err);
419
420/* TCP-LD (RFC 6069) logic */
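/* RFC 6069 ("TCP-LCD"): when an ICMP unreachable arrives for the segment at
 * snd_una while we are backing off retransmissions, the loss was likely due
 * to a connectivity disruption rather than congestion.  Undo one backoff
 * step, recompute the RTO, and either re-arm the timer with the remaining
 * time or retransmit immediately if that RTO has already elapsed.
 */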
421void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
422{
423 struct inet_connection_sock *icsk = inet_csk(sk);
424 struct tcp_sock *tp = tcp_sk(sk);
425 struct sk_buff *skb;
426 s32 remaining;
427 u32 delta_us;
428
429 if (sock_owned_by_user(sk))
430 return;
431
432 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
433 !icsk->icsk_backoff)
434 return;
435
436 skb = tcp_rtx_queue_head(sk);
437 if (WARN_ON_ONCE(!skb))
438 return;
439
440 icsk->icsk_backoff--;
441 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
442 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
443
444 tcp_mstamp_refresh(tp);
445 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
451 } else {
452 /* RTO revert clocked out retransmission.
453 * Will retransmit now.
454 */
455 tcp_retransmit_timer(sk);
456 }
457}
458EXPORT_SYMBOL(tcp_ld_RTO_revert);
459
460/*
461 * This routine is called by the ICMP module when it gets some
462 * sort of error condition. If err < 0 then the socket should
463 * be closed and the error returned to the user. If err > 0
464 * it's just the icmp type << 8 | icmp code. After adjustment
465 * header points to the first 8 bytes of the tcp header. We need
466 * to find the appropriate port.
467 *
468 * The locking strategy used here is very "optimistic". When
469 * someone else accesses the socket the ICMP is just dropped
470 * and for some paths there is no check at all.
471 * A more general error queue to queue errors for later handling
472 * is probably better.
473 *
474 */
475
476int tcp_v4_err(struct sk_buff *skb, u32 info)
477{
478 const struct iphdr *iph = (const struct iphdr *)skb->data;
479 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
480 struct tcp_sock *tp;
481 const int type = icmp_hdr(skb)->type;
482 const int code = icmp_hdr(skb)->code;
483 struct sock *sk;
484 struct request_sock *fastopen;
485 u32 seq, snd_una;
486 int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
492 if (!sk) {
493 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
494 return -ENOENT;
495 }
496 if (sk->sk_state == TCP_TIME_WAIT) {
497 /* To increase the counter of ignored icmps for TCP-AO */
498 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
500 return 0;
501 }
502 seq = ntohl(th->seq);
503 if (sk->sk_state == TCP_NEW_SYN_RECV) {
504 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
505 type == ICMP_TIME_EXCEEDED ||
506 (type == ICMP_DEST_UNREACH &&
507 (code == ICMP_NET_UNREACH ||
508 code == ICMP_HOST_UNREACH)));
509 return 0;
510 }
511
512 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
513 sock_put(sk);
514 return 0;
515 }
516
517 bh_lock_sock(sk);
518 /* If too many ICMPs get dropped on busy
519 * servers this needs to be solved differently.
520 * We do take care of PMTU discovery (RFC1191) special case :
521 * we can receive locally generated ICMP messages while socket is held.
522 */
523 if (sock_owned_by_user(sk)) {
524 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
525 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
526 }
527 if (sk->sk_state == TCP_CLOSE)
528 goto out;
529
530 if (static_branch_unlikely(&ip4_min_ttl)) {
531 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
532 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
533 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
534 goto out;
535 }
536 }
537
538 tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
544 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
545 goto out;
546 }
547
548 switch (type) {
549 case ICMP_REDIRECT:
550 if (!sock_owned_by_user(sk))
551 do_redirect(skb, sk);
552 goto out;
553 case ICMP_SOURCE_QUENCH:
554 /* Just silently ignore these. */
555 goto out;
556 case ICMP_PARAMETERPROB:
557 err = EPROTO;
558 break;
559 case ICMP_DEST_UNREACH:
560 if (code > NR_ICMP_UNREACH)
561 goto out;
562
563 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
568 if (sk->sk_state == TCP_LISTEN)
569 goto out;
570
571 WRITE_ONCE(tp->mtu_info, info);
572 if (!sock_owned_by_user(sk)) {
573 tcp_v4_mtu_reduced(sk);
574 } else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
576 sock_hold(sk);
577 }
578 goto out;
579 }
580
581 err = icmp_err_convert[code].errno;
582 /* check if this ICMP message allows revert of backoff.
583 * (see RFC 6069)
584 */
585 if (!fastopen &&
586 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
587 tcp_ld_RTO_revert(sk, seq);
588 break;
589 case ICMP_TIME_EXCEEDED:
590 err = EHOSTUNREACH;
591 break;
592 default:
593 goto out;
594 }
595
596 switch (sk->sk_state) {
597 case TCP_SYN_SENT:
598 case TCP_SYN_RECV:
599 /* Only in fast or simultaneous open. If a fast open socket is
600 * already accepted it is treated as a connected one below.
601 */
602 if (fastopen && !fastopen->sk)
603 break;
604
		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
606
607 if (!sock_owned_by_user(sk)) {
608 WRITE_ONCE(sk->sk_err, err);
609
610 sk_error_report(sk);
611
612 tcp_done(sk);
613 } else {
614 WRITE_ONCE(sk->sk_err_soft, err);
615 }
616 goto out;
617 }
618
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */
634
635 if (!sock_owned_by_user(sk) &&
636 inet_test_bit(RECVERR, sk)) {
637 WRITE_ONCE(sk->sk_err, err);
638 sk_error_report(sk);
639 } else { /* Only an error on timeout */
640 WRITE_ONCE(sk->sk_err_soft, err);
641 }
642
643out:
644 bh_unlock_sock(sk);
645 sock_put(sk);
646 return 0;
647}
648
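/* Prepare an outgoing segment for checksum offload (CHECKSUM_PARTIAL): store
 * the pseudo-header sum in th->check and record csum_start/csum_offset so
 * that the NIC (or skb_checksum_help() as a fallback) can fold in the
 * payload checksum.
 */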
649void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
650{
651 struct tcphdr *th = tcp_hdr(skb);
652
	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
654 skb->csum_start = skb_transport_header(skb) - skb->head;
655 skb->csum_offset = offsetof(struct tcphdr, check);
656}
657
658/* This routine computes an IPv4 TCP checksum. */
659void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
660{
661 const struct inet_sock *inet = inet_sk(sk);
662
	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
664}
665EXPORT_SYMBOL(tcp_v4_send_check);
666
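/* Replies (RST/ACK) built on the stack below carry at most
 * MAX_TCP_OPTION_SPACE bytes of options; REPLY_OPTIONS_LEN is that space in
 * 32-bit words.  tcp_v4_ao_sign_reset() appends and signs a TCP-AO option
 * for an outgoing RST when the offending segment carried one; it returns
 * true if the RST must be dropped instead.
 */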
667#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
668
669static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
670 const struct tcp_ao_hdr *aoh,
671 struct ip_reply_arg *arg, struct tcphdr *reply,
672 __be32 reply_options[REPLY_OPTIONS_LEN])
673{
674#ifdef CONFIG_TCP_AO
675 int sdif = tcp_v4_sdif(skb);
676 int dif = inet_iif(skb);
677 int l3index = sdif ? dif : 0;
678 bool allocated_traffic_key;
679 struct tcp_ao_key *key;
680 char *traffic_key;
681 bool drop = true;
682 u32 ao_sne = 0;
683 u8 keyid;
684
685 rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
701 goto out;
702 drop = false;
703out:
704 rcu_read_unlock();
705 if (allocated_traffic_key)
		kfree(traffic_key);
707 return drop;
708#else
709 return true;
710#endif
711}
712
713/*
714 * This routine will send an RST to the other tcp.
715 *
 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 * for the reset?
 * Answer: if a packet caused the RST, it is not for a socket
 * existing in our system; if it is matched to a socket,
 * it is just a duplicate segment or a bug in the other side's TCP.
 * So we build the reply based only on the parameters that
 * arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
724 */
725
726static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
727{
728 const struct tcphdr *th = tcp_hdr(skb);
729 struct {
730 struct tcphdr th;
731 __be32 opt[REPLY_OPTIONS_LEN];
732 } rep;
733 const __u8 *md5_hash_location = NULL;
734 const struct tcp_ao_hdr *aoh;
735 struct ip_reply_arg arg;
736#ifdef CONFIG_TCP_MD5SIG
737 struct tcp_md5sig_key *key = NULL;
738 unsigned char newhash[16];
739 struct sock *sk1 = NULL;
740 int genhash;
741#endif
742 u64 transmit_time = 0;
743 struct sock *ctl_sk;
744 struct net *net;
745 u32 txhash = 0;
746
747 /* Never send a reset in response to a reset. */
748 if (th->rst)
749 return;
750
751 /* If sk not NULL, it means we did a successful lookup and incoming
752 * route had to be correct. prequeue might have dropped our dst.
753 */
754 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
755 return;
756
757 /* Swap the send and the receive. */
758 memset(&rep, 0, sizeof(rep));
759 rep.th.dest = th->source;
760 rep.th.source = th->dest;
761 rep.th.doff = sizeof(struct tcphdr) / 4;
762 rep.th.rst = 1;
763
764 if (th->ack) {
765 rep.th.seq = th->ack_seq;
766 } else {
767 rep.th.ack = 1;
768 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
769 skb->len - (th->doff << 2));
770 }
771
772 memset(&arg, 0, sizeof(arg));
773 arg.iov[0].iov_base = (unsigned char *)&rep;
774 arg.iov[0].iov_len = sizeof(rep.th);
775
	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
783 return;
784
785#ifdef CONFIG_TCP_MD5SIG
786 rcu_read_lock();
787 if (sk && sk_fullsock(sk)) {
788 const union tcp_md5_addr *addr;
789 int l3index;
790
791 /* sdif set, means packet ingressed via a device
792 * in an L3 domain and inet_iif is set to it.
793 */
794 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
795 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
796 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
797 } else if (md5_hash_location) {
798 const union tcp_md5_addr *addr;
799 int sdif = tcp_v4_sdif(skb);
800 int dif = inet_iif(skb);
801 int l3index;
802
		/* The active side is lost.  Try to find a listening socket
		 * through the source port, and then find the md5 key through
		 * that listening socket.  We do not lose any security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find, and no RST is generated if the hash does not
		 * match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
814 /* don't send rst if it can't find key */
815 if (!sk1)
816 goto out;
817
818 /* sdif set, means packet ingressed via a device
819 * in an L3 domain and dif is set to it.
820 */
821 l3index = sdif ? dif : 0;
822 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;
	}
833
834 if (key) {
835 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
836 (TCPOPT_NOP << 16) |
837 (TCPOPT_MD5SIG << 8) |
838 TCPOLEN_MD5SIG);
839 /* Update length and the length the header thinks exists */
840 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
841 rep.th.doff = arg.iov[0].iov_len / 4;
842
		tcp_v4_md5_hash_hdr((__u8 *)&rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
846 }
847#endif
848 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
849 if (rep.opt[0] == 0) {
850 __be32 mrst = mptcp_reset_option(skb);
851
852 if (mrst) {
853 rep.opt[0] = mrst;
854 arg.iov[0].iov_len += sizeof(mrst);
855 rep.th.doff = arg.iov[0].iov_len / 4;
856 }
857 }
858
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
862 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
863 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
864
865 /* When socket is gone, all binding information is lost.
866 * routing might fail in this case. No choice here, if we choose to force
867 * input interface, we will misroute in case of asymmetric route.
868 */
869 if (sk) {
870 arg.bound_dev_if = sk->sk_bound_dev_if;
871 if (sk_fullsock(sk))
872 trace_tcp_send_reset(sk, skb);
873 }
874
875 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
876 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
877
878 arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
883 if (sk) {
884 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
885 inet_twsk(sk)->tw_mark : sk->sk_mark;
886 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
887 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
888 transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
890 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
891 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
892 } else {
893 ctl_sk->sk_mark = 0;
894 ctl_sk->sk_priority = 0;
895 }
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
904 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
905 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
906 local_bh_enable();
907
908#ifdef CONFIG_TCP_MD5SIG
909out:
910 rcu_read_unlock();
911#endif
912}
913
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */
917
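/* Build a bare ACK on the stack and transmit it through the per-CPU
 * ipv4_tcp_sk control socket with ip_send_unicast_reply().  The optional
 * timestamp, MD5 or AO options are appended from @key; @seq, @ack and @win
 * are taken from the TIME-WAIT or request sock on whose behalf we answer.
 */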
918static void tcp_v4_send_ack(const struct sock *sk,
919 struct sk_buff *skb, u32 seq, u32 ack,
920 u32 win, u32 tsval, u32 tsecr, int oif,
921 struct tcp_key *key,
922 int reply_flags, u8 tos, u32 txhash)
923{
924 const struct tcphdr *th = tcp_hdr(skb);
925 struct {
926 struct tcphdr th;
927 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
928 } rep;
929 struct net *net = sock_net(sk);
930 struct ip_reply_arg arg;
931 struct sock *ctl_sk;
932 u64 transmit_time;
933
934 memset(&rep.th, 0, sizeof(struct tcphdr));
935 memset(&arg, 0, sizeof(arg));
936
937 arg.iov[0].iov_base = (unsigned char *)&rep;
938 arg.iov[0].iov_len = sizeof(rep.th);
939 if (tsecr) {
940 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
941 (TCPOPT_TIMESTAMP << 8) |
942 TCPOLEN_TIMESTAMP);
943 rep.opt[1] = htonl(tsval);
944 rep.opt[2] = htonl(tsecr);
945 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
946 }
947
948 /* Swap the send and the receive. */
949 rep.th.dest = th->source;
950 rep.th.source = th->dest;
951 rep.th.doff = arg.iov[0].iov_len / 4;
952 rep.th.seq = htonl(seq);
953 rep.th.ack_seq = htonl(ack);
954 rep.th.ack = 1;
955 rep.th.window = htons(win);
956
957#ifdef CONFIG_TCP_MD5SIG
958 if (tcp_key_is_md5(key)) {
959 int offset = (tsecr) ? 3 : 0;
960
961 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
962 (TCPOPT_NOP << 16) |
963 (TCPOPT_MD5SIG << 8) |
964 TCPOLEN_MD5SIG);
965 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
966 rep.th.doff = arg.iov[0].iov_len/4;
967
		tcp_v4_md5_hash_hdr((__u8 *)&rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
971 }
972#endif
973#ifdef CONFIG_TCP_AO
974 if (tcp_key_is_ao(key)) {
975 int offset = (tsecr) ? 3 : 0;
976
977 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
978 (tcp_ao_len(key->ao_key) << 16) |
979 (key->ao_key->sndid << 8) |
980 key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
989 }
990#endif
991 arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
1003 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1004 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1005 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1006 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1007 transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
1015 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1016 local_bh_enable();
1017}
1018
1019static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1020{
1021 struct inet_timewait_sock *tw = inet_twsk(sk);
1022 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1023 struct tcp_key key = {};
1024#ifdef CONFIG_TCP_AO
1025 struct tcp_ao_info *ao_info;
1026
1027 if (static_branch_unlikely(&tcp_ao_needed.key)) {
1028 /* FIXME: the segment to-be-acked is not verified yet */
1029 ao_info = rcu_dereference(tcptw->ao_info);
1030 if (ao_info) {
1031 const struct tcp_ao_hdr *aoh;
1032
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1040 }
1041 }
1042 if (key.ao_key) {
1043 struct tcp_ao_key *rnext_key;
1044
		key.traffic_key = snd_other_key(key.ao_key);
1046 key.sne = READ_ONCE(ao_info->snd_sne);
1047 rnext_key = READ_ONCE(ao_info->rnext_key);
1048 key.rcv_next = rnext_key->rcvid;
1049 key.type = TCP_KEY_AO;
1050#else
1051 if (0) {
1052#endif
1053#ifdef CONFIG_TCP_MD5SIG
1054 } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1055 key.md5_key = tcp_twsk_md5_key(tcptw);
1056 if (key.md5_key)
1057 key.type = TCP_KEY_MD5;
1058#endif
1059 }
1060
	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);
1070
1071 inet_twsk_put(tw);
1072}
1073
1074static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1075 struct request_sock *req)
1076{
1077 struct tcp_key key = {};
1078
1079 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1080 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1081 */
1082 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1083 tcp_sk(sk)->snd_nxt;
1084
1085#ifdef CONFIG_TCP_AO
1086 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1087 tcp_rsk_used_ao(req)) {
1088 const union tcp_md5_addr *addr;
1089 const struct tcp_ao_hdr *aoh;
1090 int l3index;
1091
1092 /* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1094 return;
1095 if (!aoh)
1096 return;
1097
1098 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1099 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1105 /* Matching key disappeared (user removed the key?)
1106 * let the handshake timeout.
1107 */
1108 if (!key.ao_key) {
1109 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1110 addr,
1111 ntohs(tcp_hdr(skb)->source),
1112 &ip_hdr(skb)->daddr,
1113 ntohs(tcp_hdr(skb)->dest));
1114 return;
1115 }
1116 }
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1124#else
1125 if (0) {
1126#endif
1127#ifdef CONFIG_TCP_MD5SIG
1128 } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1129 const union tcp_md5_addr *addr;
1130 int l3index;
1131
1132 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1133 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1134 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1135 if (key.md5_key)
1136 key.type = TCP_KEY_MD5;
1137#endif
1138 }
1139
1140 /* RFC 7323 2.3
1141 * The window field (SEG.WND) of every outgoing segment, with the
1142 * exception of <SYN> segments, MUST be right-shifted by
1143 * Rcv.Wind.Shift bits:
1144 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
1156}
1157
1158/*
1159 * Send a SYN-ACK after having received a SYN.
1160 * This still operates on a request_sock only, not on a big
1161 * socket.
1162 */
1163static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1164 struct flowi *fl,
1165 struct request_sock *req,
1166 struct tcp_fastopen_cookie *foc,
1167 enum tcp_synack_type synack_type,
1168 struct sk_buff *syn_skb)
1169{
	const struct inet_request_sock *ireq = inet_rsk(req);
1171 struct flowi4 fl4;
1172 int err = -1;
1173 struct sk_buff *skb;
1174 u8 tos;
1175
1176 /* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1178 return -1;
1179
1180 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1181
1182 if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1184
1185 tos = READ_ONCE(inet_sk(sk)->tos);
1186
1187 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1188 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1189 (tos & INET_ECN_MASK);
1190
		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1193 tos |= INET_ECN_ECT_0;
1194
1195 rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
1198 rcu_dereference(ireq->ireq_opt),
1199 tos);
1200 rcu_read_unlock();
1201 err = net_xmit_eval(err);
1202 }
1203
1204 return err;
1205}
1206
1207/*
1208 * IPv4 request_sock destructor.
1209 */
1210static void tcp_v4_reqsk_destructor(struct request_sock *req)
1211{
1212 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1213}
1214
1215#ifdef CONFIG_TCP_MD5SIG
1216/*
1217 * RFC2385 MD5 checksumming requires a mapping of
1218 * IP address->MD5 Key.
1219 * We need to maintain these in the sk structure.
1220 */
1221
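/* tcp_md5_needed is a deferred static key: the MD5 option code stays patched
 * out of the fast path until the first key is installed, and disabling it
 * again (when the last key goes away) is deferred by HZ so the branch does
 * not flip back and forth rapidly.
 */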
1222DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1223EXPORT_SYMBOL(tcp_md5_needed);
1224
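/* Precedence between two keys matching the same peer: a key bound to an L3
 * (VRF) device always beats one that is not, otherwise the longer prefix
 * wins.
 */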
1225static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1226{
1227 if (!old)
1228 return true;
1229
1230 /* l3index always overrides non-l3index */
1231 if (old->l3index && new->l3index == 0)
1232 return false;
1233 if (old->l3index == 0 && new->l3index)
1234 return true;
1235
1236 return old->prefixlen < new->prefixlen;
1237}
1238
1239/* Find the Key structure for an address. */
1240struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1241 const union tcp_md5_addr *addr,
1242 int family, bool any_l3index)
1243{
1244 const struct tcp_sock *tp = tcp_sk(sk);
1245 struct tcp_md5sig_key *key;
1246 const struct tcp_md5sig_info *md5sig;
1247 __be32 mask;
1248 struct tcp_md5sig_key *best_match = NULL;
1249 bool match;
1250
1251 /* caller either holds rcu_read_lock() or socket lock */
1252 md5sig = rcu_dereference_check(tp->md5sig_info,
1253 lockdep_sock_is_held(sk));
1254 if (!md5sig)
1255 return NULL;
1256
1257 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1258 lockdep_sock_is_held(sk)) {
1259 if (key->family != family)
1260 continue;
1261 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1262 key->l3index != l3index)
1263 continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
1278 best_match = key;
1279 }
1280 return best_match;
1281}
1282EXPORT_SYMBOL(__tcp_md5_do_lookup);
1283
1284static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1285 const union tcp_md5_addr *addr,
1286 int family, u8 prefixlen,
1287 int l3index, u8 flags)
1288{
1289 const struct tcp_sock *tp = tcp_sk(sk);
1290 struct tcp_md5sig_key *key;
1291 unsigned int size = sizeof(struct in_addr);
1292 const struct tcp_md5sig_info *md5sig;
1293
1294 /* caller either holds rcu_read_lock() or socket lock */
1295 md5sig = rcu_dereference_check(tp->md5sig_info,
1296 lockdep_sock_is_held(sk));
1297 if (!md5sig)
1298 return NULL;
1299#if IS_ENABLED(CONFIG_IPV6)
1300 if (family == AF_INET6)
1301 size = sizeof(struct in6_addr);
1302#endif
1303 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1304 lockdep_sock_is_held(sk)) {
1305 if (key->family != family)
1306 continue;
1307 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1308 continue;
1309 if (key->l3index != l3index)
1310 continue;
		if (!memcmp(&key->addr, addr, size) &&
1312 key->prefixlen == prefixlen)
1313 return key;
1314 }
1315 return NULL;
1316}
1317
1318struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1319 const struct sock *addr_sk)
1320{
1321 const union tcp_md5_addr *addr;
1322 int l3index;
1323
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
1326 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1327 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1328}
1329EXPORT_SYMBOL(tcp_v4_md5_lookup);
1330
1331static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1332{
1333 struct tcp_sock *tp = tcp_sk(sk);
1334 struct tcp_md5sig_info *md5sig;
1335
	md5sig = kmalloc(sizeof(*md5sig), gfp);
1337 if (!md5sig)
1338 return -ENOMEM;
1339
1340 sk_gso_disable(sk);
1341 INIT_HLIST_HEAD(&md5sig->head);
1342 rcu_assign_pointer(tp->md5sig_info, md5sig);
1343 return 0;
1344}
1345
1346/* This can be called on a newly created socket, from other files */
1347static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1348 int family, u8 prefixlen, int l3index, u8 flags,
1349 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1350{
1351 /* Add Key to the list */
1352 struct tcp_md5sig_key *key;
1353 struct tcp_sock *tp = tcp_sk(sk);
1354 struct tcp_md5sig_info *md5sig;
1355
1356 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1357 if (key) {
1358 /* Pre-existing entry - just update that one.
1359 * Note that the key might be used concurrently.
1360 * data_race() is telling kcsan that we do not care of
1361 * key mismatches, since changing MD5 key on live flows
1362 * can lead to packet drops.
1363 */
1364 data_race(memcpy(key->key, newkey, newkeylen));
1365
1366 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1367 * Also note that a reader could catch new key->keylen value
1368 * but old key->key[], this is the reason we use __GFP_ZERO
1369 * at sock_kmalloc() time below these lines.
1370 */
1371 WRITE_ONCE(key->keylen, newkeylen);
1372
1373 return 0;
1374 }
1375
1376 md5sig = rcu_dereference_protected(tp->md5sig_info,
1377 lockdep_sock_is_held(sk));
1378
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1380 if (!key)
1381 return -ENOMEM;
1382
1383 memcpy(key->key, newkey, newkeylen);
1384 key->keylen = newkeylen;
1385 key->family = family;
1386 key->prefixlen = prefixlen;
1387 key->l3index = l3index;
1388 key->flags = flags;
1389 memcpy(&key->addr, addr,
1390 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1391 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
1393 return 0;
1394}
1395
1396int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1397 int family, u8 prefixlen, int l3index, u8 flags,
1398 const u8 *newkey, u8 newkeylen)
1399{
1400 struct tcp_sock *tp = tcp_sk(sk);
1401
1402 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1403 if (tcp_md5_alloc_sigpool())
1404 return -ENOMEM;
1405
1406 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1407 tcp_md5_release_sigpool();
1408 return -ENOMEM;
1409 }
1410
1411 if (!static_branch_inc(&tcp_md5_needed.key)) {
1412 struct tcp_md5sig_info *md5sig;
1413
1414 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1415 rcu_assign_pointer(tp->md5sig_info, NULL);
1416 kfree_rcu(md5sig, rcu);
1417 tcp_md5_release_sigpool();
1418 return -EUSERS;
1419 }
1420 }
1421
1422 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1423 newkey, newkeylen, GFP_KERNEL);
1424}
1425EXPORT_SYMBOL(tcp_md5_do_add);
1426
1427int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1428 int family, u8 prefixlen, int l3index,
1429 struct tcp_md5sig_key *key)
1430{
1431 struct tcp_sock *tp = tcp_sk(sk);
1432
1433 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1434 tcp_md5_add_sigpool();
1435
		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1437 tcp_md5_release_sigpool();
1438 return -ENOMEM;
1439 }
1440
		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1442 struct tcp_md5sig_info *md5sig;
1443
1444 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1445 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1446 rcu_assign_pointer(tp->md5sig_info, NULL);
1447 kfree_rcu(md5sig, rcu);
1448 tcp_md5_release_sigpool();
1449 return -EUSERS;
1450 }
1451 }
1452
	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
1456}
1457EXPORT_SYMBOL(tcp_md5_key_copy);
1458
1459int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1460 u8 prefixlen, int l3index, u8 flags)
1461{
1462 struct tcp_md5sig_key *key;
1463
1464 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1465 if (!key)
1466 return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1469 kfree_rcu(key, rcu);
1470 return 0;
1471}
1472EXPORT_SYMBOL(tcp_md5_do_del);
1473
1474void tcp_clear_md5_list(struct sock *sk)
1475{
1476 struct tcp_sock *tp = tcp_sk(sk);
1477 struct tcp_md5sig_key *key;
1478 struct hlist_node *n;
1479 struct tcp_md5sig_info *md5sig;
1480
1481 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1482
1483 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1486 kfree_rcu(key, rcu);
1487 }
1488}
1489
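/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT): copy in struct tcp_md5sig,
 * validate the address family, optional prefix length and ifindex, then
 * delete the key (tcpm_keylen == 0) or add/replace it.  Adding is refused if
 * the peer already has a matching TCP-AO key.
 */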
1490static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1491 sockptr_t optval, int optlen)
1492{
1493 struct tcp_md5sig cmd;
1494 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1495 const union tcp_md5_addr *addr;
1496 u8 prefixlen = 32;
1497 int l3index = 0;
1498 bool l3flag;
1499 u8 flags;
1500
1501 if (optlen < sizeof(cmd))
1502 return -EINVAL;
1503
	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1505 return -EFAULT;
1506
1507 if (sin->sin_family != AF_INET)
1508 return -EINVAL;
1509
1510 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1511 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1512
1513 if (optname == TCP_MD5SIG_EXT &&
1514 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1515 prefixlen = cmd.tcpm_prefixlen;
1516 if (prefixlen > 32)
1517 return -EINVAL;
1518 }
1519
1520 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1521 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1522 struct net_device *dev;
1523
1524 rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1526 if (dev && netif_is_l3_master(dev))
1527 l3index = dev->ifindex;
1528
1529 rcu_read_unlock();
1530
1531 /* ok to reference set/not set outside of rcu;
1532 * right now device MUST be an L3 master
1533 */
1534 if (!dev || !l3index)
1535 return -EINVAL;
1536 }
1537
1538 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1539
1540 if (!cmd.tcpm_keylen)
1541 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1542
1543 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1544 return -EINVAL;
1545
1546 /* Don't allow keys for peers that have a matching TCP-AO key.
1547 * See the comment in tcp_ao_add_cmd()
1548 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1550 return -EKEYREJECTED;
1551
1552 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1553 cmd.tcpm_key, cmd.tcpm_keylen);
1554}
1555
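/* RFC 2385: the MD5 digest covers a pseudo-header (saddr, daddr, zero pad,
 * IPPROTO_TCP, segment length), the TCP header with its checksum field
 * zeroed, the payload, and finally the key itself.  The helpers below feed
 * these pieces to the per-CPU sigpool ahash request.
 */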
1556static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1557 __be32 daddr, __be32 saddr,
1558 const struct tcphdr *th, int nbytes)
1559{
1560 struct tcp4_pseudohdr *bp;
1561 struct scatterlist sg;
1562 struct tcphdr *_th;
1563
1564 bp = hp->scratch;
1565 bp->saddr = saddr;
1566 bp->daddr = daddr;
1567 bp->pad = 0;
1568 bp->protocol = IPPROTO_TCP;
1569 bp->len = cpu_to_be16(nbytes);
1570
1571 _th = (struct tcphdr *)(bp + 1);
1572 memcpy(_th, th, sizeof(*th));
1573 _th->check = 0;
1574
1575 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
1579}
1580
1581static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1582 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1583{
1584 struct tcp_sigpool hp;
1585
	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
1604clear_hash_nostart:
1605 memset(md5_hash, 0, 16);
1606 return 1;
1607}
1608
1609int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1610 const struct sock *sk,
1611 const struct sk_buff *skb)
1612{
1613 const struct tcphdr *th = tcp_hdr(skb);
1614 struct tcp_sigpool hp;
1615 __be32 saddr, daddr;
1616
1617 if (sk) { /* valid for establish/request sockets */
1618 saddr = sk->sk_rcv_saddr;
1619 daddr = sk->sk_daddr;
1620 } else {
1621 const struct iphdr *iph = ip_hdr(skb);
1622 saddr = iph->saddr;
1623 daddr = iph->daddr;
1624 }
1625
	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
1647clear_hash_nostart:
1648 memset(md5_hash, 0, 16);
1649 return 1;
1650}
1651EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1652
1653#endif
1654
1655static void tcp_v4_init_req(struct request_sock *req,
1656 const struct sock *sk_listener,
1657 struct sk_buff *skb)
1658{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1665}
1666
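/* .route_req callback: fill in the IPv4 side of the request sock from the
 * SYN, run the LSM hook, then look up the route we will use for the SYN-ACK.
 */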
1667static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1668 struct sk_buff *skb,
1669 struct flowi *fl,
1670 struct request_sock *req)
1671{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
1678}
1679
1680struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1681 .family = PF_INET,
1682 .obj_size = sizeof(struct tcp_request_sock),
1683 .rtx_syn_ack = tcp_rtx_synack,
1684 .send_ack = tcp_v4_reqsk_send_ack,
1685 .destructor = tcp_v4_reqsk_destructor,
1686 .send_reset = tcp_v4_send_reset,
1687 .syn_ack_timeout = tcp_syn_ack_timeout,
1688};
1689
1690const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1691 .mss_clamp = TCP_MSS_DEFAULT,
1692#ifdef CONFIG_TCP_MD5SIG
1693 .req_md5_lookup = tcp_v4_md5_lookup,
1694 .calc_md5_hash = tcp_v4_md5_hash_skb,
1695#endif
1696#ifdef CONFIG_TCP_AO
1697 .ao_lookup = tcp_v4_ao_lookup_rsk,
1698 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1699 .ao_synack_hash = tcp_v4_ao_synack_hash,
1700#endif
1701#ifdef CONFIG_SYN_COOKIES
1702 .cookie_init_seq = cookie_v4_init_sequence,
1703#endif
1704 .route_req = tcp_v4_route_req,
1705 .init_seq = tcp_v4_init_seq,
1706 .init_ts_off = tcp_v4_init_ts_off,
1707 .send_synack = tcp_v4_send_synack,
1708};
1709
1710int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1711{
	/* Never answer SYNs sent to a broadcast or multicast address */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);
1718
1719drop:
1720 tcp_listendrop(sk);
1721 return 0;
1722}
1723EXPORT_SYMBOL(tcp_v4_conn_request);
1724
1725
1726/*
 * The three way handshake has completed - we got a valid final ACK -
 * now create the new socket.
1729 */
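/* *own_req tells the caller whether the new child was actually inserted into
 * the established hash; if a duplicate is found there (which can only happen
 * in the syncookie case, where no request sock was queued), the child is
 * dropped again below.
 */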
1730struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1731 struct request_sock *req,
1732 struct dst_entry *dst,
1733 struct request_sock *req_unhash,
1734 bool *own_req)
1735{
1736 struct inet_request_sock *ireq;
1737 bool found_dup_sk = false;
1738 struct inet_sock *newinet;
1739 struct tcp_sock *newtp;
1740 struct sock *newsk;
1741#ifdef CONFIG_TCP_MD5SIG
1742 const union tcp_md5_addr *addr;
1743 struct tcp_md5sig_key *key;
1744 int l3index;
1745#endif
1746 struct ip_options_rcu *inet_opt;
1747
1748 if (sk_acceptq_is_full(sk))
1749 goto exit_overflow;
1750
1751 newsk = tcp_create_openreq_child(sk, req, skb);
1752 if (!newsk)
1753 goto exit_nonewsk;
1754
1755 newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1763 newsk->sk_bound_dev_if = ireq->ir_iif;
1764 newinet->inet_saddr = ireq->ir_loc_addr;
1765 inet_opt = rcu_dereference(ireq->ireq_opt);
1766 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1767 newinet->mc_index = inet_iif(skb);
1768 newinet->mc_ttl = ip_hdr(skb)->ttl;
1769 newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());
1774
1775 /* Set ToS of the new socket based upon the value of incoming SYN.
1776 * ECT bits are set later in tcp_init_transfer().
1777 */
1778 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1779 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1780
1781 if (!dst) {
1782 dst = inet_csk_route_child_sock(sk, newsk, req);
1783 if (!dst)
1784 goto put_and_exit;
1785 } else {
1786 /* syncookie case : see end of cookie_v4_check() */
1787 }
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);
1796
1797#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1799 /* Copy over the MD5 key from the original socket */
1800 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1801 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1802 if (key && !tcp_rsk_used_ao(req)) {
1803 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1804 goto put_and_exit;
		sk_gso_disable(newsk);
1806 }
1807#endif
1808#ifdef CONFIG_TCP_AO
1809 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1810 goto put_and_exit; /* OOM, release back memory */
1811#endif
1812
	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
1819 ireq->ireq_opt = NULL;
1820 } else {
1821 newinet->inet_opt = NULL;
1822
1823 if (!req_unhash && found_dup_sk) {
1824 /* This code path should only be executed in the
1825 * syncookie case only
1826 */
1827 bh_unlock_sock(newsk);
			sock_put(newsk);
1829 newsk = NULL;
1830 }
1831 }
1832 return newsk;
1833
1834exit_overflow:
1835 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1836exit_nonewsk:
1837 dst_release(dst);
1838exit:
1839 tcp_listendrop(sk);
1840 return NULL;
1841put_and_exit:
1842 newinet->inet_opt = NULL;
1843 inet_csk_prepare_forced_close(sk: newsk);
1844 tcp_done(sk: newsk);
1845 goto exit;
1846}
1847EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
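
/*
 * Illustration (not part of the original source): with the
 * net.ipv4.tcp_reflect_tos sysctl enabled, a SYN arriving with tos 0xb9
 * (DSCP EF with ECT(1)) gives the child socket tos 0xb9 & ~INET_ECN_MASK,
 * i.e. 0xb8: the two ECN bits are cleared here and set up separately in
 * tcp_init_transfer(), as the comment in the function above notes.
 */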
1848
1849static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1850{
1851#ifdef CONFIG_SYN_COOKIES
1852 const struct tcphdr *th = tcp_hdr(skb);
1853
1854 if (!th->syn)
1855 sk = cookie_v4_check(sk, skb);
1856#endif
1857 return sk;
1858}
1859
1860u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1861 struct tcphdr *th, u32 *cookie)
1862{
1863 u16 mss = 0;
1864#ifdef CONFIG_SYN_COOKIES
1865 mss = tcp_get_syncookie_mss(rsk_ops: &tcp_request_sock_ops,
1866 af_ops: &tcp_request_sock_ipv4_ops, sk, th);
1867 if (mss) {
1868 *cookie = __cookie_v4_init_sequence(iph, th, mssp: &mss);
1869 tcp_synq_overflow(sk);
1870 }
1871#endif
1872 return mss;
1873}
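
/*
 * Syncookie behaviour is governed by the net.ipv4.tcp_syncookies sysctl
 * (0 = never, 1 = only when the SYN queue overflows, 2 = always).
 * A minimal userspace sketch for reading it follows; it is illustrative
 * only and not part of this file, and the helper name is made up:
 *
 *	#include <stdio.h>
 *
 *	static int read_tcp_syncookies(void)
 *	{
 *		int val = -1;
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");
 *
 *		if (!f)
 *			return -1;
 *		if (fscanf(f, "%d", &val) != 1)
 *			val = -1;
 *		fclose(f);
 *		return val;
 *	}
 */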
1874
1875INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1876 u32));
1877/* The socket must have its spinlock held when we get
1878 * here, unless it is a TCP_LISTEN socket.
1879 *
1880 * We have a potential double-lock case here, so even when
1881 * doing backlog processing we use the BH locking scheme.
1882 * This is because we cannot sleep with the original spinlock
1883 * held.
1884 */
1885int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1886{
1887 enum skb_drop_reason reason;
1888 struct sock *rsk;
1889
1890 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1891 struct dst_entry *dst;
1892
1893 dst = rcu_dereference_protected(sk->sk_rx_dst,
1894 lockdep_sock_is_held(sk));
1895
1896 sock_rps_save_rxhash(sk, skb);
1897 sk_mark_napi_id(sk, skb);
1898 if (dst) {
1899 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1900 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1901 dst, 0)) {
1902 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1903 dst_release(dst);
1904 }
1905 }
1906 tcp_rcv_established(sk, skb);
1907 return 0;
1908 }
1909
1910 if (tcp_checksum_complete(skb))
1911 goto csum_err;
1912
1913 if (sk->sk_state == TCP_LISTEN) {
1914 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1915
1916 if (!nsk)
1917 return 0;
1918 if (nsk != sk) {
1919 reason = tcp_child_process(parent: sk, child: nsk, skb);
1920 if (reason) {
1921 rsk = nsk;
1922 goto reset;
1923 }
1924 return 0;
1925 }
1926 } else
1927 sock_rps_save_rxhash(sk, skb);
1928
1929 reason = tcp_rcv_state_process(sk, skb);
1930 if (reason) {
1931 rsk = sk;
1932 goto reset;
1933 }
1934 return 0;
1935
1936reset:
1937 tcp_v4_send_reset(sk: rsk, skb);
1938discard:
1939 kfree_skb_reason(skb, reason);
1940 /* Be careful here. If this function gets more complicated and
1941 * gcc suffers from register pressure on the x86, sk (in %ebx)
1942 * might be destroyed here. This current version compiles correctly,
1943 * but you have been warned.
1944 */
1945 return 0;
1946
1947csum_err:
1948 reason = SKB_DROP_REASON_TCP_CSUM;
1949 trace_tcp_bad_csum(skb);
1950 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1951 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1952 goto discard;
1953}
1954EXPORT_SYMBOL(tcp_v4_do_rcv);
1955
1956int tcp_v4_early_demux(struct sk_buff *skb)
1957{
1958 struct net *net = dev_net(dev: skb->dev);
1959 const struct iphdr *iph;
1960 const struct tcphdr *th;
1961 struct sock *sk;
1962
1963 if (skb->pkt_type != PACKET_HOST)
1964 return 0;
1965
1966 if (!pskb_may_pull(skb, len: skb_transport_offset(skb) + sizeof(struct tcphdr)))
1967 return 0;
1968
1969 iph = ip_hdr(skb);
1970 th = tcp_hdr(skb);
1971
1972 if (th->doff < sizeof(struct tcphdr) / 4)
1973 return 0;
1974
1975 sk = __inet_lookup_established(net, hashinfo: net->ipv4.tcp_death_row.hashinfo,
1976 saddr: iph->saddr, sport: th->source,
1977 daddr: iph->daddr, ntohs(th->dest),
1978 dif: skb->skb_iif, sdif: inet_sdif(skb));
1979 if (sk) {
1980 skb->sk = sk;
1981 skb->destructor = sock_edemux;
1982 if (sk_fullsock(sk)) {
1983 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1984
1985 if (dst)
1986 dst = dst_check(dst, cookie: 0);
1987 if (dst &&
1988 sk->sk_rx_dst_ifindex == skb->skb_iif)
1989 skb_dst_set_noref(skb, dst);
1990 }
1991 }
1992 return 0;
1993}
1994
1995bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1996 enum skb_drop_reason *reason)
1997{
1998 u32 limit, tail_gso_size, tail_gso_segs;
1999 struct skb_shared_info *shinfo;
2000 const struct tcphdr *th;
2001 struct tcphdr *thtail;
2002 struct sk_buff *tail;
2003 unsigned int hdrlen;
2004 bool fragstolen;
2005 u32 gso_segs;
2006 u32 gso_size;
2007 int delta;
2008
2009 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2010 * we can fix skb->truesize to its real value to avoid future drops.
2011 * This is valid because skb is not yet charged to the socket.
2012 * It has been observed that pure SACK packets were sometimes dropped
2013 * (when built by drivers without the copybreak feature).
2014 */
2015 skb_condense(skb);
2016
2017 skb_dst_drop(skb);
2018
2019 if (unlikely(tcp_checksum_complete(skb))) {
2020 bh_unlock_sock(sk);
2021 trace_tcp_bad_csum(skb);
2022 *reason = SKB_DROP_REASON_TCP_CSUM;
2023 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2024 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2025 return true;
2026 }
2027
2028 /* Attempt coalescing to last skb in backlog, even if we are
2029 * above the limits.
2030 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2031 */
2032 th = (const struct tcphdr *)skb->data;
2033 hdrlen = th->doff * 4;
2034
2035 tail = sk->sk_backlog.tail;
2036 if (!tail)
2037 goto no_coalesce;
2038 thtail = (struct tcphdr *)tail->data;
2039
2040 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2041 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2042 ((TCP_SKB_CB(tail)->tcp_flags |
2043 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2044 !((TCP_SKB_CB(tail)->tcp_flags &
2045 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2046 ((TCP_SKB_CB(tail)->tcp_flags ^
2047 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2048#ifdef CONFIG_TLS_DEVICE
2049 tail->decrypted != skb->decrypted ||
2050#endif
2051 !mptcp_skb_can_collapse(to: tail, from: skb) ||
2052 thtail->doff != th->doff ||
2053 memcmp(p: thtail + 1, q: th + 1, size: hdrlen - sizeof(*th)))
2054 goto no_coalesce;
2055
2056 __skb_pull(skb, len: hdrlen);
2057
2058 shinfo = skb_shinfo(skb);
2059 gso_size = shinfo->gso_size ?: skb->len;
2060 gso_segs = shinfo->gso_segs ?: 1;
2061
2062 shinfo = skb_shinfo(tail);
2063 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2064 tail_gso_segs = shinfo->gso_segs ?: 1;
2065
2066 if (skb_try_coalesce(to: tail, from: skb, fragstolen: &fragstolen, delta_truesize: &delta)) {
2067 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2068
2069 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2070 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2071 thtail->window = th->window;
2072 }
2073
2074 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2075 * thtail->fin, so that the fast path in tcp_rcv_established()
2076 * is not entered if we append a packet with a FIN.
2077 * SYN, RST, URG are not present.
2078 * ACK is set on both packets.
2079 * PSH : we do not really care in TCP stack,
2080 * at least for 'GRO' packets.
2081 */
2082 thtail->fin |= th->fin;
2083 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2084
2085 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2086 TCP_SKB_CB(tail)->has_rxtstamp = true;
2087 tail->tstamp = skb->tstamp;
2088 skb_hwtstamps(skb: tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2089 }
2090
2091 /* Not as strict as GRO. We only need to carry mss max value */
2092 shinfo->gso_size = max(gso_size, tail_gso_size);
2093 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2094
2095 sk->sk_backlog.len += delta;
2096 __NET_INC_STATS(sock_net(sk),
2097 LINUX_MIB_TCPBACKLOGCOALESCE);
2098 kfree_skb_partial(skb, head_stolen: fragstolen);
2099 return false;
2100 }
2101 __skb_push(skb, len: hdrlen);
2102
2103no_coalesce:
2104 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
2105
2106 /* Only the socket owner can try to collapse/prune rx queues
2107 * to reduce memory overhead, so add a little headroom here.
2108 * Only a few socket backlogs are likely to be non-empty at the same time.
2109 */
2110 limit += 64 * 1024;
2111
2112 if (unlikely(sk_add_backlog(sk, skb, limit))) {
2113 bh_unlock_sock(sk);
2114 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2115 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2116 return true;
2117 }
2118 return false;
2119}
2120EXPORT_SYMBOL(tcp_add_backlog);
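
/*
 * Illustration with hypothetical numbers (not taken from this file): with
 * sk_rcvbuf = 131072 and sk_sndbuf = 46080, the limit computed above is
 * 131072 + 46080 / 2 + 65536 = 219648 bytes of skb truesize; once the
 * backlog already holds that much, further segments are dropped with
 * SKB_DROP_REASON_SOCKET_BACKLOG.
 */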
2121
2122int tcp_filter(struct sock *sk, struct sk_buff *skb)
2123{
2124 struct tcphdr *th = (struct tcphdr *)skb->data;
2125
2126 return sk_filter_trim_cap(sk, skb, cap: th->doff * 4);
2127}
2128EXPORT_SYMBOL(tcp_filter);
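
/*
 * The filter consulted by tcp_filter() is whatever the owner attached with
 * SO_ATTACH_FILTER (or SO_ATTACH_BPF): a return value of 0 drops the
 * segment, smaller returns trim it, and the th->doff * 4 cap keeps at
 * least the TCP header. A minimal userspace sketch attaching an
 * accept-everything classic BPF filter (illustrative only, not part of
 * this file; the helper name is made up):
 *
 *	#include <linux/filter.h>
 *	#include <sys/socket.h>
 *
 *	static int attach_accept_all_filter(int fd)
 *	{
 *		struct sock_filter code[] = {
 *			BPF_STMT(BPF_RET | BPF_K, 0x7fffffff),
 *		};
 *		struct sock_fprog prog = {
 *			.len = sizeof(code) / sizeof(code[0]),
 *			.filter = code,
 *		};
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *				  &prog, sizeof(prog));
 *	}
 */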
2129
2130static void tcp_v4_restore_cb(struct sk_buff *skb)
2131{
2132 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2133 sizeof(struct inet_skb_parm));
2134}
2135
2136static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2137 const struct tcphdr *th)
2138{
2139 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2140 * barrier() makes sure the compiler won't play aliasing games.
2141 */
2142 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2143 sizeof(struct inet_skb_parm));
2144 barrier();
2145
2146 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2147 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2148 skb->len - th->doff * 4);
2149 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2150 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2151 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
2152 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2153 TCP_SKB_CB(skb)->sacked = 0;
2154 TCP_SKB_CB(skb)->has_rxtstamp =
2155 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2156}
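
/*
 * Illustration with a hypothetical segment (not from this file): for
 * seq = 1000, a 20-byte header (doff = 5), 100 bytes of payload and FIN
 * set, skb->len is 120 at this point, so
 * end_seq = 1000 + 0 + 1 + 120 - 20 = 1101; the FIN consumes one extra
 * sequence number after the last payload byte.
 */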
2157
2158/*
2159 * From tcp_input.c
2160 */
2161
2162int tcp_v4_rcv(struct sk_buff *skb)
2163{
2164 struct net *net = dev_net(dev: skb->dev);
2165 enum skb_drop_reason drop_reason;
2166 int sdif = inet_sdif(skb);
2167 int dif = inet_iif(skb);
2168 const struct iphdr *iph;
2169 const struct tcphdr *th;
2170 bool refcounted;
2171 struct sock *sk;
2172 int ret;
2173
2174 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2175 if (skb->pkt_type != PACKET_HOST)
2176 goto discard_it;
2177
2178 /* Count it even if it's bad */
2179 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2180
2181 if (!pskb_may_pull(skb, len: sizeof(struct tcphdr)))
2182 goto discard_it;
2183
2184 th = (const struct tcphdr *)skb->data;
2185
2186 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2187 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2188 goto bad_packet;
2189 }
2190 if (!pskb_may_pull(skb, len: th->doff * 4))
2191 goto discard_it;
2192
2193 /* An explanation is required here.
2194 * Packet length and doff are validated later by header prediction,
2195 * provided the th->doff==0 case has been eliminated above.
2196 * So, we defer those checks. */
2197
2198 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2199 goto csum_error;
2200
2201 th = (const struct tcphdr *)skb->data;
2202 iph = ip_hdr(skb);
2203lookup:
2204 sk = __inet_lookup_skb(hashinfo: net->ipv4.tcp_death_row.hashinfo,
2205 skb, doff: __tcp_hdrlen(th), sport: th->source,
2206 dport: th->dest, sdif, refcounted: &refcounted);
2207 if (!sk)
2208 goto no_tcp_socket;
2209
2210process:
2211 if (sk->sk_state == TCP_TIME_WAIT)
2212 goto do_time_wait;
2213
2214 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2215 struct request_sock *req = inet_reqsk(sk);
2216 bool req_stolen = false;
2217 struct sock *nsk;
2218
2219 sk = req->rsk_listener;
2220 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb))
2221 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2222 else
2223 drop_reason = tcp_inbound_hash(sk, req, skb,
2224 saddr: &iph->saddr, daddr: &iph->daddr,
2225 AF_INET, dif, sdif);
2226 if (unlikely(drop_reason)) {
2227 sk_drops_add(sk, skb);
2228 reqsk_put(req);
2229 goto discard_it;
2230 }
2231 if (tcp_checksum_complete(skb)) {
2232 reqsk_put(req);
2233 goto csum_error;
2234 }
2235 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2236 nsk = reuseport_migrate_sock(sk, migrating_sk: req_to_sk(req), skb);
2237 if (!nsk) {
2238 inet_csk_reqsk_queue_drop_and_put(sk, req);
2239 goto lookup;
2240 }
2241 sk = nsk;
2242 /* reuseport_migrate_sock() has already taken one sk_refcnt reference
2243 * before returning.
2244 */
2245 } else {
2246 /* We own a reference on the listener, increase it again
2247 * as we might lose it too soon.
2248 */
2249 sock_hold(sk);
2250 }
2251 refcounted = true;
2252 nsk = NULL;
2253 if (!tcp_filter(sk, skb)) {
2254 th = (const struct tcphdr *)skb->data;
2255 iph = ip_hdr(skb);
2256 tcp_v4_fill_cb(skb, iph, th);
2257 nsk = tcp_check_req(sk, skb, req, fastopen: false, lost_race: &req_stolen);
2258 } else {
2259 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2260 }
2261 if (!nsk) {
2262 reqsk_put(req);
2263 if (req_stolen) {
2264 /* Another cpu got exclusive access to req
2265 * and created a full blown socket.
2266 * Try to feed this packet to this socket
2267 * instead of discarding it.
2268 */
2269 tcp_v4_restore_cb(skb);
2270 sock_put(sk);
2271 goto lookup;
2272 }
2273 goto discard_and_relse;
2274 }
2275 nf_reset_ct(skb);
2276 if (nsk == sk) {
2277 reqsk_put(req);
2278 tcp_v4_restore_cb(skb);
2279 } else {
2280 drop_reason = tcp_child_process(parent: sk, child: nsk, skb);
2281 if (drop_reason) {
2282 tcp_v4_send_reset(sk: nsk, skb);
2283 goto discard_and_relse;
2284 }
2285 sock_put(sk);
2286 return 0;
2287 }
2288 }
2289
2290 if (static_branch_unlikely(&ip4_min_ttl)) {
2291 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2292 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2293 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2294 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2295 goto discard_and_relse;
2296 }
2297 }
2298
2299 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb)) {
2300 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2301 goto discard_and_relse;
2302 }
2303
2304 drop_reason = tcp_inbound_hash(sk, NULL, skb, saddr: &iph->saddr, daddr: &iph->daddr,
2305 AF_INET, dif, sdif);
2306 if (drop_reason)
2307 goto discard_and_relse;
2308
2309 nf_reset_ct(skb);
2310
2311 if (tcp_filter(sk, skb)) {
2312 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2313 goto discard_and_relse;
2314 }
2315 th = (const struct tcphdr *)skb->data;
2316 iph = ip_hdr(skb);
2317 tcp_v4_fill_cb(skb, iph, th);
2318
2319 skb->dev = NULL;
2320
2321 if (sk->sk_state == TCP_LISTEN) {
2322 ret = tcp_v4_do_rcv(sk, skb);
2323 goto put_and_return;
2324 }
2325
2326 sk_incoming_cpu_update(sk);
2327
2328 bh_lock_sock_nested(sk);
2329 tcp_segs_in(tcp_sk(sk), skb);
2330 ret = 0;
2331 if (!sock_owned_by_user(sk)) {
2332 ret = tcp_v4_do_rcv(sk, skb);
2333 } else {
2334 if (tcp_add_backlog(sk, skb, &drop_reason))
2335 goto discard_and_relse;
2336 }
2337 bh_unlock_sock(sk);
2338
2339put_and_return:
2340 if (refcounted)
2341 sock_put(sk);
2342
2343 return ret;
2344
2345no_tcp_socket:
2346 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2347 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb))
2348 goto discard_it;
2349
2350 tcp_v4_fill_cb(skb, iph, th);
2351
2352 if (tcp_checksum_complete(skb)) {
2353csum_error:
2354 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2355 trace_tcp_bad_csum(skb);
2356 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2357bad_packet:
2358 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2359 } else {
2360 tcp_v4_send_reset(NULL, skb);
2361 }
2362
2363discard_it:
2364 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2365 /* Discard frame. */
2366 kfree_skb_reason(skb, reason: drop_reason);
2367 return 0;
2368
2369discard_and_relse:
2370 sk_drops_add(sk, skb);
2371 if (refcounted)
2372 sock_put(sk);
2373 goto discard_it;
2374
2375do_time_wait:
2376 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb)) {
2377 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2378 inet_twsk_put(tw: inet_twsk(sk));
2379 goto discard_it;
2380 }
2381
2382 tcp_v4_fill_cb(skb, iph, th);
2383
2384 if (tcp_checksum_complete(skb)) {
2385 inet_twsk_put(tw: inet_twsk(sk));
2386 goto csum_error;
2387 }
2388 switch (tcp_timewait_state_process(tw: inet_twsk(sk), skb, th)) {
2389 case TCP_TW_SYN: {
2390 struct sock *sk2 = inet_lookup_listener(net,
2391 hashinfo: net->ipv4.tcp_death_row.hashinfo,
2392 skb, doff: __tcp_hdrlen(th),
2393 saddr: iph->saddr, sport: th->source,
2394 daddr: iph->daddr, dport: th->dest,
2395 dif: inet_iif(skb),
2396 sdif);
2397 if (sk2) {
2398 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2399 sk = sk2;
2400 tcp_v4_restore_cb(skb);
2401 refcounted = false;
2402 goto process;
2403 }
2404 }
2405 /* to ACK */
2406 fallthrough;
2407 case TCP_TW_ACK:
2408 tcp_v4_timewait_ack(sk, skb);
2409 break;
2410 case TCP_TW_RST:
2411 tcp_v4_send_reset(sk, skb);
2412 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2413 goto discard_it;
2414 case TCP_TW_SUCCESS:;
2415 }
2416 goto discard_it;
2417}
2418
2419static struct timewait_sock_ops tcp_timewait_sock_ops = {
2420 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2421 .twsk_unique = tcp_twsk_unique,
2422 .twsk_destructor= tcp_twsk_destructor,
2423};
2424
2425void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2426{
2427 struct dst_entry *dst = skb_dst(skb);
2428
2429 if (dst && dst_hold_safe(dst)) {
2430 rcu_assign_pointer(sk->sk_rx_dst, dst);
2431 sk->sk_rx_dst_ifindex = skb->skb_iif;
2432 }
2433}
2434EXPORT_SYMBOL(inet_sk_rx_dst_set);
2435
2436const struct inet_connection_sock_af_ops ipv4_specific = {
2437 .queue_xmit = ip_queue_xmit,
2438 .send_check = tcp_v4_send_check,
2439 .rebuild_header = inet_sk_rebuild_header,
2440 .sk_rx_dst_set = inet_sk_rx_dst_set,
2441 .conn_request = tcp_v4_conn_request,
2442 .syn_recv_sock = tcp_v4_syn_recv_sock,
2443 .net_header_len = sizeof(struct iphdr),
2444 .setsockopt = ip_setsockopt,
2445 .getsockopt = ip_getsockopt,
2446 .addr2sockaddr = inet_csk_addr2sockaddr,
2447 .sockaddr_len = sizeof(struct sockaddr_in),
2448 .mtu_reduced = tcp_v4_mtu_reduced,
2449};
2450EXPORT_SYMBOL(ipv4_specific);
2451
2452#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2453static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2454#ifdef CONFIG_TCP_MD5SIG
2455 .md5_lookup = tcp_v4_md5_lookup,
2456 .calc_md5_hash = tcp_v4_md5_hash_skb,
2457 .md5_parse = tcp_v4_parse_md5_keys,
2458#endif
2459#ifdef CONFIG_TCP_AO
2460 .ao_lookup = tcp_v4_ao_lookup,
2461 .calc_ao_hash = tcp_v4_ao_hash_skb,
2462 .ao_parse = tcp_v4_parse_ao,
2463 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2464#endif
2465};
2466#endif
2467
2468/* NOTE: A lot of things are set to zero explicitly by the call to
2469 * sk_alloc(), so they need not be done here.
2470 */
2471static int tcp_v4_init_sock(struct sock *sk)
2472{
2473 struct inet_connection_sock *icsk = inet_csk(sk);
2474
2475 tcp_init_sock(sk);
2476
2477 icsk->icsk_af_ops = &ipv4_specific;
2478
2479#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2480 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2481#endif
2482
2483 return 0;
2484}
2485
2486#ifdef CONFIG_TCP_MD5SIG
2487static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2488{
2489 struct tcp_md5sig_info *md5sig;
2490
2491 md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2492 kfree(objp: md5sig);
2493 static_branch_slow_dec_deferred(&tcp_md5_needed);
2494 tcp_md5_release_sigpool();
2495}
2496#endif
2497
2498void tcp_v4_destroy_sock(struct sock *sk)
2499{
2500 struct tcp_sock *tp = tcp_sk(sk);
2501
2502 trace_tcp_destroy_sock(sk);
2503
2504 tcp_clear_xmit_timers(sk);
2505
2506 tcp_cleanup_congestion_control(sk);
2507
2508 tcp_cleanup_ulp(sk);
2509
2510 /* Cleanup up the write buffer. */
2511 tcp_write_queue_purge(sk);
2512
2513 /* Check if we want to disable active TFO */
2514 tcp_fastopen_active_disable_ofo_check(sk);
2515
2516 /* Cleans up our, hopefully empty, out_of_order_queue. */
2517 skb_rbtree_purge(root: &tp->out_of_order_queue);
2518
2519#ifdef CONFIG_TCP_MD5SIG
2520 /* Clean up the MD5 key list, if any */
2521 if (tp->md5sig_info) {
2522 struct tcp_md5sig_info *md5sig;
2523
2524 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2525 tcp_clear_md5_list(sk);
2526 call_rcu(head: &md5sig->rcu, func: tcp_md5sig_info_free_rcu);
2527 rcu_assign_pointer(tp->md5sig_info, NULL);
2528 }
2529#endif
2530 tcp_ao_destroy_sock(sk, twsk: false);
2531
2532 /* Clean up a referenced TCP bind bucket. */
2533 if (inet_csk(sk)->icsk_bind_hash)
2534 inet_put_port(sk);
2535
2536 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2537
2538 /* If socket is aborted during connect operation */
2539 tcp_free_fastopen_req(tp);
2540 tcp_fastopen_destroy_cipher(sk);
2541 tcp_saved_syn_free(tp);
2542
2543 sk_sockets_allocated_dec(sk);
2544}
2545EXPORT_SYMBOL(tcp_v4_destroy_sock);
2546
2547#ifdef CONFIG_PROC_FS
2548/* Proc filesystem TCP sock list dumping. */
2549
2550static unsigned short seq_file_family(const struct seq_file *seq);
2551
2552static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2553{
2554 unsigned short family = seq_file_family(seq);
2555
2556 /* AF_UNSPEC is used as a match all */
2557 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2558 net_eq(net1: sock_net(sk), net2: seq_file_net(seq)));
2559}
2560
2561/* Find a non empty bucket (starting from st->bucket)
2562 * and return the first sk from it.
2563 */
2564static void *listening_get_first(struct seq_file *seq)
2565{
2566 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2567 struct tcp_iter_state *st = seq->private;
2568
2569 st->offset = 0;
2570 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2571 struct inet_listen_hashbucket *ilb2;
2572 struct hlist_nulls_node *node;
2573 struct sock *sk;
2574
2575 ilb2 = &hinfo->lhash2[st->bucket];
2576 if (hlist_nulls_empty(h: &ilb2->nulls_head))
2577 continue;
2578
2579 spin_lock(lock: &ilb2->lock);
2580 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2581 if (seq_sk_match(seq, sk))
2582 return sk;
2583 }
2584 spin_unlock(lock: &ilb2->lock);
2585 }
2586
2587 return NULL;
2588}
2589
2590/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2591 * If "cur" is the last one in the st->bucket,
2592 * call listening_get_first() to return the first sk of the next
2593 * non empty bucket.
2594 */
2595static void *listening_get_next(struct seq_file *seq, void *cur)
2596{
2597 struct tcp_iter_state *st = seq->private;
2598 struct inet_listen_hashbucket *ilb2;
2599 struct hlist_nulls_node *node;
2600 struct inet_hashinfo *hinfo;
2601 struct sock *sk = cur;
2602
2603 ++st->num;
2604 ++st->offset;
2605
2606 sk = sk_nulls_next(sk);
2607 sk_nulls_for_each_from(sk, node) {
2608 if (seq_sk_match(seq, sk))
2609 return sk;
2610 }
2611
2612 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2613 ilb2 = &hinfo->lhash2[st->bucket];
2614 spin_unlock(lock: &ilb2->lock);
2615 ++st->bucket;
2616 return listening_get_first(seq);
2617}
2618
2619static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2620{
2621 struct tcp_iter_state *st = seq->private;
2622 void *rc;
2623
2624 st->bucket = 0;
2625 st->offset = 0;
2626 rc = listening_get_first(seq);
2627
2628 while (rc && *pos) {
2629 rc = listening_get_next(seq, cur: rc);
2630 --*pos;
2631 }
2632 return rc;
2633}
2634
2635static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2636 const struct tcp_iter_state *st)
2637{
2638 return hlist_nulls_empty(h: &hinfo->ehash[st->bucket].chain);
2639}
2640
2641/*
2642 * Get first established socket starting from bucket given in st->bucket.
2643 * If st->bucket is zero, the very first socket in the hash is returned.
2644 */
2645static void *established_get_first(struct seq_file *seq)
2646{
2647 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2648 struct tcp_iter_state *st = seq->private;
2649
2650 st->offset = 0;
2651 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2652 struct sock *sk;
2653 struct hlist_nulls_node *node;
2654 spinlock_t *lock = inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket);
2655
2656 cond_resched();
2657
2658 /* Lockless fast path for the common case of empty buckets */
2659 if (empty_bucket(hinfo, st))
2660 continue;
2661
2662 spin_lock_bh(lock);
2663 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2664 if (seq_sk_match(seq, sk))
2665 return sk;
2666 }
2667 spin_unlock_bh(lock);
2668 }
2669
2670 return NULL;
2671}
2672
2673static void *established_get_next(struct seq_file *seq, void *cur)
2674{
2675 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2676 struct tcp_iter_state *st = seq->private;
2677 struct hlist_nulls_node *node;
2678 struct sock *sk = cur;
2679
2680 ++st->num;
2681 ++st->offset;
2682
2683 sk = sk_nulls_next(sk);
2684
2685 sk_nulls_for_each_from(sk, node) {
2686 if (seq_sk_match(seq, sk))
2687 return sk;
2688 }
2689
2690 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2691 ++st->bucket;
2692 return established_get_first(seq);
2693}
2694
2695static void *established_get_idx(struct seq_file *seq, loff_t pos)
2696{
2697 struct tcp_iter_state *st = seq->private;
2698 void *rc;
2699
2700 st->bucket = 0;
2701 rc = established_get_first(seq);
2702
2703 while (rc && pos) {
2704 rc = established_get_next(seq, cur: rc);
2705 --pos;
2706 }
2707 return rc;
2708}
2709
2710static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2711{
2712 void *rc;
2713 struct tcp_iter_state *st = seq->private;
2714
2715 st->state = TCP_SEQ_STATE_LISTENING;
2716 rc = listening_get_idx(seq, pos: &pos);
2717
2718 if (!rc) {
2719 st->state = TCP_SEQ_STATE_ESTABLISHED;
2720 rc = established_get_idx(seq, pos);
2721 }
2722
2723 return rc;
2724}
2725
2726static void *tcp_seek_last_pos(struct seq_file *seq)
2727{
2728 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2729 struct tcp_iter_state *st = seq->private;
2730 int bucket = st->bucket;
2731 int offset = st->offset;
2732 int orig_num = st->num;
2733 void *rc = NULL;
2734
2735 switch (st->state) {
2736 case TCP_SEQ_STATE_LISTENING:
2737 if (st->bucket > hinfo->lhash2_mask)
2738 break;
2739 rc = listening_get_first(seq);
2740 while (offset-- && rc && bucket == st->bucket)
2741 rc = listening_get_next(seq, cur: rc);
2742 if (rc)
2743 break;
2744 st->bucket = 0;
2745 st->state = TCP_SEQ_STATE_ESTABLISHED;
2746 fallthrough;
2747 case TCP_SEQ_STATE_ESTABLISHED:
2748 if (st->bucket > hinfo->ehash_mask)
2749 break;
2750 rc = established_get_first(seq);
2751 while (offset-- && rc && bucket == st->bucket)
2752 rc = established_get_next(seq, cur: rc);
2753 }
2754
2755 st->num = orig_num;
2756
2757 return rc;
2758}
2759
2760void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2761{
2762 struct tcp_iter_state *st = seq->private;
2763 void *rc;
2764
2765 if (*pos && *pos == st->last_pos) {
2766 rc = tcp_seek_last_pos(seq);
2767 if (rc)
2768 goto out;
2769 }
2770
2771 st->state = TCP_SEQ_STATE_LISTENING;
2772 st->num = 0;
2773 st->bucket = 0;
2774 st->offset = 0;
2775 rc = *pos ? tcp_get_idx(seq, pos: *pos - 1) : SEQ_START_TOKEN;
2776
2777out:
2778 st->last_pos = *pos;
2779 return rc;
2780}
2781EXPORT_SYMBOL(tcp_seq_start);
2782
2783void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2784{
2785 struct tcp_iter_state *st = seq->private;
2786 void *rc = NULL;
2787
2788 if (v == SEQ_START_TOKEN) {
2789 rc = tcp_get_idx(seq, pos: 0);
2790 goto out;
2791 }
2792
2793 switch (st->state) {
2794 case TCP_SEQ_STATE_LISTENING:
2795 rc = listening_get_next(seq, cur: v);
2796 if (!rc) {
2797 st->state = TCP_SEQ_STATE_ESTABLISHED;
2798 st->bucket = 0;
2799 st->offset = 0;
2800 rc = established_get_first(seq);
2801 }
2802 break;
2803 case TCP_SEQ_STATE_ESTABLISHED:
2804 rc = established_get_next(seq, cur: v);
2805 break;
2806 }
2807out:
2808 ++*pos;
2809 st->last_pos = *pos;
2810 return rc;
2811}
2812EXPORT_SYMBOL(tcp_seq_next);
2813
2814void tcp_seq_stop(struct seq_file *seq, void *v)
2815{
2816 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2817 struct tcp_iter_state *st = seq->private;
2818
2819 switch (st->state) {
2820 case TCP_SEQ_STATE_LISTENING:
2821 if (v != SEQ_START_TOKEN)
2822 spin_unlock(lock: &hinfo->lhash2[st->bucket].lock);
2823 break;
2824 case TCP_SEQ_STATE_ESTABLISHED:
2825 if (v)
2826 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2827 break;
2828 }
2829}
2830EXPORT_SYMBOL(tcp_seq_stop);
2831
2832static void get_openreq4(const struct request_sock *req,
2833 struct seq_file *f, int i)
2834{
2835 const struct inet_request_sock *ireq = inet_rsk(sk: req);
2836 long delta = req->rsk_timer.expires - jiffies;
2837
2838 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2839 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2840 i,
2841 ireq->ir_loc_addr,
2842 ireq->ir_num,
2843 ireq->ir_rmt_addr,
2844 ntohs(ireq->ir_rmt_port),
2845 TCP_SYN_RECV,
2846 0, 0, /* could print option size, but that is af dependent. */
2847 1, /* timers active (only the expire timer) */
2848 jiffies_delta_to_clock_t(delta),
2849 req->num_timeout,
2850 from_kuid_munged(to: seq_user_ns(seq: f),
2851 uid: sock_i_uid(sk: req->rsk_listener)),
2852 0, /* non standard timer */
2853 0, /* open_requests have no inode */
2854 0,
2855 req);
2856}
2857
2858static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2859{
2860 int timer_active;
2861 unsigned long timer_expires;
2862 const struct tcp_sock *tp = tcp_sk(sk);
2863 const struct inet_connection_sock *icsk = inet_csk(sk);
2864 const struct inet_sock *inet = inet_sk(sk);
2865 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2866 __be32 dest = inet->inet_daddr;
2867 __be32 src = inet->inet_rcv_saddr;
2868 __u16 destp = ntohs(inet->inet_dport);
2869 __u16 srcp = ntohs(inet->inet_sport);
2870 int rx_queue;
2871 int state;
2872
2873 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2874 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2875 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2876 timer_active = 1;
2877 timer_expires = icsk->icsk_timeout;
2878 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2879 timer_active = 4;
2880 timer_expires = icsk->icsk_timeout;
2881 } else if (timer_pending(timer: &sk->sk_timer)) {
2882 timer_active = 2;
2883 timer_expires = sk->sk_timer.expires;
2884 } else {
2885 timer_active = 0;
2886 timer_expires = jiffies;
2887 }
2888
2889 state = inet_sk_state_load(sk);
2890 if (state == TCP_LISTEN)
2891 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2892 else
2893 /* Because we don't lock the socket,
2894 * we might find a transient negative value.
2895 */
2896 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2897 READ_ONCE(tp->copied_seq), 0);
2898
2899 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2900 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2901 i, src, srcp, dest, destp, state,
2902 READ_ONCE(tp->write_seq) - tp->snd_una,
2903 rx_queue,
2904 timer_active,
2905 jiffies_delta_to_clock_t(delta: timer_expires - jiffies),
2906 icsk->icsk_retransmits,
2907 from_kuid_munged(to: seq_user_ns(seq: f), uid: sock_i_uid(sk)),
2908 icsk->icsk_probes_out,
2909 sock_i_ino(sk),
2910 refcount_read(r: &sk->sk_refcnt), sk,
2911 jiffies_to_clock_t(x: icsk->icsk_rto),
2912 jiffies_to_clock_t(x: icsk->icsk_ack.ato),
2913 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2914 tcp_snd_cwnd(tp),
2915 state == TCP_LISTEN ?
2916 fastopenq->max_qlen :
2917 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2918}
2919
2920static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2921 struct seq_file *f, int i)
2922{
2923 long delta = tw->tw_timer.expires - jiffies;
2924 __be32 dest, src;
2925 __u16 destp, srcp;
2926
2927 dest = tw->tw_daddr;
2928 src = tw->tw_rcv_saddr;
2929 destp = ntohs(tw->tw_dport);
2930 srcp = ntohs(tw->tw_sport);
2931
2932 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2933 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2934 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2935 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2936 refcount_read(r: &tw->tw_refcnt), tw);
2937}
2938
2939#define TMPSZ 150
2940
2941static int tcp4_seq_show(struct seq_file *seq, void *v)
2942{
2943 struct tcp_iter_state *st;
2944 struct sock *sk = v;
2945
2946 seq_setwidth(m: seq, TMPSZ - 1);
2947 if (v == SEQ_START_TOKEN) {
2948 seq_puts(m: seq, s: " sl local_address rem_address st tx_queue "
2949 "rx_queue tr tm->when retrnsmt uid timeout "
2950 "inode");
2951 goto out;
2952 }
2953 st = seq->private;
2954
2955 if (sk->sk_state == TCP_TIME_WAIT)
2956 get_timewait4_sock(tw: v, f: seq, i: st->num);
2957 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2958 get_openreq4(req: v, f: seq, i: st->num);
2959 else
2960 get_tcp4_sock(sk: v, f: seq, i: st->num);
2961out:
2962 seq_pad(m: seq, c: '\n');
2963 return 0;
2964}
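
/*
 * The lines built above are what /proc/net/tcp exposes. A small userspace
 * sketch that decodes the first columns might look like this (illustrative
 * only, not part of this file; the first fgets() skips the header line and
 * error handling is minimal):
 *
 *	#include <stdio.h>
 *	#include <arpa/inet.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f || !fgets(line, sizeof(line), f))
 *			return 1;
 *		while (fgets(line, sizeof(line), f)) {
 *			unsigned int laddr, raddr, lport, rport, st;
 *			struct in_addr l, r;
 *
 *			if (sscanf(line, "%*d: %8x:%4x %8x:%4x %2x",
 *				   &laddr, &lport, &raddr, &rport, &st) != 5)
 *				continue;
 *			l.s_addr = laddr;
 *			r.s_addr = raddr;
 *			printf("%s:%u -> ", inet_ntoa(l), lport);
 *			printf("%s:%u st=%02x\n", inet_ntoa(r), rport, st);
 *		}
 *		fclose(f);
 *		return 0;
 *	}
 */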
2965
2966#ifdef CONFIG_BPF_SYSCALL
2967struct bpf_tcp_iter_state {
2968 struct tcp_iter_state state;
2969 unsigned int cur_sk;
2970 unsigned int end_sk;
2971 unsigned int max_sk;
2972 struct sock **batch;
2973 bool st_bucket_done;
2974};
2975
2976struct bpf_iter__tcp {
2977 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2978 __bpf_md_ptr(struct sock_common *, sk_common);
2979 uid_t uid __aligned(8);
2980};
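
/*
 * A bpf iterator program consuming this context could look like the
 * sketch below. It is illustrative only and not part of this file; it
 * assumes a libbpf build environment providing vmlinux.h, bpf_helpers.h,
 * bpf_tracing.h and bpf_endian.h, and the program name is made up:
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *	#include <bpf/bpf_tracing.h>
 *	#include <bpf/bpf_endian.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family=%u lport=%u rport=%u uid=%u\n",
 *			       skc->skc_family, skc->skc_num,
 *			       bpf_ntohs(skc->skc_dport), ctx->uid);
 *		return 0;
 *	}
 */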
2981
2982static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2983 struct sock_common *sk_common, uid_t uid)
2984{
2985 struct bpf_iter__tcp ctx;
2986
2987 meta->seq_num--; /* skip SEQ_START_TOKEN */
2988 ctx.meta = meta;
2989 ctx.sk_common = sk_common;
2990 ctx.uid = uid;
2991 return bpf_iter_run_prog(prog, ctx: &ctx);
2992}
2993
2994static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2995{
2996 while (iter->cur_sk < iter->end_sk)
2997 sock_gen_put(sk: iter->batch[iter->cur_sk++]);
2998}
2999
3000static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3001 unsigned int new_batch_sz)
3002{
3003 struct sock **new_batch;
3004
3005 new_batch = kvmalloc(size: sizeof(*new_batch) * new_batch_sz,
3006 GFP_USER | __GFP_NOWARN);
3007 if (!new_batch)
3008 return -ENOMEM;
3009
3010 bpf_iter_tcp_put_batch(iter);
3011 kvfree(addr: iter->batch);
3012 iter->batch = new_batch;
3013 iter->max_sk = new_batch_sz;
3014
3015 return 0;
3016}
3017
3018static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3019 struct sock *start_sk)
3020{
3021 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3022 struct bpf_tcp_iter_state *iter = seq->private;
3023 struct tcp_iter_state *st = &iter->state;
3024 struct hlist_nulls_node *node;
3025 unsigned int expected = 1;
3026 struct sock *sk;
3027
3028 sock_hold(sk: start_sk);
3029 iter->batch[iter->end_sk++] = start_sk;
3030
3031 sk = sk_nulls_next(sk: start_sk);
3032 sk_nulls_for_each_from(sk, node) {
3033 if (seq_sk_match(seq, sk)) {
3034 if (iter->end_sk < iter->max_sk) {
3035 sock_hold(sk);
3036 iter->batch[iter->end_sk++] = sk;
3037 }
3038 expected++;
3039 }
3040 }
3041 spin_unlock(lock: &hinfo->lhash2[st->bucket].lock);
3042
3043 return expected;
3044}
3045
3046static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3047 struct sock *start_sk)
3048{
3049 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3050 struct bpf_tcp_iter_state *iter = seq->private;
3051 struct tcp_iter_state *st = &iter->state;
3052 struct hlist_nulls_node *node;
3053 unsigned int expected = 1;
3054 struct sock *sk;
3055
3056 sock_hold(sk: start_sk);
3057 iter->batch[iter->end_sk++] = start_sk;
3058
3059 sk = sk_nulls_next(sk: start_sk);
3060 sk_nulls_for_each_from(sk, node) {
3061 if (seq_sk_match(seq, sk)) {
3062 if (iter->end_sk < iter->max_sk) {
3063 sock_hold(sk);
3064 iter->batch[iter->end_sk++] = sk;
3065 }
3066 expected++;
3067 }
3068 }
3069 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
3070
3071 return expected;
3072}
3073
3074static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3075{
3076 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3077 struct bpf_tcp_iter_state *iter = seq->private;
3078 struct tcp_iter_state *st = &iter->state;
3079 unsigned int expected;
3080 bool resized = false;
3081 struct sock *sk;
3082
3083 /* The st->bucket is done. Directly advance to the next
3084 * bucket instead of letting tcp_seek_last_pos() skip entries
3085 * one by one in the current bucket, only to find out that
3086 * it has to advance to the next bucket.
3087 */
3088 if (iter->st_bucket_done) {
3089 st->offset = 0;
3090 st->bucket++;
3091 if (st->state == TCP_SEQ_STATE_LISTENING &&
3092 st->bucket > hinfo->lhash2_mask) {
3093 st->state = TCP_SEQ_STATE_ESTABLISHED;
3094 st->bucket = 0;
3095 }
3096 }
3097
3098again:
3099 /* Get a new batch */
3100 iter->cur_sk = 0;
3101 iter->end_sk = 0;
3102 iter->st_bucket_done = false;
3103
3104 sk = tcp_seek_last_pos(seq);
3105 if (!sk)
3106 return NULL; /* Done */
3107
3108 if (st->state == TCP_SEQ_STATE_LISTENING)
3109 expected = bpf_iter_tcp_listening_batch(seq, start_sk: sk);
3110 else
3111 expected = bpf_iter_tcp_established_batch(seq, start_sk: sk);
3112
3113 if (iter->end_sk == expected) {
3114 iter->st_bucket_done = true;
3115 return sk;
3116 }
3117
3118 if (!resized && !bpf_iter_tcp_realloc_batch(iter, new_batch_sz: expected * 3 / 2)) {
3119 resized = true;
3120 goto again;
3121 }
3122
3123 return sk;
3124}
3125
3126static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3127{
3128 /* bpf iter does not support lseek, so it always
3129 * continues from where it was stop()-ped.
3130 */
3131 if (*pos)
3132 return bpf_iter_tcp_batch(seq);
3133
3134 return SEQ_START_TOKEN;
3135}
3136
3137static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3138{
3139 struct bpf_tcp_iter_state *iter = seq->private;
3140 struct tcp_iter_state *st = &iter->state;
3141 struct sock *sk;
3142
3143 /* Whenever seq_next() is called, the iter->cur_sk is
3144 * done with seq_show(), so advance to the next sk in
3145 * the batch.
3146 */
3147 if (iter->cur_sk < iter->end_sk) {
3148 /* Keeping st->num consistent in tcp_iter_state.
3149 * bpf_iter_tcp does not use st->num.
3150 * meta.seq_num is used instead.
3151 */
3152 st->num++;
3153 /* Move st->offset to the next sk in the bucket such that
3154 * the future start() will resume at st->offset in
3155 * st->bucket. See tcp_seek_last_pos().
3156 */
3157 st->offset++;
3158 sock_gen_put(sk: iter->batch[iter->cur_sk++]);
3159 }
3160
3161 if (iter->cur_sk < iter->end_sk)
3162 sk = iter->batch[iter->cur_sk];
3163 else
3164 sk = bpf_iter_tcp_batch(seq);
3165
3166 ++*pos;
3167 /* Keeping st->last_pos consistent in tcp_iter_state.
3168 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3169 */
3170 st->last_pos = *pos;
3171 return sk;
3172}
3173
3174static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3175{
3176 struct bpf_iter_meta meta;
3177 struct bpf_prog *prog;
3178 struct sock *sk = v;
3179 uid_t uid;
3180 int ret;
3181
3182 if (v == SEQ_START_TOKEN)
3183 return 0;
3184
3185 if (sk_fullsock(sk))
3186 lock_sock(sk);
3187
3188 if (unlikely(sk_unhashed(sk))) {
3189 ret = SEQ_SKIP;
3190 goto unlock;
3191 }
3192
3193 if (sk->sk_state == TCP_TIME_WAIT) {
3194 uid = 0;
3195 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3196 const struct request_sock *req = v;
3197
3198 uid = from_kuid_munged(to: seq_user_ns(seq),
3199 uid: sock_i_uid(sk: req->rsk_listener));
3200 } else {
3201 uid = from_kuid_munged(to: seq_user_ns(seq), uid: sock_i_uid(sk));
3202 }
3203
3204 meta.seq = seq;
3205 prog = bpf_iter_get_info(meta: &meta, in_stop: false);
3206 ret = tcp_prog_seq_show(prog, meta: &meta, sk_common: v, uid);
3207
3208unlock:
3209 if (sk_fullsock(sk))
3210 release_sock(sk);
3211 return ret;
3212
3213}
3214
3215static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3216{
3217 struct bpf_tcp_iter_state *iter = seq->private;
3218 struct bpf_iter_meta meta;
3219 struct bpf_prog *prog;
3220
3221 if (!v) {
3222 meta.seq = seq;
3223 prog = bpf_iter_get_info(meta: &meta, in_stop: true);
3224 if (prog)
3225 (void)tcp_prog_seq_show(prog, meta: &meta, sk_common: v, uid: 0);
3226 }
3227
3228 if (iter->cur_sk < iter->end_sk) {
3229 bpf_iter_tcp_put_batch(iter);
3230 iter->st_bucket_done = false;
3231 }
3232}
3233
3234static const struct seq_operations bpf_iter_tcp_seq_ops = {
3235 .show = bpf_iter_tcp_seq_show,
3236 .start = bpf_iter_tcp_seq_start,
3237 .next = bpf_iter_tcp_seq_next,
3238 .stop = bpf_iter_tcp_seq_stop,
3239};
3240#endif
3241static unsigned short seq_file_family(const struct seq_file *seq)
3242{
3243 const struct tcp_seq_afinfo *afinfo;
3244
3245#ifdef CONFIG_BPF_SYSCALL
3246 /* Iterated from bpf_iter. Let the bpf prog do the filtering instead. */
3247 if (seq->op == &bpf_iter_tcp_seq_ops)
3248 return AF_UNSPEC;
3249#endif
3250
3251 /* Iterated from proc fs */
3252 afinfo = pde_data(inode: file_inode(f: seq->file));
3253 return afinfo->family;
3254}
3255
3256static const struct seq_operations tcp4_seq_ops = {
3257 .show = tcp4_seq_show,
3258 .start = tcp_seq_start,
3259 .next = tcp_seq_next,
3260 .stop = tcp_seq_stop,
3261};
3262
3263static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3264 .family = AF_INET,
3265};
3266
3267static int __net_init tcp4_proc_init_net(struct net *net)
3268{
3269 if (!proc_create_net_data(name: "tcp", mode: 0444, parent: net->proc_net, ops: &tcp4_seq_ops,
3270 state_size: sizeof(struct tcp_iter_state), data: &tcp4_seq_afinfo))
3271 return -ENOMEM;
3272 return 0;
3273}
3274
3275static void __net_exit tcp4_proc_exit_net(struct net *net)
3276{
3277 remove_proc_entry("tcp", net->proc_net);
3278}
3279
3280static struct pernet_operations tcp4_net_ops = {
3281 .init = tcp4_proc_init_net,
3282 .exit = tcp4_proc_exit_net,
3283};
3284
3285int __init tcp4_proc_init(void)
3286{
3287 return register_pernet_subsys(&tcp4_net_ops);
3288}
3289
3290void tcp4_proc_exit(void)
3291{
3292 unregister_pernet_subsys(&tcp4_net_ops);
3293}
3294#endif /* CONFIG_PROC_FS */
3295
3296/* @wake is one when sk_stream_write_space() calls us.
3297 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3298 * This mimics the strategy used in sock_def_write_space().
3299 */
3300bool tcp_stream_memory_free(const struct sock *sk, int wake)
3301{
3302 const struct tcp_sock *tp = tcp_sk(sk);
3303 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3304 READ_ONCE(tp->snd_nxt);
3305
3306 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3307}
3308EXPORT_SYMBOL(tcp_stream_memory_free);
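
/*
 * The limit tested above is tcp_notsent_lowat(tp): the TCP_NOTSENT_LOWAT
 * socket option when set, otherwise the net.ipv4.tcp_notsent_lowat sysctl.
 * Minimal userspace sketch (illustrative only, not part of this file; it
 * assumes the libc headers expose TCP_NOTSENT_LOWAT and the helper name
 * is made up):
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int set_notsent_lowat(int fd)
 *	{
 *		int lowat = 128 * 1024;
 *
 *		return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *				  &lowat, sizeof(lowat));
 *	}
 */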
3309
3310struct proto tcp_prot = {
3311 .name = "TCP",
3312 .owner = THIS_MODULE,
3313 .close = tcp_close,
3314 .pre_connect = tcp_v4_pre_connect,
3315 .connect = tcp_v4_connect,
3316 .disconnect = tcp_disconnect,
3317 .accept = inet_csk_accept,
3318 .ioctl = tcp_ioctl,
3319 .init = tcp_v4_init_sock,
3320 .destroy = tcp_v4_destroy_sock,
3321 .shutdown = tcp_shutdown,
3322 .setsockopt = tcp_setsockopt,
3323 .getsockopt = tcp_getsockopt,
3324 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3325 .keepalive = tcp_set_keepalive,
3326 .recvmsg = tcp_recvmsg,
3327 .sendmsg = tcp_sendmsg,
3328 .splice_eof = tcp_splice_eof,
3329 .backlog_rcv = tcp_v4_do_rcv,
3330 .release_cb = tcp_release_cb,
3331 .hash = inet_hash,
3332 .unhash = inet_unhash,
3333 .get_port = inet_csk_get_port,
3334 .put_port = inet_put_port,
3335#ifdef CONFIG_BPF_SYSCALL
3336 .psock_update_sk_prot = tcp_bpf_update_proto,
3337#endif
3338 .enter_memory_pressure = tcp_enter_memory_pressure,
3339 .leave_memory_pressure = tcp_leave_memory_pressure,
3340 .stream_memory_free = tcp_stream_memory_free,
3341 .sockets_allocated = &tcp_sockets_allocated,
3342 .orphan_count = &tcp_orphan_count,
3343
3344 .memory_allocated = &tcp_memory_allocated,
3345 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3346
3347 .memory_pressure = &tcp_memory_pressure,
3348 .sysctl_mem = sysctl_tcp_mem,
3349 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3350 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3351 .max_header = MAX_TCP_HEADER,
3352 .obj_size = sizeof(struct tcp_sock),
3353 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3354 .twsk_prot = &tcp_timewait_sock_ops,
3355 .rsk_prot = &tcp_request_sock_ops,
3356 .h.hashinfo = NULL,
3357 .no_autobind = true,
3358 .diag_destroy = tcp_abort,
3359};
3360EXPORT_SYMBOL(tcp_prot);
3361
3362static void __net_exit tcp_sk_exit(struct net *net)
3363{
3364 if (net->ipv4.tcp_congestion_control)
3365 bpf_module_put(data: net->ipv4.tcp_congestion_control,
3366 owner: net->ipv4.tcp_congestion_control->owner);
3367}
3368
3369static void __net_init tcp_set_hashinfo(struct net *net)
3370{
3371 struct inet_hashinfo *hinfo;
3372 unsigned int ehash_entries;
3373 struct net *old_net;
3374
3375 if (net_eq(net1: net, net2: &init_net))
3376 goto fallback;
3377
3378 old_net = current->nsproxy->net_ns;
3379 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3380 if (!ehash_entries)
3381 goto fallback;
3382
3383 ehash_entries = roundup_pow_of_two(ehash_entries);
3384 hinfo = inet_pernet_hashinfo_alloc(hashinfo: &tcp_hashinfo, ehash_entries);
3385 if (!hinfo) {
3386 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3387 "for a netns, fallback to the global one\n",
3388 ehash_entries);
3389fallback:
3390 hinfo = &tcp_hashinfo;
3391 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3392 }
3393
3394 net->ipv4.tcp_death_row.hashinfo = hinfo;
3395 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3396 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3397}
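
/*
 * The per-netns ehash sizing above is driven by the
 * net.ipv4.tcp_child_ehash_entries sysctl of the netns creating the child.
 * Minimal userspace sketch (illustrative only, not part of this file; the
 * value 8192 and the helper name are made up for the example):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <sched.h>
 *	#include <unistd.h>
 *
 *	static int spawn_netns_with_own_ehash(void)
 *	{
 *		int fd = open("/proc/sys/net/ipv4/tcp_child_ehash_entries",
 *			      O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, "8192", 4) != 4) {
 *			close(fd);
 *			return -1;
 *		}
 *		close(fd);
 *		return unshare(CLONE_NEWNET);
 *	}
 */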
3398
3399static int __net_init tcp_sk_init(struct net *net)
3400{
3401 net->ipv4.sysctl_tcp_ecn = 2;
3402 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3403
3404 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3405 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3406 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3407 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3408 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3409
3410 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3411 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3412 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3413
3414 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3415 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3416 net->ipv4.sysctl_tcp_syncookies = 1;
3417 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3418 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3419 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3420 net->ipv4.sysctl_tcp_orphan_retries = 0;
3421 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3422 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3423 net->ipv4.sysctl_tcp_tw_reuse = 2;
3424 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3425
3426 refcount_set(r: &net->ipv4.tcp_death_row.tw_refcount, n: 1);
3427 tcp_set_hashinfo(net);
3428
3429 net->ipv4.sysctl_tcp_sack = 1;
3430 net->ipv4.sysctl_tcp_window_scaling = 1;
3431 net->ipv4.sysctl_tcp_timestamps = 1;
3432 net->ipv4.sysctl_tcp_early_retrans = 3;
3433 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3434 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3435 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3436 net->ipv4.sysctl_tcp_max_reordering = 300;
3437 net->ipv4.sysctl_tcp_dsack = 1;
3438 net->ipv4.sysctl_tcp_app_win = 31;
3439 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3440 net->ipv4.sysctl_tcp_frto = 2;
3441 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3442 /* This limits the percentage of the congestion window which we
3443 * will allow a single TSO frame to consume. Building TSO frames
3444 * which are too large can cause TCP streams to be bursty.
3445 */
3446 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3447 /* Default TSQ limit of 16 TSO segments */
3448 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3449
3450 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3451 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3452
3453 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3454 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3455 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3456 net->ipv4.sysctl_tcp_autocorking = 1;
3457 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3458 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3459 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3460 if (net != &init_net) {
3461 memcpy(net->ipv4.sysctl_tcp_rmem,
3462 init_net.ipv4.sysctl_tcp_rmem,
3463 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3464 memcpy(net->ipv4.sysctl_tcp_wmem,
3465 init_net.ipv4.sysctl_tcp_wmem,
3466 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3467 }
3468 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3469 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3470 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3471 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3472 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3473 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3474 atomic_set(v: &net->ipv4.tfo_active_disable_times, i: 0);
3475
3476 /* Set default values for PLB */
3477 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3478 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3479 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3480 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3481 /* Default congestion threshold for PLB to mark a round is 50% */
3482 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3483
3484 /* Reno is always built in */
3485 if (!net_eq(net1: net, net2: &init_net) &&
3486 bpf_try_module_get(data: init_net.ipv4.tcp_congestion_control,
3487 owner: init_net.ipv4.tcp_congestion_control->owner))
3488 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3489 else
3490 net->ipv4.tcp_congestion_control = &tcp_reno;
3491
3492 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3493 net->ipv4.sysctl_tcp_shrink_window = 0;
3494
3495 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3496
3497 return 0;
3498}
3499
3500static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3501{
3502 struct net *net;
3503
3504 tcp_twsk_purge(net_exit_list, AF_INET);
3505
3506 list_for_each_entry(net, net_exit_list, exit_list) {
3507 inet_pernet_hashinfo_free(hashinfo: net->ipv4.tcp_death_row.hashinfo);
3508 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3509 tcp_fastopen_ctx_destroy(net);
3510 }
3511}
3512
3513static struct pernet_operations __net_initdata tcp_sk_ops = {
3514 .init = tcp_sk_init,
3515 .exit = tcp_sk_exit,
3516 .exit_batch = tcp_sk_exit_batch,
3517};
3518
3519#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3520DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3521 struct sock_common *sk_common, uid_t uid)
3522
3523#define INIT_BATCH_SZ 16
3524
3525static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3526{
3527 struct bpf_tcp_iter_state *iter = priv_data;
3528 int err;
3529
3530 err = bpf_iter_init_seq_net(priv_data, aux);
3531 if (err)
3532 return err;
3533
3534 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3535 if (err) {
3536 bpf_iter_fini_seq_net(priv_data);
3537 return err;
3538 }
3539
3540 return 0;
3541}
3542
3543static void bpf_iter_fini_tcp(void *priv_data)
3544{
3545 struct bpf_tcp_iter_state *iter = priv_data;
3546
3547 bpf_iter_fini_seq_net(priv_data);
3548 kvfree(addr: iter->batch);
3549}
3550
3551static const struct bpf_iter_seq_info tcp_seq_info = {
3552 .seq_ops = &bpf_iter_tcp_seq_ops,
3553 .init_seq_private = bpf_iter_init_tcp,
3554 .fini_seq_private = bpf_iter_fini_tcp,
3555 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3556};
3557
3558static const struct bpf_func_proto *
3559bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3560 const struct bpf_prog *prog)
3561{
3562 switch (func_id) {
3563 case BPF_FUNC_setsockopt:
3564 return &bpf_sk_setsockopt_proto;
3565 case BPF_FUNC_getsockopt:
3566 return &bpf_sk_getsockopt_proto;
3567 default:
3568 return NULL;
3569 }
3570}
3571
3572static struct bpf_iter_reg tcp_reg_info = {
3573 .target = "tcp",
3574 .ctx_arg_info_size = 1,
3575 .ctx_arg_info = {
3576 { offsetof(struct bpf_iter__tcp, sk_common),
3577 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3578 },
3579 .get_func_proto = bpf_iter_tcp_get_func_proto,
3580 .seq_info = &tcp_seq_info,
3581};
3582
3583static void __init bpf_iter_register(void)
3584{
3585 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3586 if (bpf_iter_reg_target(reg_info: &tcp_reg_info))
3587 pr_warn("Warning: could not register bpf iterator tcp\n");
3588}
3589
3590#endif
3591
3592void __init tcp_v4_init(void)
3593{
3594 int cpu, res;
3595
3596 for_each_possible_cpu(cpu) {
3597 struct sock *sk;
3598
3599 res = inet_ctl_sock_create(sk: &sk, PF_INET, type: SOCK_RAW,
3600 IPPROTO_TCP, net: &init_net);
3601 if (res)
3602 panic(fmt: "Failed to create the TCP control socket.\n");
3603 sock_set_flag(sk, flag: SOCK_USE_WRITE_QUEUE);
3604
3605 /* Please enforce IP_DF and IPID==0 for RST and
3606 * ACK sent in SYN-RECV and TIME-WAIT state.
3607 */
3608 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3609
3610 per_cpu(ipv4_tcp_sk, cpu) = sk;
3611 }
3612 if (register_pernet_subsys(&tcp_sk_ops))
3613 panic(fmt: "Failed to create the TCP control socket.\n");
3614
3615#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3616 bpf_iter_register();
3617#endif
3618}
3619
