sort.c source code [linux/lib/sort.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* A fast, small, non-recursive O(n log n) sort for the Linux kernel
4	*
5	* This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
6	* and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
7	*
8	* Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
9	* better) at the expense of stack usage and much larger code to avoid
10	* quicksort's O(n^2) worst case.
11	*/
12
13	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15	#include <linux/types.h>
16	#include <linux/export.h>
17	#include <linux/sort.h>
18
/**
 * is_aligned - is this pointer & size okay for word-wide copying?
 * @base: pointer to data
 * @size: size of each element
 * @align: required alignment (typically 4 or 8); the mask test below is
 *         only meaningful for a power of two
 *
 * Returns true if elements can be copied using word loads and stores.
 * The size must be a multiple of the alignment, and so must the base
 * address unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.
 *
 * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
 * to "if ((a | b) & mask)", so we do that by hand.
 */
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
	/* Only the low byte matters: @align itself is an unsigned char. */
	unsigned char lsbits = (unsigned char)size;

	/* Keep @base "used" when the #ifndef below compiles the only use out. */
	(void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	/* Fold the address bits in so one mask test covers both values. */
	lsbits |= (unsigned char)(uintptr_t)base;
#endif
	return (lsbits & (align - 1)) == 0;
}
43
44	/**
45	* swap_words_32 - swap two elements in 32-bit chunks
46	* @a: pointer to the first element to swap
47	* @b: pointer to the second element to swap
48	* @n: element size (must be a multiple of 4)
49	*
50	* Exchange the two objects in memory. This exploits base+index addressing,
51	* which basically all CPUs have, to minimize loop overhead computations.
52	*
53	* For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
54	* bottom of the loop, even though the zero flag is still valid from the
55	* subtract (since the intervening mov instructions don't alter the flags).
56	* Gcc 8.1.0 doesn't have that problem.
57	*/
58	static void swap_words_32(void a, void* *b, size_t n)
59	{
60	do {
61	u32 t = (u32 )(a + (n -= `4`));
62	(u32 )(a + n) = (u32 )(b + n);
63	(u32 )(b + n) = t;
64	} while (n);
65	}
66
67	/**
68	* swap_words_64 - swap two elements in 64-bit chunks
69	* @a: pointer to the first element to swap
70	* @b: pointer to the second element to swap
71	* @n: element size (must be a multiple of 8)
72	*
73	* Exchange the two objects in memory. This exploits base+index
74	* addressing, which basically all CPUs have, to minimize loop overhead
75	* computations.
76	*
77	* We'd like to use 64-bit loads if possible. If they're not, emulating
78	* one requires base+index+4 addressing which x86 has but most other
79	* processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
80	* but it's possible to have 64-bit loads without 64-bit pointers (e.g.
81	* x32 ABI). Are there any cases the kernel needs to worry about?
82	*/
83	static void swap_words_64(void a, void* *b, size_t n)
84	{
85	do {
86	#ifdef CONFIG_64BIT
87	u64 t = (u64 )(a + (n -= `8`));
88	(u64 )(a + n) = (u64 )(b + n);
89	(u64 )(b + n) = t;
90	#else
91	/ Use two 32-bit transfers to avoid base+index+4 addressing /
92	u32 t = (u32 )(a + (n -= `4`));
93	(u32 )(a + n) = (u32 )(b + n);
94	(u32 )(b + n) = t;
95
96	t = (u32 )(a + (n -= `4`));
97	(u32 )(a + n) = (u32 )(b + n);
98	(u32 )(b + n) = t;
99	#endif
100	} while (n);
101	}
102
/**
 * swap_bytes - swap two elements a byte at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be non-zero)
 *
 * This is the fallback if alignment doesn't allow using larger chunks.
 */
static void swap_bytes(void *a, void *b, size_t n)
{
	/*
	 * Swap from the last byte down to index 0.  @n is decremented
	 * before the first access, so a zero @n would wrap around.
	 */
	do {
		char t = ((char *)a)[--n];
		((char *)a)[n] = ((char *)b)[n];
		((char *)b)[n] = t;
	} while (n);
}
119
/*
 * Sentinel values stored in a swap_r_func_t to select one of the built-in
 * swap routines instead of a caller-supplied function.
 *
 * The values are arbitrary as long as they can't be confused with
 * a pointer, but small integers make for the smallest compare
 * instructions.
 */
#define SWAP_WORDS_64 ((swap_r_func_t)0)
#define SWAP_WORDS_32 ((swap_r_func_t)1)
#define SWAP_BYTES    ((swap_r_func_t)2)
#define SWAP_WRAPPER  ((swap_r_func_t)3)
129
130	struct wrapper {
131	cmp_func_t cmp;
132	swap_func_t swap;
133	};
134
135	/*
136	* The function pointer is last to make tail calls most efficient if the
137	* compiler decides not to inline this function.
138	*/
139	static void do_swap(void a, void* b, size_t size, swap_r_func_t swap_func, const* void *priv)
140	{
141	if (swap_func == SWAP_WRAPPER) {
142	((const struct wrapper )priv)->swap(a, b, (int*)size);
143	return;
144	}
145
146	if (swap_func == SWAP_WORDS_64)
147	swap_words_64(a, b, n: size);
148	else if (swap_func == SWAP_WORDS_32)
149	swap_words_32(a, b, n: size);
150	else if (swap_func == SWAP_BYTES)
151	swap_bytes(a, b, n: size);
152	else
153	swap_func(a, b, (int)size, priv);
154	}
155
156	#define _CMP_WRAPPER ((cmp_r_func_t)0L)
157
158	static int do_cmp(const void a, const* void b, cmp_r_func_t cmp, const* void *priv)
159	{
160	if (cmp == _CMP_WRAPPER)
161	return ((const struct wrapper *)priv)->cmp(a, b);
162	return cmp(a, b, priv);
163	}
164
165	/**
166	* parent - given the offset of the child, find the offset of the parent.
167	* @i: the offset of the heap element whose parent is sought. Non-zero.
168	* @lsbit: a precomputed 1-bit mask, equal to "size & -size"
169	* @size: size of each element
170	*
171	* In terms of array indexes, the parent of element j = @i/@size is simply
172	* (j-1)/2. But when working in byte offsets, we can't use implicit
173	* truncation of integer divides.
174	*
175	* Fortunately, we only need one bit of the quotient, not the full divide.
176	* @size has a least significant bit. That bit will be clear if @i is
177	* an even multiple of @size, and set if it's an odd multiple.
178	*
179	* Logically, we're doing "if (i & lsbit) i -= size;", but since the
180	* branch is unpredictable, it's done with a bit of clever branch-free
181	* code instead.
182	*/
183	__attribute_const__ __always_inline
184	static size_t parent(size_t i, unsigned int lsbit, size_t size)
185	{
186	i -= size;
187	i -= size & -(i & lsbit);
188	return i / `2`;
189	}
190
191	/**
192	* sort_r - sort an array of elements
193	* @base: pointer to data to sort
194	* @num: number of elements
195	* @size: size of each element
196	* @cmp_func: pointer to comparison function
197	* @swap_func: pointer to swap function or NULL
198	* @priv: third argument passed to comparison function
199	*
200	* This function does a heapsort on the given array. You may provide
201	* a swap_func function if you need to do something more than a memory
202	* copy (e.g. fix up pointers or auxiliary data), but the built-in swap
203	* avoids a slow retpoline and so is significantly faster.
204	*
205	* Sorting time is O(n log n) both on average and worst-case. While
206	* quicksort is slightly faster on average, it suffers from exploitable
207	* O(n*n) worst-case behavior and extra memory requirements that make
208	* it less suitable for kernel use.
209	*/
210	void sort_r(void *base, size_t num, size_t size,
211	cmp_r_func_t cmp_func,
212	swap_r_func_t swap_func,
213	const void *priv)
214	{
215	/ pre-scale counters for performance /
216	size_t n = num * size, a = (num/`2`) * size;
217	const unsigned int lsbit = size & -size; / Used to find parent /
218	size_t shift = `0`;
219
220	if (!a) / num < 2 \|\| size == 0 /
221	return;
222
223	/ called from 'sort' without swap function, let's pick the default /
224	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
225	swap_func = NULL;
226
227	if (!swap_func) {
228	if (is_aligned(base, size, align: `8`))
229	swap_func = SWAP_WORDS_64;
230	else if (is_aligned(base, size, align: `4`))
231	swap_func = SWAP_WORDS_32;
232	else
233	swap_func = SWAP_BYTES;
234	}
235
236	/*
237	* Loop invariants:
238	* 1. elements [a,n) satisfy the heap property (compare greater than
239	* all of their children),
240	* 2. elements [n,num*size) are sorted, and
241	* 3. a <= b <= c <= d <= n (whenever they are valid).
242	*/
243	for (;;) {
244	size_t b, c, d;
245
246	if (a) / Building heap: sift down a /
247	a -= size << shift;
248	else if (n > `3` * size) { / Sorting: Extract two largest elements /
249	n -= size;
250	do_swap(a: base, b: base + n, size, swap_func, priv);
251	shift = do_cmp(a: base + size, b: base + `2` * size, cmp: cmp_func, priv) <= `0`;
252	a = size << shift;
253	n -= size;
254	do_swap(a: base + a, b: base + n, size, swap_func, priv);
255	} else if (n > size) { / Sorting: Extract root /
256	n -= size;
257	do_swap(a: base, b: base + n, size, swap_func, priv);
258	} else { / Sort complete /
259	break;
260	}
261
262	/*
263	* Sift element at "a" down into heap. This is the
264	* "bottom-up" variant, which significantly reduces
265	* calls to cmp_func(): we find the sift-down path all
266	* the way to the leaves (one compare per level), then
267	* backtrack to find where to insert the target element.
268	*
269	* Because elements tend to sift down close to the leaves,
270	* this uses fewer compares than doing two per level
271	* on the way down. (A bit more than half as many on
272	* average, 3/4 worst-case.)
273	*/
274	for (b = a; c = `2`*b + size, (d = c + size) < n;)
275	b = do_cmp(a: base + c, b: base + d, cmp: cmp_func, priv) > `0` ? c : d;
276	if (d == n) / Special case last leaf with no sibling /
277	b = c;
278
279	/ Now backtrack from "b" to the correct location for "a" /
280	while (b != a && do_cmp(a: base + a, b: base + b, cmp: cmp_func, priv) >= `0`)
281	b = parent(i: b, lsbit, size);
282	c = b; / Where "a" belongs /
283	while (b != a) { / Shift it into place /
284	b = parent(i: b, lsbit, size);
285	do_swap(a: base + b, b: base + c, size, swap_func, priv);
286	}
287	}
288	}
289	EXPORT_SYMBOL(sort_r);
290
291	void sort(void *base, size_t num, size_t size,
292	cmp_func_t cmp_func,
293	swap_func_t swap_func)
294	{
295	struct wrapper w = {
296	.cmp = cmp_func,
297	.swap = swap_func,
298	};
299
300	return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
301	}
302	EXPORT_SYMBOL(sort);
303

source code of linux/lib/sort.c