workingset.c source code [linux/mm/workingset.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Workingset detection
4	*
5	* Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
6	*/
7
8	#include <linux/memcontrol.h>
9	#include <linux/mm_inline.h>
10	#include <linux/writeback.h>
11	#include <linux/shmem_fs.h>
12	#include <linux/pagemap.h>
13	#include <linux/atomic.h>
14	#include <linux/module.h>
15	#include <linux/swap.h>
16	#include <linux/dax.h>
17	#include <linux/fs.h>
18	#include <linux/mm.h>
19	#include "internal.h"
20
21	/*
22	* Double CLOCK lists
23	*
24	* Per node, two clock lists are maintained for file pages: the
25	* inactive and the active list. Freshly faulted pages start out at
26	* the head of the inactive list and page reclaim scans pages from the
27	* tail. Pages that are accessed multiple times on the inactive list
28	* are promoted to the active list, to protect them from reclaim,
29	* whereas active pages are demoted to the inactive list when the
30	* active list grows too big.
31	*
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
39	*
40	*
41	* Access frequency and refault distance
42	*
43	* A workload is thrashing when its pages are frequently used but they
44	* are evicted from the inactive list every time before another access
45	* would have promoted them to the active list.
46	*
47	* In cases where the average access distance between thrashing pages
48	* is bigger than the size of memory there is nothing that can be
49	* done - the thrashing set could never fit into memory under any
50	* circumstance.
51	*
52	* However, the average access distance could be bigger than the
53	* inactive list, yet smaller than the size of memory. In this case,
54	* the set could fit into memory if it weren't for the currently
55	* active pages - which may be used more, hopefully less frequently:
56	*
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
62	*
63	* It is prohibitively expensive to accurately track access frequency
64	* of pages. But a reasonable approximation can be made to measure
65	* thrashing on the inactive list, after which refaulting pages can be
66	* activated optimistically to compete with the existing active pages.
67	*
68	* Approximating inactive page access frequency - Observations:
69	*
70	* 1. When a page is accessed for the first time, it is added to the
71	* head of the inactive list, slides every existing inactive page
72	* towards the tail by one slot, and pushes the current tail page
73	* out of memory.
74	*
75	* 2. When a page is accessed for the second time, it is promoted to
76	* the active list, shrinking the inactive list by one slot. This
77	* also slides all inactive pages that were faulted into the cache
78	* more recently than the activated page towards the tail of the
79	* inactive list.
80	*
81	* Thus:
82	*
83	* 1. The sum of evictions and activations between any two points in
84	* time indicate the minimum number of inactive pages accessed in
85	* between.
86	*
87	* 2. Moving one inactive page N page slots towards the tail of the
88	* list requires at least N inactive page accesses.
89	*
90	* Combining these:
91	*
92	* 1. When a page is finally evicted from memory, the number of
93	* inactive pages accessed while the page was in cache is at least
94	* the number of page slots on the inactive list.
95	*
96	* 2. In addition, measuring the sum of evictions and activations (E)
97	* at the time of a page's eviction, and comparing it to another
98	* reading (R) at the time the page faults back into memory tells
99	* the minimum number of accesses while the page was not cached.
100	* This is called the refault distance.
101	*
102	* Because the first access of the page was the fault and the second
103	* access the refault, we combine the in-cache distance with the
104	* out-of-cache distance to get the complete minimum access distance
105	* of this page:
106	*
107	* NR_inactive + (R - E)
108	*
109	* And knowing the minimum access distance of a page, we can easily
110	* tell if the page would be able to stay in cache assuming all page
111	* slots in the cache were available:
112	*
113	* NR_inactive + (R - E) <= NR_inactive + NR_active
114	*
 * If we have swap we should also consider NR_inactive_anon and
 * NR_active_anon, so for page cache and anonymous pages respectively:
117	*
118	* NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
119	* + NR_inactive_anon + NR_active_anon
120	*
121	* NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
122	* + NR_inactive_file + NR_active_file
123	*
124	* Which can be further simplified to:
125	*
126	* (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
127	*
128	* (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
129	*
130	* Put into words, the refault distance (out-of-cache) can be seen as
131	* a deficit in inactive list space (in-cache). If the inactive list
132	* had (R - E) more page slots, the page would not have been evicted
133	* in between accesses, but activated instead. And on a full system,
134	* the only thing eating into inactive list space is active pages.
135	*
136	*
137	* Refaulting inactive pages
138	*
139	* All that is known about the active list is that the pages have been
140	* accessed more than once in the past. This means that at any given
141	* time there is actually a good chance that pages on the active list
142	* are no longer in active use.
143	*
144	* So when a refault distance of (R - E) is observed and there are at
145	* least (R - E) pages in the userspace workingset, the refaulting page
146	* is activated optimistically in the hope that (R - E) pages are actually
147	* used less frequently than the refaulting page - or even not used at
148	* all anymore.
149	*
150	* That means if inactive cache is refaulting with a suitable refault
151	* distance, we assume the cache workingset is transitioning and put
152	* pressure on the current workingset.
153	*
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
157	*
158	* But if this is right, the stale pages will be pushed out of memory
159	* and the used pages get to stay in cache.
160	*
161	* Refaulting active pages
162	*
163	* If on the other hand the refaulting pages have recently been
164	* deactivated, it means that the active list is no longer protecting
165	* actively used cache from reclaim. The cache is NOT transitioning to
166	* a different workingset; the existing workingset is thrashing in the
167	* space allocated to the page cache.
168	*
169	*
170	* Implementation
171	*
172	* For each node's LRU lists, a counter for inactive evictions and
173	* activations is maintained (node->nonresident_age).
174	*
175	* On eviction, a snapshot of this counter (along with some bits to
176	* identify the node) is stored in the now empty page cache
177	* slot of the evicted page. This is called a shadow entry.
178	*
179	* On cache misses for which there are shadow entries, an eligible
180	* refault distance will immediately activate the refaulting page.
181	*/
182
183	#define WORKINGSET_SHIFT 1
184	#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
185	WORKINGSET_SHIFT + NODES_SHIFT + \
186	MEM_CGROUP_ID_SHIFT)
187	#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
188
189	/*
190	* Eviction timestamps need to be able to cover the full range of
191	* actionable refaults. However, bits are tight in the xarray
192	* entry, and after storing the identifier for the lruvec there might
193	* not be enough left to represent every single actionable refault. In
194	* that case, we have to sacrifice granularity for distance, and group
195	* evictions into coarser buckets by shaving off lower timestamp bits.
196	*/
197	static unsigned int bucket_order __read_mostly;
198
199	static void pack_shadow(int* memcgid, pg_data_t pgdat, unsigned* long eviction,
200	bool workingset)
201	{
202	eviction &= EVICTION_MASK;
203	eviction = (eviction << MEM_CGROUP_ID_SHIFT) \| memcgid;
204	eviction = (eviction << NODES_SHIFT) \| pgdat->node_id;
205	eviction = (eviction << WORKINGSET_SHIFT) \| workingset;
206
207	return xa_mk_value(v: eviction);
208	}
209
210	static void unpack_shadow(void shadow, int* memcgidp, pg_data_t *pgdat,
211	unsigned long evictionp, bool workingsetp)
212	{
213	unsigned long entry = xa_to_value(entry: shadow);
214	int memcgid, nid;
215	bool workingset;
216
217	workingset = entry & ((`1UL` << WORKINGSET_SHIFT) - `1`);
218	entry >>= WORKINGSET_SHIFT;
219	nid = entry & ((`1UL` << NODES_SHIFT) - `1`);
220	entry >>= NODES_SHIFT;
221	memcgid = entry & ((`1UL` << MEM_CGROUP_ID_SHIFT) - `1`);
222	entry >>= MEM_CGROUP_ID_SHIFT;
223
224	*memcgidp = memcgid;
225	*pgdat = NODE_DATA(nid);
226	*evictionp = entry;
227	*workingsetp = workingset;
228	}
229
230	#ifdef CONFIG_LRU_GEN
231
232	static void lru_gen_eviction(struct* folio *folio)
233	{
234	int hist;
235	unsigned long token;
236	unsigned long min_seq;
237	struct lruvec *lruvec;
238	struct lru_gen_folio *lrugen;
239	int type = folio_is_file_lru(folio);
240	int delta = folio_nr_pages(folio);
241	int refs = folio_lru_refs(folio);
242	int tier = lru_tier_from_refs(refs);
243	struct mem_cgroup *memcg = folio_memcg(folio);
244	struct pglist_data *pgdat = folio_pgdat(folio);
245
246	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
247
248	lruvec = mem_cgroup_lruvec(memcg, pgdat);
249	lrugen = &lruvec->lrugen;
250	min_seq = READ_ONCE(lrugen->min_seq[type]);
251	token = (min_seq << LRU_REFS_WIDTH) \| max(refs - `1`, `0`);
252
253	hist = lru_hist_from_seq(seq: min_seq);
254	atomic_long_add(i: delta, v: &lrugen->evicted[hist][type][tier]);
255
256	return pack_shadow(memcgid: mem_cgroup_id(memcg), pgdat, eviction: token, workingset: refs);
257	}
258
259	/*
260	* Tests if the shadow entry is for a folio that was recently evicted.
261	* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
262	*/
263	static bool lru_gen_test_recent(void shadow, bool file, struct* lruvec **lruvec,
264	unsigned long token, bool workingset)
265	{
266	int memcg_id;
267	unsigned long min_seq;
268	struct mem_cgroup *memcg;
269	struct pglist_data *pgdat;
270
271	unpack_shadow(shadow, memcgidp: &memcg_id, pgdat: &pgdat, evictionp: token, workingsetp: workingset);
272
273	memcg = mem_cgroup_from_id(id: memcg_id);
274	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
275
276	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
277	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
278	}
279
280	static void lru_gen_refault(struct folio folio, void* *shadow)
281	{
282	bool recent;
283	int hist, tier, refs;
284	bool workingset;
285	unsigned long token;
286	struct lruvec *lruvec;
287	struct lru_gen_folio *lrugen;
288	int type = folio_is_file_lru(folio);
289	int delta = folio_nr_pages(folio);
290
291	rcu_read_lock();
292
293	recent = lru_gen_test_recent(shadow, file: type, lruvec: &lruvec, token: &token, workingset: &workingset);
294	if (lruvec != folio_lruvec(folio))
295	goto unlock;
296
297	mod_lruvec_state(lruvec, idx: WORKINGSET_REFAULT_BASE + type, val: delta);
298
299	if (!recent)
300	goto unlock;
301
302	lrugen = &lruvec->lrugen;
303
304	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
305	/ see the comment in folio_lru_refs() /
306	refs = (token & (BIT(LRU_REFS_WIDTH) - `1`)) + workingset;
307	tier = lru_tier_from_refs(refs);
308
309	atomic_long_add(i: delta, v: &lrugen->refaulted[hist][type][tier]);
310	mod_lruvec_state(lruvec, idx: WORKINGSET_ACTIVATE_BASE + type, val: delta);
311
312	/*
313	* Count the following two cases as stalls:
314	* 1. For pages accessed through page tables, hotter pages pushed out
315	* hot pages which refaulted immediately.
316	* 2. For pages accessed multiple times through file descriptors,
317	* they would have been protected by sort_folio().
318	*/
319	if (lru_gen_in_fault() \|\| refs >= BIT(LRU_REFS_WIDTH) - `1`) {
320	set_mask_bits(&folio->flags, `0`, LRU_REFS_MASK \| BIT(PG_workingset));
321	mod_lruvec_state(lruvec, idx: WORKINGSET_RESTORE_BASE + type, val: delta);
322	}
323	unlock:
324	rcu_read_unlock();
325	}
326
327	#else /* !CONFIG_LRU_GEN */
328
329	static void lru_gen_eviction(struct* folio *folio)
330	{
331	return NULL;
332	}
333
334	static bool lru_gen_test_recent(void shadow, bool file, struct* lruvec **lruvec,
335	unsigned long token, bool workingset)
336	{
337	return false;
338	}
339
/* Stub for !CONFIG_LRU_GEN builds: nothing to account. */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
343
344	#endif /* CONFIG_LRU_GEN */
345
346	/**
347	* workingset_age_nonresident - age non-resident entries as LRU ages
348	* @lruvec: the lruvec that was aged
349	* @nr_pages: the number of pages to count
350	*
351	* As in-memory pages are aged, non-resident pages need to be aged as
352	* well, in order for the refault distances later on to be comparable
353	* to the in-memory dimensions. This function allows reclaim and LRU
354	* operations to drive the non-resident aging along in parallel.
355	*/
356	void workingset_age_nonresident(struct lruvec lruvec, unsigned* long nr_pages)
357	{
358	/*
359	* Reclaiming a cgroup means reclaiming all its children in a
360	* round-robin fashion. That means that each cgroup has an LRU
361	* order that is composed of the LRU orders of its child
362	* cgroups; and every page has an LRU position not just in the
363	* cgroup that owns it, but in all of that group's ancestors.
364	*
365	* So when the physical inactive list of a leaf cgroup ages,
366	* the virtual inactive lists of all its parents, including
367	* the root cgroup's, age as well.
368	*/
369	do {
370	atomic_long_add(i: nr_pages, v: &lruvec->nonresident_age);
371	} while ((lruvec = parent_lruvec(lruvec)));
372	}
373
374	/**
375	* workingset_eviction - note the eviction of a folio from memory
376	* @target_memcg: the cgroup that is causing the reclaim
377	* @folio: the folio being evicted
378	*
379	* Return: a shadow entry to be stored in @folio->mapping->i_pages in place
380	* of the evicted @folio so that a later refault can be detected.
381	*/
382	void workingset_eviction(struct* folio folio, struct* mem_cgroup *target_memcg)
383	{
384	struct pglist_data *pgdat = folio_pgdat(folio);
385	unsigned long eviction;
386	struct lruvec *lruvec;
387	int memcgid;
388
389	/ Folio is fully exclusive and pins folio's memory cgroup pointer /
390	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
391	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
392	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
393
394	if (lru_gen_enabled())
395	return lru_gen_eviction(folio);
396
397	lruvec = mem_cgroup_lruvec(memcg: target_memcg, pgdat);
398	/ XXX: target_memcg can be NULL, go through lruvec /
399	memcgid = mem_cgroup_id(memcg: lruvec_memcg(lruvec));
400	eviction = atomic_long_read(v: &lruvec->nonresident_age);
401	eviction >>= bucket_order;
402	workingset_age_nonresident(lruvec, nr_pages: folio_nr_pages(folio));
403	return pack_shadow(memcgid, pgdat, eviction,
404	workingset: folio_test_workingset(folio));
405	}
406
407	/**
408	* workingset_test_recent - tests if the shadow entry is for a folio that was
409	* recently evicted. Also fills in @workingset with the value unpacked from
410	* shadow.
411	* @shadow: the shadow entry to be tested.
412	* @file: whether the corresponding folio is from the file lru.
413	* @workingset: where the workingset value unpacked from shadow should
414	* be stored.
415	*
416	* Return: true if the shadow is for a recently evicted folio; false otherwise.
417	*/
418	bool workingset_test_recent(void shadow, bool file, bool workingset)
419	{
420	struct mem_cgroup *eviction_memcg;
421	struct lruvec *eviction_lruvec;
422	unsigned long refault_distance;
423	unsigned long workingset_size;
424	unsigned long refault;
425	int memcgid;
426	struct pglist_data *pgdat;
427	unsigned long eviction;
428
429	rcu_read_lock();
430
431	if (lru_gen_enabled()) {
432	bool recent = lru_gen_test_recent(shadow, file,
433	lruvec: &eviction_lruvec, token: &eviction, workingset);
434
435	rcu_read_unlock();
436	return recent;
437	}
438
439
440	unpack_shadow(shadow, memcgidp: &memcgid, pgdat: &pgdat, evictionp: &eviction, workingsetp: workingset);
441	eviction <<= bucket_order;
442
443	/*
444	* Look up the memcg associated with the stored ID. It might
445	* have been deleted since the folio's eviction.
446	*
447	* Note that in rare events the ID could have been recycled
448	* for a new cgroup that refaults a shared folio. This is
449	* impossible to tell from the available data. However, this
450	* should be a rare and limited disturbance, and activations
451	* are always speculative anyway. Ultimately, it's the aging
452	* algorithm's job to shake out the minimum access frequency
453	* for the active cache.
454	*
455	* XXX: On !CONFIG_MEMCG, this will always return NULL; it
456	* would be better if the root_mem_cgroup existed in all
457	* configurations instead.
458	*/
459	eviction_memcg = mem_cgroup_from_id(id: memcgid);
460	if (!mem_cgroup_disabled() &&
461	(!eviction_memcg \|\| !mem_cgroup_tryget(memcg: eviction_memcg))) {
462	rcu_read_unlock();
463	return false;
464	}
465
466	rcu_read_unlock();
467
468	/*
469	* Flush stats (and potentially sleep) outside the RCU read section.
470	* XXX: With per-memcg flushing and thresholding, is ratelimiting
471	* still needed here?
472	*/
473	mem_cgroup_flush_stats_ratelimited(memcg: eviction_memcg);
474
475	eviction_lruvec = mem_cgroup_lruvec(memcg: eviction_memcg, pgdat);
476	refault = atomic_long_read(v: &eviction_lruvec->nonresident_age);
477
478	/*
479	* Calculate the refault distance
480	*
481	* The unsigned subtraction here gives an accurate distance
482	* across nonresident_age overflows in most cases. There is a
483	* special case: usually, shadow entries have a short lifetime
484	* and are either refaulted or reclaimed along with the inode
485	* before they get too old. But it is not impossible for the
486	* nonresident_age to lap a shadow entry in the field, which
487	* can then result in a false small refault distance, leading
488	* to a false activation should this old entry actually
489	* refault again. However, earlier kernels used to deactivate
490	* unconditionally with every reclaim invocation for the
491	* longest time, so the occasional inappropriate activation
492	* leading to pressure on the active list is not a problem.
493	*/
494	refault_distance = (refault - eviction) & EVICTION_MASK;
495
496	/*
497	* Compare the distance to the existing workingset size. We
498	* don't activate pages that couldn't stay resident even if
499	* all the memory was available to the workingset. Whether
500	* workingset competition needs to consider anon or not depends
501	* on having free swap space.
502	*/
503	workingset_size = lruvec_page_state(lruvec: eviction_lruvec, idx: NR_ACTIVE_FILE);
504	if (!file) {
505	workingset_size += lruvec_page_state(lruvec: eviction_lruvec,
506	idx: NR_INACTIVE_FILE);
507	}
508	if (mem_cgroup_get_nr_swap_pages(memcg: eviction_memcg) > `0`) {
509	workingset_size += lruvec_page_state(lruvec: eviction_lruvec,
510	idx: NR_ACTIVE_ANON);
511	if (file) {
512	workingset_size += lruvec_page_state(lruvec: eviction_lruvec,
513	idx: NR_INACTIVE_ANON);
514	}
515	}
516
517	mem_cgroup_put(memcg: eviction_memcg);
518	return refault_distance <= workingset_size;
519	}
520
521	/**
522	* workingset_refault - Evaluate the refault of a previously evicted folio.
523	* @folio: The freshly allocated replacement folio.
524	* @shadow: Shadow entry of the evicted folio.
525	*
526	* Calculates and evaluates the refault distance of the previously
527	* evicted folio in the context of the node and the memcg whose memory
528	* pressure caused the eviction.
529	*/
530	void workingset_refault(struct folio folio, void* *shadow)
531	{
532	bool file = folio_is_file_lru(folio);
533	struct pglist_data *pgdat;
534	struct mem_cgroup *memcg;
535	struct lruvec *lruvec;
536	bool workingset;
537	long nr;
538
539	if (lru_gen_enabled()) {
540	lru_gen_refault(folio, shadow);
541	return;
542	}
543
544	/*
545	* The activation decision for this folio is made at the level
546	* where the eviction occurred, as that is where the LRU order
547	* during folio reclaim is being determined.
548	*
549	* However, the cgroup that will own the folio is the one that
550	* is actually experiencing the refault event. Make sure the folio is
551	* locked to guarantee folio_memcg() stability throughout.
552	*/
553	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
554	nr = folio_nr_pages(folio);
555	memcg = folio_memcg(folio);
556	pgdat = folio_pgdat(folio);
557	lruvec = mem_cgroup_lruvec(memcg, pgdat);
558
559	mod_lruvec_state(lruvec, idx: WORKINGSET_REFAULT_BASE + file, val: nr);
560
561	if (!workingset_test_recent(shadow, file, workingset: &workingset))
562	return;
563
564	folio_set_active(folio);
565	workingset_age_nonresident(lruvec, nr_pages: nr);
566	mod_lruvec_state(lruvec, idx: WORKINGSET_ACTIVATE_BASE + file, val: nr);
567
568	/ Folio was active prior to eviction /
569	if (workingset) {
570	folio_set_workingset(folio);
571	/*
572	* XXX: Move to folio_add_lru() when it supports new vs
573	* putback
574	*/
575	lru_note_cost_refault(folio);
576	mod_lruvec_state(lruvec, idx: WORKINGSET_RESTORE_BASE + file, val: nr);
577	}
578	}
579
/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 *
 * Ages the non-resident entries of the folio's lruvec (and its
 * ancestors) by the number of pages in @folio.
 */
void workingset_activation(struct folio *folio)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = folio_memcg_rcu(folio);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
	rcu_read_unlock();
}
603
604	/*
605	* Shadow entries reflect the share of the working set that does not
606	* fit into memory, so their number depends on the access pattern of
607	* the workload. In most cases, they will refault or get reclaimed
608	* along with the inode, but a (malicious) workload that streams
609	* through files with a total size several times that of available
610	* memory, while preventing the inodes from being reclaimed, can
611	* create excessive amounts of shadow nodes. To keep a lid on this,
612	* track shadow nodes and reclaim them when they grow way past the
613	* point where they would still be useful.
614	*/
615
616	struct list_lru shadow_nodes;
617
618	void workingset_update_node(struct xa_node *node)
619	{
620	struct address_space *mapping;
621
622	/*
623	* Track non-empty nodes that contain only shadow entries;
624	* unlink those that contain pages or are being freed.
625	*
626	* Avoid acquiring the list_lru lock when the nodes are
627	* already where they should be. The list_empty() test is safe
628	* as node->private_list is protected by the i_pages lock.
629	*/
630	mapping = container_of(node->array, struct address_space, i_pages);
631	lockdep_assert_held(&mapping->i_pages.xa_lock);
632
633	if (node->count && node->count == node->nr_values) {
634	if (list_empty(head: &node->private_list)) {
635	list_lru_add_obj(lru: &shadow_nodes, item: &node->private_list);
636	__inc_lruvec_kmem_state(p: node, idx: WORKINGSET_NODES);
637	}
638	} else {
639	if (!list_empty(head: &node->private_list)) {
640	list_lru_del_obj(lru: &shadow_nodes, item: &node->private_list);
641	__dec_lruvec_kmem_state(p: node, idx: WORKINGSET_NODES);
642	}
643	}
644	}
645
646	static unsigned long count_shadow_nodes(struct shrinker *shrinker,
647	struct shrink_control *sc)
648	{
649	unsigned long max_nodes;
650	unsigned long nodes;
651	unsigned long pages;
652
653	nodes = list_lru_shrink_count(lru: &shadow_nodes, sc);
654	if (!nodes)
655	return SHRINK_EMPTY;
656
657	/*
658	* Approximate a reasonable limit for the nodes
659	* containing shadow entries. We don't need to keep more
660	* shadow entries than possible pages on the active list,
661	* since refault distances bigger than that are dismissed.
662	*
663	* The size of the active list converges toward 100% of
664	* overall page cache as memory grows, with only a tiny
665	* inactive list. Assume the total cache size for that.
666	*
667	* Nodes might be sparsely populated, with only one shadow
668	* entry in the extreme case. Obviously, we cannot keep one
669	* node for every eligible shadow entry, so compromise on a
670	* worst-case density of 1/8th. Below that, not all eligible
671	* refaults can be detected anymore.
672	*
673	* On 64-bit with 7 xa_nodes per page and 64 slots
674	* each, this will reclaim shadow entries when they consume
675	* ~1.8% of available memory:
676	*
677	* PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
678	*/
679	#ifdef CONFIG_MEMCG
680	if (sc->memcg) {
681	struct lruvec *lruvec;
682	int i;
683
684	mem_cgroup_flush_stats_ratelimited(memcg: sc->memcg);
685	lruvec = mem_cgroup_lruvec(memcg: sc->memcg, NODE_DATA(sc->nid));
686	for (pages = `0`, i = `0`; i < NR_LRU_LISTS; i++)
687	pages += lruvec_page_state_local(lruvec,
688	idx: NR_LRU_BASE + i);
689	pages += lruvec_page_state_local(
690	lruvec, idx: NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
691	pages += lruvec_page_state_local(
692	lruvec, idx: NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
693	} else
694	#endif
695	pages = node_present_pages(sc->nid);
696
697	max_nodes = pages >> (XA_CHUNK_SHIFT - `3`);
698
699	if (nodes <= max_nodes)
700	return `0`;
701	return nodes - max_nodes;
702	}
703
704	static enum lru_status shadow_lru_isolate(struct list_head *item,
705	struct list_lru_one *lru,
706	spinlock_t *lru_lock,
707	void *arg) __must_hold(lru_lock)
708	{
709	struct xa_node node = container_of(item, struct* xa_node, private_list);
710	struct address_space *mapping;
711	int ret;
712
713	/*
714	* Page cache insertions and deletions synchronously maintain
715	* the shadow node LRU under the i_pages lock and the
716	* lru_lock. Because the page cache tree is emptied before
717	* the inode can be destroyed, holding the lru_lock pins any
718	* address_space that has nodes on the LRU.
719	*
720	* We can then safely transition to the i_pages lock to
721	* pin only the address_space of the particular node we want
722	* to reclaim, take the node off-LRU, and drop the lru_lock.
723	*/
724
725	mapping = container_of(node->array, struct address_space, i_pages);
726
727	/ Coming from the list, invert the lock order /
728	if (!xa_trylock(&mapping->i_pages)) {
729	spin_unlock_irq(lock: lru_lock);
730	ret = LRU_RETRY;
731	goto out;
732	}
733
734	/ For page cache we need to hold i_lock /
735	if (mapping->host != NULL) {
736	if (!spin_trylock(lock: &mapping->host->i_lock)) {
737	xa_unlock(&mapping->i_pages);
738	spin_unlock_irq(lock: lru_lock);
739	ret = LRU_RETRY;
740	goto out;
741	}
742	}
743
744	list_lru_isolate(list: lru, item);
745	__dec_lruvec_kmem_state(p: node, idx: WORKINGSET_NODES);
746
747	spin_unlock(lock: lru_lock);
748
749	/*
750	* The nodes should only contain one or more shadow entries,
751	* no pages, so we expect to be able to remove them all and
752	* delete and free the empty node afterwards.
753	*/
754	if (WARN_ON_ONCE(!node->nr_values))
755	goto out_invalid;
756	if (WARN_ON_ONCE(node->count != node->nr_values))
757	goto out_invalid;
758	xa_delete_node(node, workingset_update_node);
759	__inc_lruvec_kmem_state(p: node, idx: WORKINGSET_NODERECLAIM);
760
761	out_invalid:
762	xa_unlock_irq(&mapping->i_pages);
763	if (mapping->host != NULL) {
764	if (mapping_shrinkable(mapping))
765	inode_add_lru(inode: mapping->host);
766	spin_unlock(lock: &mapping->host->i_lock);
767	}
768	ret = LRU_REMOVED_RETRY;
769	out:
770	cond_resched();
771	spin_lock_irq(lock: lru_lock);
772	return ret;
773	}
774
775	static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
776	struct shrink_control *sc)
777	{
778	/ list_lru lock nests inside the IRQ-safe i_pages lock /
779	return list_lru_shrink_walk_irq(lru: &shadow_nodes, sc, isolate: shadow_lru_isolate,
780	NULL);
781	}
782
783	/*
784	* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
785	* i_pages lock.
786	*/
787	static struct lock_class_key shadow_nodes_key;
788
789	static int __init workingset_init(void)
790	{
791	struct shrinker *workingset_shadow_shrinker;
792	unsigned int timestamp_bits;
793	unsigned int max_order;
794	int ret = -ENOMEM;
795
796	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
797	/*
798	* Calculate the eviction bucket size to cover the longest
799	* actionable refault distance, which is currently half of
800	* memory (totalram_pages/2). However, memory hotplug may add
801	* some more pages at runtime, so keep working with up to
802	* double the initial memory by using totalram_pages as-is.
803	*/
804	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
805	max_order = fls_long(l: totalram_pages() - `1`);
806	if (max_order > timestamp_bits)
807	bucket_order = max_order - timestamp_bits;
808	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
809	timestamp_bits, max_order, bucket_order);
810
811	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE \|
812	SHRINKER_MEMCG_AWARE,
813	fmt: "mm-shadow");
814	if (!workingset_shadow_shrinker)
815	goto err;
816
817	ret = __list_lru_init(lru: &shadow_nodes, memcg_aware: true, key: &shadow_nodes_key,
818	shrinker: workingset_shadow_shrinker);
819	if (ret)
820	goto err_list_lru;
821
822	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
823	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
824	/ ->count reports only fully expendable nodes /
825	workingset_shadow_shrinker->seeks = `0`;
826
827	shrinker_register(shrinker: workingset_shadow_shrinker);
828	return `0`;
829	err_list_lru:
830	shrinker_free(shrinker: workingset_shadow_shrinker);
831	err:
832	return ret;
833	}
834	module_init(workingset_init);
835

source code of linux/mm/workingset.c