1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_log_format.h"
13#include "xfs_trans.h"
14#include "xfs_inode.h"
15#include "xfs_icache.h"
16#include "xfs_iwalk.h"
17#include "xfs_ialloc.h"
18#include "xfs_dir2.h"
19#include "xfs_dir2_priv.h"
20#include "xfs_ag.h"
21#include "xfs_parent.h"
22#include "scrub/scrub.h"
23#include "scrub/common.h"
24#include "scrub/repair.h"
25#include "scrub/xfile.h"
26#include "scrub/xfarray.h"
27#include "scrub/iscan.h"
28#include "scrub/orphanage.h"
29#include "scrub/nlinks.h"
30#include "scrub/trace.h"
31#include "scrub/readdir.h"
32#include "scrub/tempfile.h"
33#include "scrub/listxattr.h"
34
35/*
36 * Live Inode Link Count Checking
37 * ==============================
38 *
39 * Inode link counts are "summary" metadata, in the sense that they are
40 * computed as the number of directory entries referencing each file on the
41 * filesystem.  Therefore, we compute the correct link counts by creating a
42 * shadow link count structure and walking every inode.
43 */
44
45/* Set us up to scrub inode link counts. */
46int
47xchk_setup_nlinks(
48	struct xfs_scrub	*sc)
49{
50	struct xchk_nlink_ctrs	*xnc;
51	int			error;
52
53	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
54
55	if (xchk_could_repair(sc)) {
56		error = xrep_setup_nlinks(sc);
57		if (error)
58			return error;
59	}
60
61	xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
62	if (!xnc)
63		return -ENOMEM;
64	xnc->xname.name = xnc->namebuf;
65	xnc->sc = sc;
66	sc->buf = xnc;
67
68	return xchk_setup_fs(sc);
69}
70
71/*
72 * Part 1: Collecting file link counts.  For each file, we create a shadow link
73 * counting structure, then walk the entire directory tree, incrementing parent
74 * and child link counts for each directory entry seen.
75 *
76 * To avoid false corruption reports in part 2, any failure in this part must
77 * set the INCOMPLETE flag even when a negative errno is returned.  This care
78 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
79 * ECANCELED) that are absorbed into a scrub state flag update by
80 * xchk_*_process_error.  Scrub and repair share the same incore data
81 * structures, so the INCOMPLETE flag is critical to prevent a repair based on
82 * insufficient information.
83 *
84 * Because we are scanning a live filesystem, it's possible that another thread
85 * will try to update the link counts for an inode that we've already scanned.
86 * This will cause our counts to be incorrect.  Therefore, we hook all
87 * directory entry updates because that is when link count updates occur.  By
88 * shadowing transaction updates in this manner, live nlink check can ensure by
89 * locking the inode and the shadow structure that its own copies are not out
90 * of date.  Because the hook code runs in a different process context from the
91 * scrub code and the scrub state flags are not accessed atomically, failures
92 * in the hook code must abort the iscan and the scrubber must notice the
93 * aborted scan and set the incomplete flag.
94 *
95 * Note that we use jump labels and srcu notifier hooks to minimize the
96 * overhead when live nlinks is /not/ running.  Locking order for nlink
97 * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
98 */
99
100/*
101 * Add a delta to an nlink counter, clamping the value to U32_MAX.  Because
102 * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
103 * even if we lose some precision.
104 */
105static inline void
106careful_add(
107	xfs_nlink_t	*nlinkp,
108	int		delta)
109{
110	uint64_t	new_value = (uint64_t)(*nlinkp) + delta;
111
112	BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
113	*nlinkp = min_t(uint64_t, new_value, U32_MAX);
114}
115
116/* Update incore link count information.  Caller must hold the nlinks lock. */
117STATIC int
118xchk_nlinks_update_incore(
119	struct xchk_nlink_ctrs	*xnc,
120	xfs_ino_t		ino,
121	int			parents_delta,
122	int			backrefs_delta,
123	int			children_delta)
124{
125	struct xchk_nlink	nl;
126	int			error;
127
128	if (!xnc->nlinks)
129		return 0;
130
131	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
132	if (error)
133		return error;
134
135	trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
136			backrefs_delta, children_delta);
137
138	careful_add(&nl.parents, parents_delta);
139	careful_add(&nl.backrefs, backrefs_delta);
140	careful_add(&nl.children, children_delta);
141
142	nl.flags |= XCHK_NLINK_WRITTEN;
143	error = xfarray_store(xnc->nlinks, ino, &nl);
144	if (error == -EFBIG) {
145		/*
146		 * EFBIG means we tried to store data at too high a byte offset
147		 * in the sparse array.  IOWs, we cannot complete the check and
148		 * must notify userspace that the check was incomplete.
149		 */
150		error = -ECANCELED;
151	}
152	return error;
153}
154
155/*
156 * Apply a link count change from the regular filesystem into our shadow link
157 * count structure based on a directory update in progress.
158 */
159STATIC int
160xchk_nlinks_live_update(
161	struct notifier_block		*nb,
162	unsigned long			action,
163	void				*data)
164{
165	struct xfs_dir_update_params	*p = data;
166	struct xchk_nlink_ctrs		*xnc;
167	int				error;
168
169	xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
170
171	/*
172	 * Ignore temporary directories being used to stage dir repairs, since
173	 * we don't bump the link counts of the children.
174	 */
175	if (xrep_is_tempfile(p->dp))
176		return NOTIFY_DONE;
177
178	trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
179			p->delta, p->name->name, p->name->len);
180
181	/*
182	 * If we've already scanned @dp, update the number of parents that link
183	 * to @ip.  If @ip is a subdirectory, update the number of child links
184	 * going out of @dp.
185	 */
186	if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
187		mutex_lock(&xnc->lock);
188		error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
189				0, 0);
190		if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
191			error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
192					0, p->delta);
193		mutex_unlock(&xnc->lock);
194		if (error)
195			goto out_abort;
196	}
197
198	/*
199	 * If @ip is a subdirectory and we've already scanned it, update the
200	 * number of backrefs pointing to @dp.
201	 */
202	if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
203	    xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
204		mutex_lock(&xnc->lock);
205		error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
206				p->delta, 0);
207		mutex_unlock(&xnc->lock);
208		if (error)
209			goto out_abort;
210	}
211
212	return NOTIFY_DONE;
213
214out_abort:
215	xchk_iscan_abort(&xnc->collect_iscan);
216	return NOTIFY_DONE;
217}
218
219/* Bump the observed link count for the inode referenced by this entry. */
220STATIC int
221xchk_nlinks_collect_dirent(
222	struct xfs_scrub	*sc,
223	struct xfs_inode	*dp,
224	xfs_dir2_dataptr_t	dapos,
225	const struct xfs_name	*name,
226	xfs_ino_t		ino,
227	void			*priv)
228{
229	struct xchk_nlink_ctrs	*xnc = priv;
230	bool			dot = false, dotdot = false;
231	int			error;
232
233	/* Does this name make sense? */
234	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
235		error = -ECANCELED;
236		goto out_abort;
237	}
238
239	if (name->len == 1 && name->name[0] == '.')
240		dot = true;
241	else if (name->len == 2 && name->name[0] == '.' &&
242				   name->name[1] == '.')
243		dotdot = true;
244
245	/* Don't accept a '.' entry that points somewhere else. */
246	if (dot && ino != dp->i_ino) {
247		error = -ECANCELED;
248		goto out_abort;
249	}
250
251	/* Don't accept an invalid inode number. */
252	if (!xfs_verify_dir_ino(sc->mp, ino)) {
253		error = -ECANCELED;
254		goto out_abort;
255	}
256
257	/* Update the shadow link counts if we haven't already failed. */
258
259	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
260		error = -ECANCELED;
261		goto out_incomplete;
262	}
263
264	trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
265
266	mutex_lock(&xnc->lock);
267
268	/*
269	 * If this is a dotdot entry, it is a back link from dp to ino.  How
270	 * we handle this depends on whether or not dp is the root directory.
271	 *
272	 * The root directory is its own parent, so we pretend the dotdot entry
273	 * establishes the "parent" of the root directory.  Increment the
274	 * number of parents of the root directory.
275	 *
276	 * Otherwise, increment the number of backrefs pointing back to ino.
277	 *
278	 * If the filesystem has parent pointers, we walk the pptrs to
279	 * determine the backref count.
280	 */
281	if (dotdot) {
282		if (dp == sc->mp->m_rootip)
283			error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
284		else if (!xfs_has_parent(sc->mp))
285			error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
286		else
287			error = 0;
288		if (error)
289			goto out_unlock;
290	}
291
292	/*
293	 * If this dirent is a forward link from dp to ino, increment the
294	 * number of parents linking into ino.
295	 */
296	if (!dot && !dotdot) {
297		error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
298		if (error)
299			goto out_unlock;
300	}
301
302	/*
303	 * If this dirent is a forward link to a subdirectory, increment the
304	 * number of child links of dp.
305	 */
306	if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
307		error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
308		if (error)
309			goto out_unlock;
310	}
311
312	mutex_unlock(&xnc->lock);
313	return 0;
314
315out_unlock:
316	mutex_unlock(&xnc->lock);
317out_abort:
318	xchk_iscan_abort(&xnc->collect_iscan);
319out_incomplete:
320	xchk_set_incomplete(sc);
321	return error;
322}
323
324/* Bump the backref count for the inode referenced by this parent pointer. */
325STATIC int
326xchk_nlinks_collect_pptr(
327	struct xfs_scrub		*sc,
328	struct xfs_inode		*ip,
329	unsigned int			attr_flags,
330	const unsigned char		*name,
331	unsigned int			namelen,
332	const void			*value,
333	unsigned int			valuelen,
334	void				*priv)
335{
336	struct xfs_name			xname = {
337		.name			= name,
338		.len			= namelen,
339	};
340	struct xchk_nlink_ctrs		*xnc = priv;
341	const struct xfs_parent_rec	*pptr_rec = value;
342	xfs_ino_t			parent_ino;
343	int				error;
344
345	/* Update the shadow link counts if we haven't already failed. */
346
347	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
348		error = -ECANCELED;
349		goto out_incomplete;
350	}
351
352	if (!(attr_flags & XFS_ATTR_PARENT))
353		return 0;
354
355	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
356			valuelen, &parent_ino, NULL);
357	if (error)
358		return error;
359
360	trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec);
361
362	mutex_lock(&xnc->lock);
363
364	error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0);
365	if (error)
366		goto out_unlock;
367
368	mutex_unlock(&xnc->lock);
369	return 0;
370
371out_unlock:
372	mutex_unlock(&xnc->lock);
373	xchk_iscan_abort(&xnc->collect_iscan);
374out_incomplete:
375	xchk_set_incomplete(sc);
376	return error;
377}
378
379/* Walk a directory to bump the observed link counts of the children. */
380STATIC int
381xchk_nlinks_collect_dir(
382	struct xchk_nlink_ctrs	*xnc,
383	struct xfs_inode	*dp)
384{
385	struct xfs_scrub	*sc = xnc->sc;
386	unsigned int		lock_mode;
387	int			error = 0;
388
389	/*
390	 * Ignore temporary directories being used to stage dir repairs, since
391	 * we don't bump the link counts of the children.
392	 */
393	if (xrep_is_tempfile(dp))
394		return 0;
395
396	/* Prevent anyone from changing this directory while we walk it. */
397	xfs_ilock(dp, XFS_IOLOCK_SHARED);
398	lock_mode = xfs_ilock_data_map_shared(dp);
399
400	/*
401	 * The dotdot entry of an unlinked directory still points to the last
402	 * parent, but the parent no longer links to this directory.  Skip the
403	 * directory to avoid overcounting.
404	 */
405	if (VFS_I(dp)->i_nlink == 0)
406		goto out_unlock;
407
408	/*
409	 * We cannot count file links if the directory looks as though it has
410	 * been zapped by the inode record repair code.
411	 */
412	if (xchk_dir_looks_zapped(dp)) {
413		error = -EBUSY;
414		goto out_abort;
415	}
416
417	error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
418	if (error == -ECANCELED) {
419		error = 0;
420		goto out_unlock;
421	}
422	if (error)
423		goto out_abort;
424
425	/* Walk the parent pointers to get real backref counts. */
426	if (xfs_has_parent(sc->mp)) {
427		/*
428		 * If the extended attributes look as though they has been
429		 * zapped by the inode record repair code, we cannot scan for
430		 * parent pointers.
431		 */
432		if (xchk_pptr_looks_zapped(dp)) {
433			error = -EBUSY;
434			goto out_unlock;
435		}
436
437		error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL,
438				xnc);
439		if (error == -ECANCELED) {
440			error = 0;
441			goto out_unlock;
442		}
443		if (error)
444			goto out_abort;
445	}
446
447	xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
448	goto out_unlock;
449
450out_abort:
451	xchk_set_incomplete(sc);
452	xchk_iscan_abort(&xnc->collect_iscan);
453out_unlock:
454	xfs_iunlock(dp, lock_mode);
455	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
456	return error;
457}
458
459/* If this looks like a valid pointer, count it. */
460static inline int
461xchk_nlinks_collect_metafile(
462	struct xchk_nlink_ctrs	*xnc,
463	xfs_ino_t		ino)
464{
465	if (!xfs_verify_ino(xnc->sc->mp, ino))
466		return 0;
467
468	trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
469	return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
470}
471
472/* Bump the link counts of metadata files rooted in the superblock. */
473STATIC int
474xchk_nlinks_collect_metafiles(
475	struct xchk_nlink_ctrs	*xnc)
476{
477	struct xfs_mount	*mp = xnc->sc->mp;
478	int			error = -ECANCELED;
479
480
481	if (xchk_iscan_aborted(&xnc->collect_iscan))
482		goto out_incomplete;
483
484	mutex_lock(&xnc->lock);
485	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
486	if (error)
487		goto out_abort;
488
489	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
490	if (error)
491		goto out_abort;
492
493	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
494	if (error)
495		goto out_abort;
496
497	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
498	if (error)
499		goto out_abort;
500
501	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
502	if (error)
503		goto out_abort;
504	mutex_unlock(&xnc->lock);
505
506	return 0;
507
508out_abort:
509	mutex_unlock(&xnc->lock);
510	xchk_iscan_abort(&xnc->collect_iscan);
511out_incomplete:
512	xchk_set_incomplete(xnc->sc);
513	return error;
514}
515
516/* Advance the collection scan cursor for this non-directory file. */
517static inline int
518xchk_nlinks_collect_file(
519	struct xchk_nlink_ctrs	*xnc,
520	struct xfs_inode	*ip)
521{
522	xfs_ilock(ip, XFS_IOLOCK_SHARED);
523	xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
524	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
525	return 0;
526}
527
528/* Walk all directories and count inode links. */
529STATIC int
530xchk_nlinks_collect(
531	struct xchk_nlink_ctrs	*xnc)
532{
533	struct xfs_scrub	*sc = xnc->sc;
534	struct xfs_inode	*ip;
535	int			error;
536
537	/* Count the rt and quota files that are rooted in the superblock. */
538	error = xchk_nlinks_collect_metafiles(xnc);
539	if (error)
540		return error;
541
542	/*
543	 * Set up for a potentially lengthy filesystem scan by reducing our
544	 * transaction resource usage for the duration.  Specifically:
545	 *
546	 * Cancel the transaction to release the log grant space while we scan
547	 * the filesystem.
548	 *
549	 * Create a new empty transaction to eliminate the possibility of the
550	 * inode scan deadlocking on cyclical metadata.
551	 *
552	 * We pass the empty transaction to the file scanning function to avoid
553	 * repeatedly cycling empty transactions.  This can be done even though
554	 * we take the IOLOCK to quiesce the file because empty transactions
555	 * do not take sb_internal.
556	 */
557	xchk_trans_cancel(sc);
558	error = xchk_trans_alloc_empty(sc);
559	if (error)
560		return error;
561
562	while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
563		if (S_ISDIR(VFS_I(ip)->i_mode))
564			error = xchk_nlinks_collect_dir(xnc, ip);
565		else
566			error = xchk_nlinks_collect_file(xnc, ip);
567		xchk_irele(sc, ip);
568		if (error)
569			break;
570
571		if (xchk_should_terminate(sc, &error))
572			break;
573	}
574	xchk_iscan_iter_finish(&xnc->collect_iscan);
575	if (error) {
576		xchk_set_incomplete(sc);
577		/*
578		 * If we couldn't grab an inode that was busy with a state
579		 * change, change the error code so that we exit to userspace
580		 * as quickly as possible.
581		 */
582		if (error == -EBUSY)
583			return -ECANCELED;
584		return error;
585	}
586
587	/*
588	 * Switch out for a real transaction in preparation for building a new
589	 * tree.
590	 */
591	xchk_trans_cancel(sc);
592	return xchk_setup_fs(sc);
593}
594
595/*
596 * Part 2: Comparing file link counters.  Walk each inode and compare the link
597 * counts against our shadow information; and then walk each shadow link count
598 * structure (that wasn't covered in the first part), comparing it against the
599 * file.
600 */
601
602/* Read the observed link count for comparison with the actual inode. */
603STATIC int
604xchk_nlinks_comparison_read(
605	struct xchk_nlink_ctrs	*xnc,
606	xfs_ino_t		ino,
607	struct xchk_nlink	*obs)
608{
609	struct xchk_nlink	nl;
610	int			error;
611
612	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
613	if (error)
614		return error;
615
616	nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
617
618	error = xfarray_store(xnc->nlinks, ino, &nl);
619	if (error == -EFBIG) {
620		/*
621		 * EFBIG means we tried to store data at too high a byte offset
622		 * in the sparse array.  IOWs, we cannot complete the check and
623		 * must notify userspace that the check was incomplete.  This
624		 * shouldn't really happen outside of the collection phase.
625		 */
626		xchk_set_incomplete(xnc->sc);
627		return -ECANCELED;
628	}
629	if (error)
630		return error;
631
632	/* Copy the counters, but do not expose the internal state. */
633	obs->parents = nl.parents;
634	obs->backrefs = nl.backrefs;
635	obs->children = nl.children;
636	obs->flags = 0;
637	return 0;
638}
639
640/* Check our link count against an inode. */
641STATIC int
642xchk_nlinks_compare_inode(
643	struct xchk_nlink_ctrs	*xnc,
644	struct xfs_inode	*ip)
645{
646	struct xchk_nlink	obs;
647	struct xfs_scrub	*sc = xnc->sc;
648	uint64_t		total_links;
649	unsigned int		actual_nlink;
650	int			error;
651
652	/*
653	 * Ignore temporary files being used to stage repairs, since we assume
654	 * they're correct for non-directories, and the directory repair code
655	 * doesn't bump the link counts for the children.
656	 */
657	if (xrep_is_tempfile(ip))
658		return 0;
659
660	xfs_ilock(ip, XFS_ILOCK_SHARED);
661	mutex_lock(&xnc->lock);
662
663	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
664		xchk_set_incomplete(xnc->sc);
665		error = -ECANCELED;
666		goto out_scanlock;
667	}
668
669	error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
670	if (error)
671		goto out_scanlock;
672
673	/*
674	 * If we don't have ftype to get an accurate count of the subdirectory
675	 * entries in this directory, take advantage of the fact that on a
676	 * consistent ftype=0 filesystem, the number of subdirectory
677	 * backreferences (dotdot entries) pointing towards this directory
678	 * should be equal to the number of subdirectory entries in the
679	 * directory.
680	 */
681	if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
682		obs.children = obs.backrefs;
683
684	total_links = xchk_nlink_total(ip, &obs);
685	actual_nlink = VFS_I(ip)->i_nlink;
686
687	trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
688
689	/*
690	 * If we found so many parents that we'd overflow i_nlink, we must flag
691	 * this as a corruption.  The VFS won't let users increase the link
692	 * count, but it will let them decrease it.
693	 */
694	if (total_links > XFS_NLINK_PINNED) {
695		xchk_ino_set_corrupt(sc, ip->i_ino);
696		goto out_corrupt;
697	} else if (total_links > XFS_MAXLINK) {
698		xchk_ino_set_warning(sc, ip->i_ino);
699	}
700
701	/* Link counts should match. */
702	if (total_links != actual_nlink) {
703		xchk_ino_set_corrupt(sc, ip->i_ino);
704		goto out_corrupt;
705	}
706
707	if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
708		/*
709		 * The collection phase ignores directories with zero link
710		 * count, so we ignore them here too.
711		 *
712		 * The number of subdirectory backreferences (dotdot entries)
713		 * pointing towards this directory should be equal to the
714		 * number of subdirectory entries in the directory.
715		 */
716		if (obs.children != obs.backrefs)
717			xchk_ino_xref_set_corrupt(sc, ip->i_ino);
718	} else {
719		/*
720		 * Non-directories and unlinked directories should not have
721		 * back references.
722		 */
723		if (obs.backrefs != 0) {
724			xchk_ino_set_corrupt(sc, ip->i_ino);
725			goto out_corrupt;
726		}
727
728		/*
729		 * Non-directories and unlinked directories should not have
730		 * children.
731		 */
732		if (obs.children != 0) {
733			xchk_ino_set_corrupt(sc, ip->i_ino);
734			goto out_corrupt;
735		}
736	}
737
738	if (ip == sc->mp->m_rootip) {
739		/*
740		 * For the root of a directory tree, both the '.' and '..'
741		 * entries should point to the root directory.  The dotdot
742		 * entry is counted as a parent of the root /and/ a backref of
743		 * the root directory.
744		 */
745		if (obs.parents != 1) {
746			xchk_ino_set_corrupt(sc, ip->i_ino);
747			goto out_corrupt;
748		}
749	} else if (actual_nlink > 0) {
750		/*
751		 * Linked files that are not the root directory should have at
752		 * least one parent.
753		 */
754		if (obs.parents == 0) {
755			xchk_ino_set_corrupt(sc, ip->i_ino);
756			goto out_corrupt;
757		}
758	}
759
760out_corrupt:
761	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
762		error = -ECANCELED;
763out_scanlock:
764	mutex_unlock(&xnc->lock);
765	xfs_iunlock(ip, XFS_ILOCK_SHARED);
766	return error;
767}
768
769/*
770 * Check our link count against an inode that wasn't checked previously.  This
771 * is intended to catch directories with dangling links, though we could be
772 * racing with inode allocation in other threads.
773 */
774STATIC int
775xchk_nlinks_compare_inum(
776	struct xchk_nlink_ctrs	*xnc,
777	xfs_ino_t		ino)
778{
779	struct xchk_nlink	obs;
780	struct xfs_mount	*mp = xnc->sc->mp;
781	struct xfs_trans	*tp = xnc->sc->tp;
782	struct xfs_buf		*agi_bp;
783	struct xfs_inode	*ip;
784	int			error;
785
786	/*
787	 * The first iget failed, so try again with the variant that returns
788	 * either an incore inode or the AGI buffer.  If the function returns
789	 * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
790	 * can guarantee that the inode won't be allocated while we check for
791	 * a zero link count in the observed link count data.
792	 */
793	error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
794	if (!error) {
795		/* Actually got an inode, so use the inode compare. */
796		error = xchk_nlinks_compare_inode(xnc, ip);
797		xchk_irele(xnc->sc, ip);
798		return error;
799	}
800	if (error == -ENOENT || error == -EINVAL) {
801		/* No inode was found.  Check for zero link count below. */
802		error = 0;
803	}
804	if (error)
805		goto out_agi;
806
807	/* Ensure that we have protected against inode allocation/freeing. */
808	if (agi_bp == NULL) {
809		ASSERT(agi_bp != NULL);
810		xchk_set_incomplete(xnc->sc);
811		return -ECANCELED;
812	}
813
814	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
815		xchk_set_incomplete(xnc->sc);
816		error = -ECANCELED;
817		goto out_agi;
818	}
819
820	mutex_lock(&xnc->lock);
821	error = xchk_nlinks_comparison_read(xnc, ino, &obs);
822	if (error)
823		goto out_scanlock;
824
825	trace_xchk_nlinks_check_zero(mp, ino, &obs);
826
827	/*
828	 * If we can't grab the inode, the link count had better be zero.  We
829	 * still hold the AGI to prevent inode allocation/freeing.
830	 */
831	if (xchk_nlink_total(NULL, &obs) != 0) {
832		xchk_ino_set_corrupt(xnc->sc, ino);
833		error = -ECANCELED;
834	}
835
836out_scanlock:
837	mutex_unlock(&xnc->lock);
838out_agi:
839	if (agi_bp)
840		xfs_trans_brelse(tp, agi_bp);
841	return error;
842}
843
844/*
845 * Try to visit every inode in the filesystem to compare the link count.  Move
846 * on if we can't grab an inode, since we'll revisit unchecked nlink records in
847 * the second part.
848 */
849static int
850xchk_nlinks_compare_iter(
851	struct xchk_nlink_ctrs	*xnc,
852	struct xfs_inode	**ipp)
853{
854	int			error;
855
856	do {
857		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
858	} while (error == -EBUSY);
859
860	return error;
861}
862
863/* Compare the link counts we observed against the live information. */
864STATIC int
865xchk_nlinks_compare(
866	struct xchk_nlink_ctrs	*xnc)
867{
868	struct xchk_nlink	nl;
869	struct xfs_scrub	*sc = xnc->sc;
870	struct xfs_inode	*ip;
871	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
872	int			error;
873
874	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
875		return 0;
876
877	/*
878	 * Create a new empty transaction so that we can advance the iscan
879	 * cursor without deadlocking if the inobt has a cycle and push on the
880	 * inactivation workqueue.
881	 */
882	xchk_trans_cancel(sc);
883	error = xchk_trans_alloc_empty(sc);
884	if (error)
885		return error;
886
887	/*
888	 * Use the inobt to walk all allocated inodes to compare the link
889	 * counts.  Inodes skipped by _compare_iter will be tried again in the
890	 * next phase of the scan.
891	 */
892	xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
893	while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
894		error = xchk_nlinks_compare_inode(xnc, ip);
895		xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
896		xchk_irele(sc, ip);
897		if (error)
898			break;
899
900		if (xchk_should_terminate(sc, &error))
901			break;
902	}
903	xchk_iscan_iter_finish(&xnc->compare_iscan);
904	xchk_iscan_teardown(&xnc->compare_iscan);
905	if (error)
906		return error;
907
908	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
909		return 0;
910
911	/*
912	 * Walk all the non-null nlink observations that weren't checked in the
913	 * previous step.
914	 */
915	mutex_lock(&xnc->lock);
916	while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
917		xfs_ino_t	ino = cur - 1;
918
919		if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
920			continue;
921
922		mutex_unlock(&xnc->lock);
923
924		error = xchk_nlinks_compare_inum(xnc, ino);
925		if (error)
926			return error;
927
928		if (xchk_should_terminate(xnc->sc, &error))
929			return error;
930
931		mutex_lock(&xnc->lock);
932	}
933	mutex_unlock(&xnc->lock);
934
935	return error;
936}
937
938/* Tear down everything associated with a nlinks check. */
939static void
940xchk_nlinks_teardown_scan(
941	void			*priv)
942{
943	struct xchk_nlink_ctrs	*xnc = priv;
944
945	/* Discourage any hook functions that might be running. */
946	xchk_iscan_abort(&xnc->collect_iscan);
947
948	xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
949
950	xfarray_destroy(xnc->nlinks);
951	xnc->nlinks = NULL;
952
953	xchk_iscan_teardown(&xnc->collect_iscan);
954	mutex_destroy(&xnc->lock);
955	xnc->sc = NULL;
956}
957
958/*
959 * Scan all inodes in the entire filesystem to generate link count data.  If
960 * the scan is successful, the counts will be left alive for a repair.  If any
961 * error occurs, we'll tear everything down.
962 */
963STATIC int
964xchk_nlinks_setup_scan(
965	struct xfs_scrub	*sc,
966	struct xchk_nlink_ctrs	*xnc)
967{
968	struct xfs_mount	*mp = sc->mp;
969	char			*descr;
970	unsigned long long	max_inos;
971	xfs_agnumber_t		last_agno = mp->m_sb.sb_agcount - 1;
972	xfs_agino_t		first_agino, last_agino;
973	int			error;
974
975	mutex_init(&xnc->lock);
976
977	/* Retry iget every tenth of a second for up to 30 seconds. */
978	xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
979
980	/*
981	 * Set up enough space to store an nlink record for the highest
982	 * possible inode number in this system.
983	 */
984	xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
985	max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
986	descr = xchk_xfile_descr(sc, "file link counts");
987	error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
988			sizeof(struct xchk_nlink), &xnc->nlinks);
989	kfree(descr);
990	if (error)
991		goto out_teardown;
992
993	/*
994	 * Hook into the directory entry code so that we can capture updates to
995	 * file link counts.  The hook only triggers for inodes that were
996	 * already scanned, and the scanner thread takes each inode's ILOCK,
997	 * which means that any in-progress inode updates will finish before we
998	 * can scan the inode.
999	 */
1000	ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
1001	xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
1002	error = xfs_dir_hook_add(mp, &xnc->dhook);
1003	if (error)
1004		goto out_teardown;
1005
1006	/* Use deferred cleanup to pass the inode link count data to repair. */
1007	sc->buf_cleanup = xchk_nlinks_teardown_scan;
1008	return 0;
1009
1010out_teardown:
1011	xchk_nlinks_teardown_scan(xnc);
1012	return error;
1013}
1014
1015/* Scrub the link count of all inodes on the filesystem. */
1016int
1017xchk_nlinks(
1018	struct xfs_scrub	*sc)
1019{
1020	struct xchk_nlink_ctrs	*xnc = sc->buf;
1021	int			error = 0;
1022
1023	/* Set ourselves up to check link counts on the live filesystem. */
1024	error = xchk_nlinks_setup_scan(sc, xnc);
1025	if (error)
1026		return error;
1027
1028	/* Walk all inodes, picking up link count information. */
1029	error = xchk_nlinks_collect(xnc);
1030	if (!xchk_xref_process_error(sc, 0, 0, &error))
1031		return error;
1032
1033	/* Fail fast if we're not playing with a full dataset. */
1034	if (xchk_iscan_aborted(&xnc->collect_iscan))
1035		xchk_set_incomplete(sc);
1036	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
1037		return 0;
1038
1039	/* Compare link counts. */
1040	error = xchk_nlinks_compare(xnc);
1041	if (!xchk_xref_process_error(sc, 0, 0, &error))
1042		return error;
1043
1044	/* Check one last time for an incomplete dataset. */
1045	if (xchk_iscan_aborted(&xnc->collect_iscan))
1046		xchk_set_incomplete(sc);
1047
1048	return 0;
1049}
1050