// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"

/* Common code for the metadata scrubbers. */

/*
 * Handling operational errors.
 *
 * The *_process_error() family of functions are used to process error return
 * codes from functions called as part of a scrub operation.
 *
 * If there's no error, we return true to tell the caller that it's ok
 * to move on to the next check in its list.
 *
 * For non-verifier errors (e.g. ENOMEM) we return false to tell the
 * caller that something bad happened, and we preserve *error so that
 * the caller can return the *error up the stack to userspace.
 *
 * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
 * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
 * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
 * not via return codes.  We return false to tell the caller that
 * something bad happened.  Since the error has been cleared, the caller
 * will (presumably) return that zero and scrubbing will move on to
 * whatever's next.
 *
 * ftrace can be used to record the precise metadata location and the
 * approximate code location of the failed operation.
 */

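/*
 * A minimal usage sketch, not code from this file (xchk_frobnicate_block()
 * is a made-up helper used purely for illustration): a scrubber that has
 * just called into some other piece of XFS filters the return code like so:
 *
 *	error = xchk_frobnicate_block(sc, agbno);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;
 *
 * If the helper returns false, the scrubber returns *error, which will
 * already have been cleared to zero for verifier failures because the
 * corruption is tracked in sm_flags instead.
 */
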
/* Check for operational errors. */
static bool
__xchk_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(
				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
				sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly.  Set error to zero and do not continue.
		 */
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		break;
	}
	return false;
}

bool
xchk_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_xref_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/* Check for operational errors for a file offset. */
static bool
__xchk_fblock_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly.  Set error to zero and do not continue.
		 */
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		break;
	}
	return false;
}

bool
xchk_fblock_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_fblock_xref_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/*
 * Handling scrub corruption/optimization/warning checks.
 *
 * The *_set_{corrupt,preen,warning}() family of functions are used to
 * record the presence of metadata that is incorrect (corrupt), could be
 * optimized somehow (preen), or should be flagged for administrative
 * review but is not incorrect (warn).
 *
 * ftrace can be used to record the precise metadata location and
 * approximate code location of the failed check.
 */

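/*
 * Usage sketch (illustrative, not lifted from a real scrubber): a check that
 * spots a suspicious AGF field records it and keeps going rather than
 * returning an error, e.g.
 *
 *	if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_agblocks)
 *		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 *
 * The outcome flag accumulates in sc->sm->sm_flags and is reported to
 * userspace when the scrub call returns.
 */
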
/* Record a block which could be optimized. */
void
xchk_block_set_preen(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
}

/* Record an inode which could be optimized. */
void
xchk_ino_set_preen(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_ino_preen(sc, ino, __return_address);
}

/* Record fs-wide corruption that isn't tied to a specific block. */
void
xchk_set_corrupt(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fs_error(sc, 0, __return_address);
}

/* Record a corrupt block. */
void
xchk_block_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

#ifdef CONFIG_XFS_QUOTA
/* Record a corrupt quota counter. */
void
xchk_qcheck_set_corrupt(
	struct xfs_scrub	*sc,
	unsigned int		dqtype,
	xfs_dqid_t		id)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
}
#endif

/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

/* Record a corrupt inode. */
void
xchk_ino_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record a corruption while cross-referencing with an inode. */
void
xchk_ino_xref_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record corruption in a block indexed by a file fork. */
void
xchk_fblock_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/* Record a corruption while cross-referencing a fork block. */
void
xchk_fblock_xref_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/*
 * Warn about an inode that needs administrative review but is not
 * incorrect.
 */
void
xchk_ino_set_warning(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_ino_warning(sc, ino, __return_address);
}

/* Warn about a block indexed by a file fork that needs review. */
void
xchk_fblock_set_warning(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
}

/* Signal an incomplete scrub. */
void
xchk_set_incomplete(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
	trace_xchk_incomplete(sc, __return_address);
}

/*
 * rmap scrubbing -- compute the number of blocks with a given owner,
 * at least according to the reverse mapping data.
 */

struct xchk_rmap_ownedby_info {
	const struct xfs_owner_info	*oinfo;
	xfs_filblks_t			*blocks;
};

STATIC int
xchk_count_rmap_ownedby_irec(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xchk_rmap_ownedby_info	*sroi = priv;
	bool				irec_attr;
	bool				oinfo_attr;

	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;

	if (rec->rm_owner != sroi->oinfo->oi_owner)
		return 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
		(*sroi->blocks) += rec->rm_blockcount;

	return 0;
}

/*
 * Calculate the number of blocks the rmap thinks are owned by something.
 * The caller should pass us an rmapbt cursor.
 */
int
xchk_count_rmap_ownedby_ag(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	const struct xfs_owner_info	*oinfo,
	xfs_filblks_t			*blocks)
{
	struct xchk_rmap_ownedby_info	sroi = {
		.oinfo			= oinfo,
		.blocks			= blocks,
	};

	*blocks = 0;
	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
			&sroi);
}

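/*
 * Usage sketch (illustrative; the owner chosen here is arbitrary): count how
 * many blocks the rmapbt attributes to the inode btrees and compare that
 * against some other source of truth.
 *
 *	struct xfs_owner_info	oinfo;
 *	xfs_filblks_t		blocks;
 *
 *	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
 *			&blocks);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 */
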
/*
 * AG scrubbing
 *
 * These helpers facilitate locking an allocation group's header
 * buffers, setting up cursors for all btrees that are present, and
 * cleaning everything up once we're through.
 */

/* Decide if we want to return an AG header read failure. */
static inline bool
want_ag_read_header_failure(
	struct xfs_scrub	*sc,
	unsigned int		type)
{
	/* Return all AG header read failures when scanning btrees. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
		return true;
	/*
	 * If we're scanning a given type of AG header, we only want to
	 * see read failures from that specific header.  We'd like the
	 * other headers to cross-check them, but this isn't required.
	 */
	if (sc->sm->sm_type == type)
		return true;
	return false;
}

/*
 * Grab the AG header buffers for the attached perag structure.
 *
 * The headers should be released by xchk_ag_free, but as a fail safe we attach
 * all the buffers we grab to the scrub transaction so they'll all be freed
 * when we cancel it.
 */
static inline int
xchk_perag_read_headers(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	int			error;

	error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
		return error;

	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
		return error;

	return 0;
}

/*
 * Grab the AG headers for the attached perag structure and wait for pending
 * intents to drain.
 */
int
xchk_perag_drain_and_lock(
	struct xfs_scrub	*sc)
{
	struct xchk_ag		*sa = &sc->sa;
	int			error = 0;

	ASSERT(sa->pag != NULL);
	ASSERT(sa->agi_bp == NULL);
	ASSERT(sa->agf_bp == NULL);

	do {
		if (xchk_should_terminate(sc, &error))
			return error;

		error = xchk_perag_read_headers(sc, sa);
		if (error)
			return error;

		/*
		 * If we've grabbed an inode for scrubbing then we assume that
		 * holding its ILOCK will suffice to coordinate with any intent
		 * chains involving this inode.
		 */
		if (sc->ip)
			return 0;

		/*
		 * Decide if this AG is quiet enough for all metadata to be
		 * consistent with each other.  XFS allows the AG header buffer
		 * locks to cycle across transaction rolls while processing
		 * chains of deferred ops, which means that there could be
		 * other threads in the middle of processing a chain of
		 * deferred ops.  For regular operations we are careful about
		 * ordering operations to prevent collisions between threads
		 * (which is why we don't need a per-AG lock), but scrub and
		 * repair have to serialize against chained operations.
		 *
		 * We just locked all the AG header buffers; now take a look
		 * to see if there are any intents in progress.  If there are,
		 * drop the AG headers and wait for the intents to drain.
		 * Since we hold all the AG header locks for the duration of
		 * the scrub, this is the only time we have to sample the
		 * intents counter; any threads increasing it after this point
		 * can't possibly be in the middle of a chain of AG metadata
		 * updates.
		 *
		 * Obviously, this should be slanted against scrub and in favor
		 * of runtime threads.
		 */
		if (!xfs_perag_intent_busy(sa->pag))
			return 0;

		if (sa->agf_bp) {
			xfs_trans_brelse(sc->tp, sa->agf_bp);
			sa->agf_bp = NULL;
		}

		if (sa->agi_bp) {
			xfs_trans_brelse(sc->tp, sa->agi_bp);
			sa->agi_bp = NULL;
		}

		if (!(sc->flags & XCHK_FSGATES_DRAIN))
			return -ECHRNG;
		error = xfs_perag_intent_drain(sa->pag);
		if (error == -ERESTARTSYS)
			error = -EINTR;
	} while (!error);

	return error;
}

/*
 * Grab the per-AG structure, grab all AG header buffers, and wait until there
 * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
 * structure.
 */
int
xchk_ag_read_headers(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	ASSERT(!sa->pag);
	sa->pag = xfs_perag_get(mp, agno);
	if (!sa->pag)
		return -ENOENT;

	return xchk_perag_drain_and_lock(sc);
}

/* Release all the AG btree cursors. */
void
xchk_ag_btcur_free(
	struct xchk_ag		*sa)
{
	if (sa->refc_cur)
		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
	if (sa->rmap_cur)
		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
	if (sa->fino_cur)
		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
	if (sa->ino_cur)
		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
	if (sa->cnt_cur)
		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
	if (sa->bno_cur)
		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);

	sa->refc_cur = NULL;
	sa->rmap_cur = NULL;
	sa->fino_cur = NULL;
	sa->ino_cur = NULL;
	sa->bno_cur = NULL;
	sa->cnt_cur = NULL;
}

/* Initialize all the btree cursors for an AG. */
void
xchk_ag_btcur_init(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	if (sa->agf_bp) {
		/* Set up a bnobt cursor for cross-referencing. */
		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
				XFS_SCRUB_TYPE_BNOBT);

		/* Set up a cntbt cursor for cross-referencing. */
		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
				XFS_SCRUB_TYPE_CNTBT);

		/* Set up a rmapbt cursor for cross-referencing. */
		if (xfs_has_rmapbt(mp)) {
			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
					XFS_SCRUB_TYPE_RMAPBT);
		}

		/* Set up a refcountbt cursor for cross-referencing. */
		if (xfs_has_reflink(mp)) {
			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
					XFS_SCRUB_TYPE_REFCNTBT);
		}
	}

	if (sa->agi_bp) {
		/* Set up an inobt cursor for cross-referencing. */
		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
				sa->agi_bp);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
				XFS_SCRUB_TYPE_INOBT);

		/* Set up a finobt cursor for cross-referencing. */
		if (xfs_has_finobt(mp)) {
			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
					sa->agi_bp);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
					XFS_SCRUB_TYPE_FINOBT);
		}
	}
}

/* Release the AG header context and btree cursors. */
void
xchk_ag_free(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	xchk_ag_btcur_free(sa);
	xrep_reset_perag_resv(sc);
	if (sa->agf_bp) {
		xfs_trans_brelse(sc->tp, sa->agf_bp);
		sa->agf_bp = NULL;
	}
	if (sa->agi_bp) {
		xfs_trans_brelse(sc->tp, sa->agi_bp);
		sa->agi_bp = NULL;
	}
	if (sa->pag) {
		xfs_perag_put(sa->pag);
		sa->pag = NULL;
	}
}

/*
 * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
 * order.  Locking order requires us to get the AGI before the AGF.  We use the
 * transaction to avoid deadlocking on crosslinked metadata buffers; either the
 * caller passes one in (bmap scrub) or we have to create a transaction
 * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
 */
int
xchk_ag_init(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	struct xchk_ag		*sa)
{
	int			error;

	error = xchk_ag_read_headers(sc, agno, sa);
	if (error)
		return error;

	xchk_ag_btcur_init(sc, sa);
	return 0;
}

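/*
 * Lifecycle sketch (illustrative, not lifted from a real scrubber): a
 * cross-referencing check typically brackets its per-AG work with these
 * helpers, e.g.
 *
 *	error = xchk_ag_init(sc, agno, &sc->sa);
 *	if (!xchk_process_error(sc, agno, 0, &error))
 *		return error;
 *	... cross-reference against sc->sa.bno_cur, sc->sa.rmap_cur, etc ...
 *	xchk_ag_free(sc, &sc->sa);
 */
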
/* Per-scrubber setup functions */

void
xchk_trans_cancel(
	struct xfs_scrub	*sc)
{
	xfs_trans_cancel(sc->tp);
	sc->tp = NULL;
}

int
xchk_trans_alloc_empty(
	struct xfs_scrub	*sc)
{
	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}

/*
 * Grab an empty transaction so that we can re-grab locked buffers if
 * one of our btrees turns out to be cyclic.
 *
 * If we're going to repair something, we need to ask for the largest possible
 * log reservation so that we can handle the worst case scenario for metadata
 * updates while rebuilding a metadata item.  We also need to reserve as many
 * blocks in the head transaction as we think we're going to need to rebuild
 * the metadata object.
 */
int
xchk_trans_alloc(
	struct xfs_scrub	*sc,
	uint			resblks)
{
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
				resblks, 0, 0, &sc->tp);

	return xchk_trans_alloc_empty(sc);
}

/* Set us up with a transaction and an empty context. */
int
xchk_setup_fs(
	struct xfs_scrub	*sc)
{
	uint			resblks;

	resblks = xrep_calc_ag_resblks(sc);
	return xchk_trans_alloc(sc, resblks);
}

/* Set us up with AG headers and btree cursors. */
int
xchk_setup_ag_btree(
	struct xfs_scrub	*sc,
	bool			force_log)
{
	struct xfs_mount	*mp = sc->mp;
	int			error;

	/*
	 * If the caller asks us to checkpoint the log, do so.  This
	 * expensive operation should be performed infrequently and only
	 * as a last resort.  Any caller that sets force_log should
	 * document why they need to do so.
	 */
	if (force_log) {
		error = xchk_checkpoint_log(mp);
		if (error)
			return error;
	}

	error = xchk_setup_fs(sc);
	if (error)
		return error;

	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
}

/* Push everything out of the log onto disk. */
int
xchk_checkpoint_log(
	struct xfs_mount	*mp)
{
	int			error;

	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return error;
	xfs_ail_push_all_sync(mp->m_ail);
	return 0;
}

/* Verify that an inode is allocated ondisk, then return its cached inode. */
int
xchk_iget(
	struct xfs_scrub	*sc,
	xfs_ino_t		inum,
	struct xfs_inode	**ipp)
{
	ASSERT(sc->tp != NULL);

	return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
}

/*
 * Try to grab an inode in a manner that avoids races with physical inode
 * allocation.  If we can't, return the locked AGI buffer so that the caller
 * can single-step the loading process to see where things went wrong.
 * Callers must have a valid scrub transaction.
 *
 * If the iget succeeds, return 0, a NULL AGI, and the inode.
 *
 * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
 * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
 * no longer allocated; or any other corruption or runtime error.
 *
 * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
 *
 * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
 */
int
xchk_iget_agi(
	struct xfs_scrub	*sc,
	xfs_ino_t		inum,
	struct xfs_buf		**agi_bpp,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = sc->tp;
	struct xfs_perag	*pag;
	int			error;

	ASSERT(sc->tp != NULL);

again:
	*agi_bpp = NULL;
	*ipp = NULL;
	error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/*
	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
	 * in the iget cache miss path.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
	xfs_perag_put(pag);
	if (error)
		return error;

	error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
			ipp);
	if (error == -EAGAIN) {
		/*
		 * The inode may be in core but temporarily unavailable and may
		 * require the AGI buffer before it can be returned.  Drop the
		 * AGI buffer and retry the lookup.
		 *
		 * Incore lookup will fail with EAGAIN on a cache hit if the
		 * inode is queued to the inactivation list.  The inactivation
		 * worker may remove the inode from the unlinked list and hence
		 * needs the AGI.
		 *
		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
		 * to allow inodegc to make progress and move the inode to
		 * IRECLAIMABLE state where xfs_iget will be able to return it
		 * again if it can lock the inode.
		 */
		xfs_trans_brelse(tp, *agi_bpp);
		delay(1);
		goto again;
	}
	if (error)
		return error;

	/* We got the inode, so we can release the AGI. */
	ASSERT(*ipp != NULL);
	xfs_trans_brelse(tp, *agi_bpp);
	*agi_bpp = NULL;
	return 0;
}

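/*
 * Caller-side sketch (illustrative; simplified from xchk_iget_for_scrubbing
 * below): the tristate contract above is usually handled like this.
 *
 *	error = xchk_iget_agi(sc, ino, &agi_bp, &ip);
 *	if (error == 0)
 *		return xchk_install_handle_inode(sc, ip);
 *	if (agi_bp == NULL)
 *		return error;	(AGI read failure or fatal signal)
 *	... iget failed but the AGI is still locked; single-step the
 *	    inode lookup here to find out why ...
 */
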
#ifdef CONFIG_XFS_QUOTA
/*
 * Try to attach dquots to this inode if we think we might want to repair it.
 * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
 * attached, a quotacheck will be scheduled.
 */
int
xchk_ino_dqattach(
	struct xfs_scrub	*sc)
{
	ASSERT(sc->tp != NULL);
	ASSERT(sc->ip != NULL);

	if (!xchk_could_repair(sc))
		return 0;

	return xrep_ino_dqattach(sc);
}
#endif

/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
		xchk_irele(sc, ip);
		return -ENOENT;
	}

	sc->ip = ip;
	return 0;
}

/*
 * Install an already-referenced inode for scrubbing.  Get our own reference to
 * the inode to make disposal simpler.  The inode must not be in I_FREEING or
 * I_WILL_FREE state!
 */
int
xchk_install_live_inode(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (!igrab(VFS_I(ip))) {
		xchk_ino_set_corrupt(sc, ip->i_ino);
		return -EFSCORRUPTED;
	}

	sc->ip = ip;
	return 0;
}

/*
 * In preparation to scrub metadata structures that hang off of an inode,
 * grab either the inode referenced in the scrub control structure or the
 * inode passed in.  If the inumber does not reference an allocated inode
 * record, the function returns ENOENT to end the scrub early.  The inode
 * is not locked.
 */
int
xchk_iget_for_scrubbing(
	struct xfs_scrub	*sc)
{
	struct xfs_imap		imap;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	struct xfs_buf		*agi_bp;
	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
	struct xfs_inode	*ip = NULL;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
	int			error;

	ASSERT(sc->tp == NULL);

	/* We want to scan the inode we already had opened. */
	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
		return xchk_install_live_inode(sc, ip_in);

	/* Reject internal metadata files and obviously bad inode numbers. */
	if (xfs_internal_inum(mp, sc->sm->sm_ino))
		return -ENOENT;
	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
		return -ENOENT;

	/* Try a safe untrusted iget. */
	error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
	if (!error)
		return xchk_install_handle_inode(sc, ip);
	if (error == -ENOENT)
		return error;
	if (error != -EINVAL)
		goto out_error;

	/*
	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
	 * userspace gave us an inode number that doesn't correspond to fs
	 * space; the inode btree lacks a record for this inode; or there is a
	 * record, and it says this inode is free.
	 *
	 * We want to look up this inode in the inobt to distinguish two
	 * scenarios: (1) the inobt says the inode is free, in which case
	 * there's nothing to do; and (2) the inobt says the inode is
	 * allocated, but loading it failed due to corruption.
	 *
	 * Allocate a transaction and grab the AGI to prevent inobt activity
	 * in this AG.  Retry the iget in case someone allocated a new inode
	 * after the first iget failed.
	 */
	error = xchk_trans_alloc(sc, 0);
	if (error)
		goto out_error;

	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
	if (error == 0) {
		/* Actually got the inode, so install it. */
		xchk_trans_cancel(sc);
		return xchk_install_handle_inode(sc, ip);
	}
	if (error == -ENOENT)
		goto out_gone;
	if (error != -EINVAL)
		goto out_cancel;

	/* Ensure that we have protected against inode allocation/freeing. */
	if (agi_bp == NULL) {
		ASSERT(agi_bp != NULL);
		error = -ECANCELED;
		goto out_cancel;
	}

	/*
	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
	 * If the inobt says the inode cannot exist inside the filesystem or
	 * is not allocated, return ENOENT to signal that the check
	 * can be skipped.
	 *
	 * If the lookup returns corruption, we'll mark this inode corrupt and
	 * exit to userspace.  There's little chance of fixing anything until
	 * the inobt is straightened out, but there's nothing we can do here.
	 *
	 * If the lookup encounters any other error, exit to userspace.
	 *
	 * If the lookup succeeds, something else must be very wrong in the fs
	 * such that setting up the incore inode failed in some strange way.
	 * Treat those as corruptions.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
	if (!pag) {
		error = -EFSCORRUPTED;
		goto out_cancel;
	}

	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
			XFS_IGET_UNTRUSTED);
	xfs_perag_put(pag);
	if (error == -EINVAL || error == -ENOENT)
		goto out_gone;
	if (!error)
		error = -EFSCORRUPTED;

out_cancel:
	xchk_trans_cancel(sc);
out_error:
	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
			error, __return_address);
	return error;
out_gone:
	/* The file is gone, so there's nothing to check. */
	xchk_trans_cancel(sc);
	return -ENOENT;
}

/* Release an inode, possibly dropping it in the process. */
void
xchk_irele(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (sc->tp) {
		/*
		 * If we are in a transaction, we /cannot/ drop the inode
		 * ourselves, because the VFS will trigger writeback, which
		 * can require a transaction.  Clear DONTCACHE to force the
		 * inode to the LRU, where someone else can take care of
		 * dropping it.
		 *
		 * Note that when we grabbed our reference to the inode, it
		 * could have had an active ref and DONTCACHE set if a sysadmin
		 * is trying to coerce a change in file access mode.  icache
		 * hits do not clear DONTCACHE, so we must do it here.
		 */
		spin_lock(&VFS_I(ip)->i_lock);
		VFS_I(ip)->i_state &= ~I_DONTCACHE;
		spin_unlock(&VFS_I(ip)->i_lock);
	}

	xfs_irele(ip);
}

/*
 * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
 * this to operate on user-accessible regular file data because the MMAPLOCK is
 * not taken.
 */
int
xchk_setup_inode_contents(
	struct xfs_scrub	*sc,
	unsigned int		resblks)
{
	int			error;

	error = xchk_iget_for_scrubbing(sc);
	if (error)
		return error;

	/* Lock the inode so the VFS cannot touch this file. */
	xchk_ilock(sc, XFS_IOLOCK_EXCL);

	error = xchk_trans_alloc(sc, resblks);
	if (error)
		goto out;

	error = xchk_ino_dqattach(sc);
	if (error)
		goto out;

	xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
	/* scrub teardown will unlock and release the inode for us */
	return error;
}

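/*
 * Setup sketch (illustrative; xchk_setup_something() is a made-up name): a
 * per-scrubber setup function for file-fork metadata usually amounts to a
 * thin wrapper around this helper.
 *
 *	int
 *	xchk_setup_something(
 *		struct xfs_scrub	*sc)
 *	{
 *		return xchk_setup_inode_contents(sc, 0);
 *	}
 */
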
void
xchk_ilock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	xfs_ilock(sc->ip, ilock_flags);
	sc->ilock_flags |= ilock_flags;
}

bool
xchk_ilock_nowait(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
		sc->ilock_flags |= ilock_flags;
		return true;
	}

	return false;
}

void
xchk_iunlock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	sc->ilock_flags &= ~ilock_flags;
	xfs_iunlock(sc->ip, ilock_flags);
}

/*
 * Predicate that decides if we need to evaluate the cross-reference check.
 * If there was an error accessing the cross-reference btree, just delete
 * the cursor and skip the check.
 */
bool
xchk_should_check_xref(
	struct xfs_scrub	*sc,
	int			*error,
	struct xfs_btree_cur	**curpp)
{
	/* No point in xref if we already know we're corrupt. */
	if (xchk_skip_xref(sc->sm))
		return false;

	if (*error == 0)
		return true;

	if (curpp) {
		/* If we've already given up on xref, just bail out. */
		if (!*curpp)
			return false;

		/* xref error, delete cursor and bail out. */
		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
		*curpp = NULL;
	}

	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
	trace_xchk_xref_error(sc, *error, __return_address);

	/*
	 * Errors encountered during cross-referencing with another
	 * data structure should not cause this scrubber to abort.
	 */
	*error = 0;
	return false;
}

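/*
 * Cross-reference sketch (illustrative; the rmap query stands in for whatever
 * secondary structure is being consulted): helpers that compare against
 * another btree typically look like this.
 *
 *	error = xfs_rmap_record_exists(sc->sa.rmap_cur, agbno, len, oinfo,
 *			&has_rmap);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 *	if (!has_rmap)
 *		xchk_block_xref_set_corrupt(sc, bp);
 */
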
/* Run the structure verifiers on in-memory buffers to detect bad memory. */
void
xchk_buffer_recheck(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	xfs_failaddr_t		fa;

	if (bp->b_ops == NULL) {
		xchk_block_set_corrupt(sc, bp);
		return;
	}
	if (bp->b_ops->verify_struct == NULL) {
		xchk_set_incomplete(sc);
		return;
	}
	fa = bp->b_ops->verify_struct(bp);
	if (!fa)
		return;
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}

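/*
 * Usage sketch (illustrative): a cross-referencing scrubber that already
 * holds an AG header buffer can re-run its verifier to catch in-memory bit
 * flips, e.g.
 *
 *	if (sc->sa.agf_bp)
 *		xchk_buffer_recheck(sc, sc->sa.agf_bp);
 */
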
static inline int
xchk_metadata_inode_subtype(
	struct xfs_scrub	*sc,
	unsigned int		scrub_type)
{
	struct xfs_scrub_subord	*sub;
	int			error;

	sub = xchk_scrub_create_subord(sc, scrub_type);
	error = sub->sc.ops->scrub(&sub->sc);
	xchk_scrub_free_subord(sub);
	return error;
}

/*
 * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
 * pointed to by sc->ip and the ILOCK must be held.
 */
int
xchk_metadata_inode_forks(
	struct xfs_scrub	*sc)
{
	bool			shared;
	int			error;

	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return 0;

	/* Check the inode record. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Metadata inodes don't live on the rt device. */
	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They should never participate in reflink. */
	if (xfs_is_reflink_inode(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They also should never have extended attributes. */
	if (xfs_inode_hasattr(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* Invoke the data fork scrubber. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Look for incorrect shared blocks. */
	if (xfs_has_reflink(sc->mp)) {
		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
				&shared);
		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
				&error))
			return error;
		if (shared)
			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
	}

	return 0;
}

/*
 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
 * operation.  Callers must not hold any locks that intersect with the CPU
 * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
 * to change kernel code.
 */
void
xchk_fsgates_enable(
	struct xfs_scrub	*sc,
	unsigned int		scrub_fsgates)
{
	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
	ASSERT(!(sc->flags & scrub_fsgates));

	trace_xchk_fsgates_enable(sc, scrub_fsgates);

	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
		xfs_drain_wait_enable();

	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_enable();

	sc->flags |= scrub_fsgates;
}

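/*
 * Usage sketch (illustrative): a setup function that knows its scrubber will
 * need the intent drain turns the gate on before taking any locks, typically
 * only on a retry, e.g.
 *
 *	if (sc->flags & XCHK_TRY_HARDER)
 *		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 */
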
/*
 * Decide if this is a cached inode that's also allocated.  The caller
 * must hold a reference to an AG and the AGI buffer lock to prevent inodes
 * from being allocated or freed.
 *
 * Look up an inode by number in the given file system.  If the inode number
 * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
 * If the inode is being reclaimed, return -ENODATA because we know the inode
 * cache cannot be updating the ondisk metadata.
 *
 * Otherwise, the incore inode is the one we want, and it is either live,
 * somewhere in the inactivation machinery, or reclaimable.  The inode is
 * allocated if i_mode is nonzero.  In all three cases, the cached inode will
 * be more up to date than the ondisk inode buffer, so we must use the incore
 * i_mode.
 */
int
xchk_inode_is_allocated(
	struct xfs_scrub	*sc,
	xfs_agino_t		agino,
	bool			*inuse)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_ino_t		ino;
	struct xfs_inode	*ip;
	int			error;

	/* caller must hold perag reference */
	if (pag == NULL) {
		ASSERT(pag != NULL);
		return -EINVAL;
	}

	/* caller must have AGI buffer */
	if (sc->sa.agi_bp == NULL) {
		ASSERT(sc->sa.agi_bp != NULL);
		return -EINVAL;
	}

	/* reject inode numbers outside existing AGs */
	ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
	if (!xfs_verify_ino(mp, ino))
		return -EINVAL;

	error = -ENODATA;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (!ip) {
		/* cache miss */
		goto out_rcu;
	}

	/*
	 * If the inode number doesn't match, the incore inode got reused
	 * during an RCU grace period and the radix tree hasn't been updated.
	 * This isn't the inode we want.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	trace_xchk_inode_is_allocated(ip);

	/*
	 * We have an incore inode that matches the inode we want, and the
	 * caller holds the perag structure and the AGI buffer.  Let's check
	 * our assumptions below:
	 */

#ifdef DEBUG
	/*
	 * (1) If the incore inode is live (i.e. referenced from the dcache),
	 * it will not be INEW, nor will it be in the inactivation or reclaim
	 * machinery.  The ondisk inode had better be allocated.  This is the
	 * most trivial case.
	 */
	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
			     XFS_INACTIVATING))) {
		/* live inode */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * If the incore inode is INEW, there are several possibilities:
	 *
	 * (2) For a file that is being created, note that we allocate the
	 * ondisk inode before allocating, initializing, and adding the incore
	 * inode to the radix tree.
	 *
	 * (3) If the incore inode is being recycled, the inode has to be
	 * allocated because we don't allow freed inodes to be recycled.
	 * Recycling doesn't touch i_mode.
	 */
	if (ip->i_flags & XFS_INEW) {
		/* created on disk already or recycling */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
	 * inactivation has not started (!INACTIVATING), it is still allocated.
	 */
	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
	    !(ip->i_flags & XFS_INACTIVATING)) {
		/* definitely before difree */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}
#endif

	/*
	 * If the incore inode is undergoing inactivation (INACTIVATING), there
	 * are two possibilities:
	 *
	 * (5) It is before the point where it would get freed ondisk, in which
	 * case i_mode is still nonzero.
	 *
	 * (6) It has already been freed, in which case i_mode is zero.
	 *
	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
	 * and we've taken the AGI buffer lock, which prevents that from
	 * happening.
	 */

	/*
	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
	 * reflects the ondisk state.
	 */

	/*
	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
	 * the flush code uses i_mode to format the ondisk inode.
	 */

	/*
	 * (9) If the inode is in IRECLAIM and was reachable via the radix
	 * tree, it still has the same i_mode as it did before it entered
	 * reclaim.  The inode object is still alive because we hold the RCU
	 * read lock.
	 */

	*inuse = VFS_I(ip)->i_mode != 0;
	error = 0;

out_skip:
	spin_unlock(&ip->i_flags_lock);
out_rcu:
	rcu_read_unlock();
	return error;
}

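/*
 * Usage sketch (illustrative; the ondisk comparison below is stylized): an
 * inode btree scrubber can use this to cross-check a free/inuse bit in an
 * inobt record against the incore inode cache.
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (error == -ENODATA)
 *		return 0;	(not cached; trust the ondisk record)
 *	if (error)
 *		return error;
 *	if (inuse == irec_is_free)
 *		(flag corruption in whatever way suits the caller)
 */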