1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_btree.h"
13#include "xfs_log_format.h"
14#include "xfs_trans.h"
15#include "xfs_sb.h"
16#include "xfs_alloc.h"
17#include "xfs_alloc_btree.h"
18#include "xfs_ialloc.h"
19#include "xfs_ialloc_btree.h"
20#include "xfs_rmap.h"
21#include "xfs_rmap_btree.h"
22#include "xfs_refcount_btree.h"
23#include "xfs_ag.h"
24#include "xfs_inode.h"
25#include "xfs_iunlink_item.h"
26#include "scrub/scrub.h"
27#include "scrub/common.h"
28#include "scrub/trace.h"
29#include "scrub/repair.h"
30#include "scrub/bitmap.h"
31#include "scrub/agb_bitmap.h"
32#include "scrub/agino_bitmap.h"
33#include "scrub/reap.h"
34#include "scrub/xfile.h"
35#include "scrub/xfarray.h"
36
37/* Superblock */
38
39/* Repair the superblock. */
40int
41xrep_superblock(
42	struct xfs_scrub	*sc)
43{
44	struct xfs_mount	*mp = sc->mp;
45	struct xfs_buf		*bp;
46	xfs_agnumber_t		agno;
47	int			error;
48
49	/* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
50	agno = sc->sm->sm_agno;
51	if (agno == 0)
52		return -EOPNOTSUPP;
53
54	error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
55	if (error)
56		return error;
57
58	/* Last chance to abort before we start committing fixes. */
59	if (xchk_should_terminate(sc, &error))
60		return error;
61
62	/* Copy AG 0's superblock to this one. */
63	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
64	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
65
66	/*
67	 * Don't write out a secondary super with NEEDSREPAIR or log incompat
68	 * features set, since both are ignored when set on a secondary.
69	 */
70	if (xfs_has_crc(mp)) {
71		struct xfs_dsb		*sb = bp->b_addr;
72
73		sb->sb_features_incompat &=
74				~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
75		sb->sb_features_log_incompat = 0;
76	}
77
78	/* Write this to disk. */
79	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
80	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
81	return 0;
82}
83
84/* AGF */
85
86struct xrep_agf_allocbt {
87	struct xfs_scrub	*sc;
88	xfs_agblock_t		freeblks;
89	xfs_agblock_t		longest;
90};
91
92/* Record free space shape information. */
93STATIC int
94xrep_agf_walk_allocbt(
95	struct xfs_btree_cur		*cur,
96	const struct xfs_alloc_rec_incore *rec,
97	void				*priv)
98{
99	struct xrep_agf_allocbt		*raa = priv;
100	int				error = 0;
101
102	if (xchk_should_terminate(raa->sc, &error))
103		return error;
104
105	raa->freeblks += rec->ar_blockcount;
106	if (rec->ar_blockcount > raa->longest)
107		raa->longest = rec->ar_blockcount;
108	return error;
109}
110
111/* Does this AGFL block look sane? */
112STATIC int
113xrep_agf_check_agfl_block(
114	struct xfs_mount	*mp,
115	xfs_agblock_t		agbno,
116	void			*priv)
117{
118	struct xfs_scrub	*sc = priv;
119
120	if (!xfs_verify_agbno(sc->sa.pag, agbno))
121		return -EFSCORRUPTED;
122	return 0;
123}
124
125/*
126 * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
127 * XFS_BTNUM_ names here to avoid creating a sparse array.
128 */
129enum {
130	XREP_AGF_BNOBT = 0,
131	XREP_AGF_CNTBT,
132	XREP_AGF_RMAPBT,
133	XREP_AGF_REFCOUNTBT,
134	XREP_AGF_END,
135	XREP_AGF_MAX
136};
137
138/* Check a btree root candidate. */
139static inline bool
140xrep_check_btree_root(
141	struct xfs_scrub		*sc,
142	struct xrep_find_ag_btree	*fab)
143{
144	return xfs_verify_agbno(sc->sa.pag, fab->root) &&
145	       fab->height <= fab->maxlevels;
146}
147
148/*
149 * Given the btree roots described by *fab, find the roots, check them for
150 * sanity, and pass the root data back out via *fab.
151 *
152 * This is /also/ a chicken and egg problem because we have to use the rmapbt
153 * (rooted in the AGF) to find the btrees rooted in the AGF.  We also have no
154 * idea if the btrees make any sense.  If we hit obvious corruptions in those
155 * btrees we'll bail out.
156 */
157STATIC int
158xrep_agf_find_btrees(
159	struct xfs_scrub		*sc,
160	struct xfs_buf			*agf_bp,
161	struct xrep_find_ag_btree	*fab,
162	struct xfs_buf			*agfl_bp)
163{
164	struct xfs_agf			*old_agf = agf_bp->b_addr;
165	int				error;
166
167	/* Go find the root data. */
168	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
169	if (error)
170		return error;
171
172	/* We must find the bnobt, cntbt, and rmapbt roots. */
173	if (!xrep_check_btree_root(sc, &fab[XREP_AGF_BNOBT]) ||
174	    !xrep_check_btree_root(sc, &fab[XREP_AGF_CNTBT]) ||
175	    !xrep_check_btree_root(sc, &fab[XREP_AGF_RMAPBT]))
176		return -EFSCORRUPTED;
177
178	/*
179	 * We relied on the rmapbt to reconstruct the AGF.  If we get a
180	 * different root then something's seriously wrong.
181	 */
182	if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
183		return -EFSCORRUPTED;
184
185	/* We must find the refcountbt root if that feature is enabled. */
186	if (xfs_has_reflink(sc->mp) &&
187	    !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT]))
188		return -EFSCORRUPTED;
189
190	return 0;
191}
192
193/*
194 * Reinitialize the AGF header, making an in-core copy of the old contents so
195 * that we know which in-core state needs to be reinitialized.
196 */
197STATIC void
198xrep_agf_init_header(
199	struct xfs_scrub	*sc,
200	struct xfs_buf		*agf_bp,
201	struct xfs_agf		*old_agf)
202{
203	struct xfs_mount	*mp = sc->mp;
204	struct xfs_perag	*pag = sc->sa.pag;
205	struct xfs_agf		*agf = agf_bp->b_addr;
206
207	memcpy(old_agf, agf, sizeof(*old_agf));
208	memset(agf, 0, BBTOB(agf_bp->b_length));
209	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
210	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
211	agf->agf_seqno = cpu_to_be32(pag->pag_agno);
212	agf->agf_length = cpu_to_be32(pag->block_count);
213	agf->agf_flfirst = old_agf->agf_flfirst;
214	agf->agf_fllast = old_agf->agf_fllast;
215	agf->agf_flcount = old_agf->agf_flcount;
216	if (xfs_has_crc(mp))
217		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
218
219	/* Mark the incore AGF data stale until we're done fixing things. */
220	ASSERT(xfs_perag_initialised_agf(pag));
221	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
222}
223
224/* Set btree root information in an AGF. */
225STATIC void
226xrep_agf_set_roots(
227	struct xfs_scrub		*sc,
228	struct xfs_agf			*agf,
229	struct xrep_find_ag_btree	*fab)
230{
231	agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
232	agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
233
234	agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
235	agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
236
237	agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
238	agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
239
240	if (xfs_has_reflink(sc->mp)) {
241		agf->agf_refcount_root =
242				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
243		agf->agf_refcount_level =
244				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].height);
245	}
246}
247
248/* Update all AGF fields which derive from btree contents. */
249STATIC int
250xrep_agf_calc_from_btrees(
251	struct xfs_scrub	*sc,
252	struct xfs_buf		*agf_bp)
253{
254	struct xrep_agf_allocbt	raa = { .sc = sc };
255	struct xfs_btree_cur	*cur = NULL;
256	struct xfs_agf		*agf = agf_bp->b_addr;
257	struct xfs_mount	*mp = sc->mp;
258	xfs_agblock_t		btreeblks;
259	xfs_agblock_t		blocks;
260	int			error;
261
262	/* Update the AGF counters from the bnobt. */
263	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
264	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
265	if (error)
266		goto err;
267	error = xfs_btree_count_blocks(cur, &blocks);
268	if (error)
269		goto err;
270	xfs_btree_del_cursor(cur, error);
271	btreeblks = blocks - 1;
272	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
273	agf->agf_longest = cpu_to_be32(raa.longest);
274
275	/* Update the AGF counters from the cntbt. */
276	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
277	error = xfs_btree_count_blocks(cur, &blocks);
278	if (error)
279		goto err;
280	xfs_btree_del_cursor(cur, error);
281	btreeblks += blocks - 1;
282
283	/* Update the AGF counters from the rmapbt. */
284	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
285	error = xfs_btree_count_blocks(cur, &blocks);
286	if (error)
287		goto err;
288	xfs_btree_del_cursor(cur, error);
289	agf->agf_rmap_blocks = cpu_to_be32(blocks);
290	btreeblks += blocks - 1;
291
292	agf->agf_btreeblks = cpu_to_be32(btreeblks);
293
294	/* Update the AGF counters from the refcountbt. */
295	if (xfs_has_reflink(mp)) {
296		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
297				sc->sa.pag);
298		error = xfs_btree_count_blocks(cur, &blocks);
299		if (error)
300			goto err;
301		xfs_btree_del_cursor(cur, error);
302		agf->agf_refcount_blocks = cpu_to_be32(blocks);
303	}
304
305	return 0;
306err:
307	xfs_btree_del_cursor(cur, error);
308	return error;
309}
310
311/* Commit the new AGF and reinitialize the incore state. */
312STATIC int
313xrep_agf_commit_new(
314	struct xfs_scrub	*sc,
315	struct xfs_buf		*agf_bp)
316{
317	struct xfs_perag	*pag;
318	struct xfs_agf		*agf = agf_bp->b_addr;
319
320	/* Trigger fdblocks recalculation */
321	xfs_force_summary_recalc(sc->mp);
322
323	/* Write this to disk. */
324	xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
325	xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
326
327	/* Now reinitialize the in-core counters we changed. */
328	pag = sc->sa.pag;
329	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
330	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
331	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
332	pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
333	pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
334	pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
335	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
336	set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
337
338	return xrep_roll_ag_trans(sc);
339}
340
341/* Repair the AGF. v5 filesystems only. */
342int
343xrep_agf(
344	struct xfs_scrub		*sc)
345{
346	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
347		[XREP_AGF_BNOBT] = {
348			.rmap_owner = XFS_RMAP_OWN_AG,
349			.buf_ops = &xfs_bnobt_buf_ops,
350			.maxlevels = sc->mp->m_alloc_maxlevels,
351		},
352		[XREP_AGF_CNTBT] = {
353			.rmap_owner = XFS_RMAP_OWN_AG,
354			.buf_ops = &xfs_cntbt_buf_ops,
355			.maxlevels = sc->mp->m_alloc_maxlevels,
356		},
357		[XREP_AGF_RMAPBT] = {
358			.rmap_owner = XFS_RMAP_OWN_AG,
359			.buf_ops = &xfs_rmapbt_buf_ops,
360			.maxlevels = sc->mp->m_rmap_maxlevels,
361		},
362		[XREP_AGF_REFCOUNTBT] = {
363			.rmap_owner = XFS_RMAP_OWN_REFC,
364			.buf_ops = &xfs_refcountbt_buf_ops,
365			.maxlevels = sc->mp->m_refc_maxlevels,
366		},
367		[XREP_AGF_END] = {
368			.buf_ops = NULL,
369		},
370	};
371	struct xfs_agf			old_agf;
372	struct xfs_mount		*mp = sc->mp;
373	struct xfs_buf			*agf_bp;
374	struct xfs_buf			*agfl_bp;
375	struct xfs_agf			*agf;
376	int				error;
377
378	/* We require the rmapbt to rebuild anything. */
379	if (!xfs_has_rmapbt(mp))
380		return -EOPNOTSUPP;
381
382	/*
383	 * Make sure we have the AGF buffer, as scrub might have decided it
384	 * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
385	 */
386	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
387			XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
388						XFS_AGF_DADDR(mp)),
389			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
390	if (error)
391		return error;
392	agf_bp->b_ops = &xfs_agf_buf_ops;
393	agf = agf_bp->b_addr;
394
395	/*
396	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
397	 * the AGFL now; these blocks might have once been part of the
398	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
399	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
400	 * because we can't do any serious cross-referencing with any of the
401	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
402	 * then we'll bail out.
403	 */
404	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
405	if (error)
406		return error;
407
408	/*
409	 * Spot-check the AGFL blocks; if they're obviously corrupt then
410	 * there's nothing we can do but bail out.
411	 */
412	error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
413			xrep_agf_check_agfl_block, sc);
414	if (error)
415		return error;
416
417	/*
418	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
419	 * see the function for more details.
420	 */
421	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
422	if (error)
423		return error;
424
425	/* Last chance to abort before we start committing fixes. */
426	if (xchk_should_terminate(sc, &error))
427		return error;
428
429	/* Start rewriting the header and implant the btrees we found. */
430	xrep_agf_init_header(sc, agf_bp, &old_agf);
431	xrep_agf_set_roots(sc, agf, fab);
432	error = xrep_agf_calc_from_btrees(sc, agf_bp);
433	if (error)
434		goto out_revert;
435
436	/* Commit the changes and reinitialize incore state. */
437	return xrep_agf_commit_new(sc, agf_bp);
438
439out_revert:
440	/* Mark the incore AGF state stale and revert the AGF. */
441	clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate);
442	memcpy(agf, &old_agf, sizeof(old_agf));
443	return error;
444}
445
446/* AGFL */
447
448struct xrep_agfl {
449	/* Bitmap of alleged AGFL blocks that we're not going to add. */
450	struct xagb_bitmap	crossed;
451
452	/* Bitmap of other OWN_AG metadata blocks. */
453	struct xagb_bitmap	agmetablocks;
454
455	/* Bitmap of free space. */
456	struct xagb_bitmap	*freesp;
457
458	/* rmapbt cursor for finding crosslinked blocks */
459	struct xfs_btree_cur	*rmap_cur;
460
461	struct xfs_scrub	*sc;
462};
463
464/* Record all OWN_AG (free space btree) information from the rmap data. */
465STATIC int
466xrep_agfl_walk_rmap(
467	struct xfs_btree_cur	*cur,
468	const struct xfs_rmap_irec *rec,
469	void			*priv)
470{
471	struct xrep_agfl	*ra = priv;
472	int			error = 0;
473
474	if (xchk_should_terminate(ra->sc, &error))
475		return error;
476
477	/* Record all the OWN_AG blocks. */
478	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
479		error = xagb_bitmap_set(ra->freesp, rec->rm_startblock,
480				rec->rm_blockcount);
481		if (error)
482			return error;
483	}
484
485	return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur);
486}
487
488/* Strike out the blocks that are cross-linked according to the rmapbt. */
489STATIC int
490xrep_agfl_check_extent(
491	uint32_t		agbno,
492	uint32_t		len,
493	void			*priv)
494{
495	struct xrep_agfl	*ra = priv;
496	xfs_agblock_t		last_agbno = agbno + len - 1;
497	int			error;
498
499	while (agbno <= last_agbno) {
500		bool		other_owners;
501
502		error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1,
503				&XFS_RMAP_OINFO_AG, &other_owners);
504		if (error)
505			return error;
506
507		if (other_owners) {
508			error = xagb_bitmap_set(&ra->crossed, agbno, 1);
509			if (error)
510				return error;
511		}
512
513		if (xchk_should_terminate(ra->sc, &error))
514			return error;
515		agbno++;
516	}
517
518	return 0;
519}
520
521/*
522 * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
523 * which blocks belong to the AGFL.
524 *
525 * Compute the set of old AGFL blocks by subtracting from the list of OWN_AG
526 * blocks the list of blocks owned by all other OWN_AG metadata (bnobt, cntbt,
527 * rmapbt).  These are the old AGFL blocks, so return that list and the number
528 * of blocks we're actually going to put back on the AGFL.
529 */
530STATIC int
531xrep_agfl_collect_blocks(
532	struct xfs_scrub	*sc,
533	struct xfs_buf		*agf_bp,
534	struct xagb_bitmap	*agfl_extents,
535	xfs_agblock_t		*flcount)
536{
537	struct xrep_agfl	ra;
538	struct xfs_mount	*mp = sc->mp;
539	struct xfs_btree_cur	*cur;
540	int			error;
541
542	ra.sc = sc;
543	ra.freesp = agfl_extents;
544	xagb_bitmap_init(&ra.agmetablocks);
545	xagb_bitmap_init(&ra.crossed);
546
547	/* Find all space used by the free space btrees & rmapbt. */
548	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
549	error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
550	xfs_btree_del_cursor(cur, error);
551	if (error)
552		goto out_bmp;
553
554	/* Find all blocks currently being used by the bnobt. */
555	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
556	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
557	xfs_btree_del_cursor(cur, error);
558	if (error)
559		goto out_bmp;
560
561	/* Find all blocks currently being used by the cntbt. */
562	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
563	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
564	xfs_btree_del_cursor(cur, error);
565	if (error)
566		goto out_bmp;
567
568	/*
569	 * Drop the freesp meta blocks that are in use by btrees.
570	 * The remaining blocks /should/ be AGFL blocks.
571	 */
572	error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks);
573	if (error)
574		goto out_bmp;
575
576	/* Strike out the blocks that are cross-linked. */
577	ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
578	error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
579	xfs_btree_del_cursor(ra.rmap_cur, error);
580	if (error)
581		goto out_bmp;
582	error = xagb_bitmap_disunion(agfl_extents, &ra.crossed);
583	if (error)
584		goto out_bmp;
585
586	/*
587	 * Calculate the new AGFL size.  If we found more blocks than fit in
588	 * the AGFL we'll free them later.
589	 */
590	*flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents),
591			 xfs_agfl_size(mp));
592
593out_bmp:
594	xagb_bitmap_destroy(&ra.crossed);
595	xagb_bitmap_destroy(&ra.agmetablocks);
596	return error;
597}
598
599/* Update the AGF and reset the in-core state. */
600STATIC void
601xrep_agfl_update_agf(
602	struct xfs_scrub	*sc,
603	struct xfs_buf		*agf_bp,
604	xfs_agblock_t		flcount)
605{
606	struct xfs_agf		*agf = agf_bp->b_addr;
607
608	ASSERT(flcount <= xfs_agfl_size(sc->mp));
609
610	/* Trigger fdblocks recalculation */
611	xfs_force_summary_recalc(sc->mp);
612
613	/* Update the AGF counters. */
614	if (xfs_perag_initialised_agf(sc->sa.pag)) {
615		sc->sa.pag->pagf_flcount = flcount;
616		clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET,
617				&sc->sa.pag->pag_opstate);
618	}
619	agf->agf_flfirst = cpu_to_be32(0);
620	agf->agf_flcount = cpu_to_be32(flcount);
621	if (flcount)
622		agf->agf_fllast = cpu_to_be32(flcount - 1);
623	else
624		agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1);
625
626	xfs_alloc_log_agf(sc->tp, agf_bp,
627			XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
628}
629
630struct xrep_agfl_fill {
631	struct xagb_bitmap	used_extents;
632	struct xfs_scrub	*sc;
633	__be32			*agfl_bno;
634	xfs_agblock_t		flcount;
635	unsigned int		fl_off;
636};
637
638/* Fill the AGFL with whatever blocks are in this extent. */
639static int
640xrep_agfl_fill(
641	uint32_t		start,
642	uint32_t		len,
643	void			*priv)
644{
645	struct xrep_agfl_fill	*af = priv;
646	struct xfs_scrub	*sc = af->sc;
647	xfs_agblock_t		agbno = start;
648	int			error;
649
650	trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
651
652	while (agbno < start + len && af->fl_off < af->flcount)
653		af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
654
655	error = xagb_bitmap_set(&af->used_extents, start, agbno - 1);
656	if (error)
657		return error;
658
659	if (af->fl_off == af->flcount)
660		return -ECANCELED;
661
662	return 0;
663}
664
665/* Write out a totally new AGFL. */
666STATIC int
667xrep_agfl_init_header(
668	struct xfs_scrub	*sc,
669	struct xfs_buf		*agfl_bp,
670	struct xagb_bitmap	*agfl_extents,
671	xfs_agblock_t		flcount)
672{
673	struct xrep_agfl_fill	af = {
674		.sc		= sc,
675		.flcount	= flcount,
676	};
677	struct xfs_mount	*mp = sc->mp;
678	struct xfs_agfl		*agfl;
679	int			error;
680
681	ASSERT(flcount <= xfs_agfl_size(mp));
682
683	/*
684	 * Start rewriting the header by setting the bno[] array to
685	 * NULLAGBLOCK, then setting AGFL header fields.
686	 */
687	agfl = XFS_BUF_TO_AGFL(agfl_bp);
688	memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
689	agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
690	agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
691	uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
692
693	/*
694	 * Fill the AGFL with the remaining blocks.  If agfl_extents has more
695	 * blocks than fit in the AGFL, they will be freed in a subsequent
696	 * step.
697	 */
698	xagb_bitmap_init(&af.used_extents);
699	af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp),
700	xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
701	error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
702	if (error)
703		return error;
704
705	/* Write new AGFL to disk. */
706	xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
707	xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
708	xagb_bitmap_destroy(&af.used_extents);
709	return 0;
710}
711
712/* Repair the AGFL. */
713int
714xrep_agfl(
715	struct xfs_scrub	*sc)
716{
717	struct xagb_bitmap	agfl_extents;
718	struct xfs_mount	*mp = sc->mp;
719	struct xfs_buf		*agf_bp;
720	struct xfs_buf		*agfl_bp;
721	xfs_agblock_t		flcount;
722	int			error;
723
724	/* We require the rmapbt to rebuild anything. */
725	if (!xfs_has_rmapbt(mp))
726		return -EOPNOTSUPP;
727
728	xagb_bitmap_init(&agfl_extents);
729
730	/*
731	 * Read the AGF so that we can query the rmapbt.  We hope that there's
732	 * nothing wrong with the AGF, but all the AG header repair functions
733	 * have this chicken-and-egg problem.
734	 */
735	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
736	if (error)
737		return error;
738
739	/*
740	 * Make sure we have the AGFL buffer, as scrub might have decided it
741	 * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
742	 */
743	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
744			XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
745						XFS_AGFL_DADDR(mp)),
746			XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
747	if (error)
748		return error;
749	agfl_bp->b_ops = &xfs_agfl_buf_ops;
750
751	/* Gather all the extents we're going to put on the new AGFL. */
752	error = xrep_agfl_collect_blocks(sc, agf_bp, &agfl_extents, &flcount);
753	if (error)
754		goto err;
755
756	/* Last chance to abort before we start committing fixes. */
757	if (xchk_should_terminate(sc, &error))
758		goto err;
759
760	/*
761	 * Update AGF and AGFL.  We reset the global free block counter when
762	 * we adjust the AGF flcount (which can fail) so avoid updating any
763	 * buffers until we know that part works.
764	 */
765	xrep_agfl_update_agf(sc, agf_bp, flcount);
766	error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
767	if (error)
768		goto err;
769
770	/*
771	 * Ok, the AGFL should be ready to go now.  Roll the transaction to
772	 * make the new AGFL permanent before we start using it to return
773	 * freespace overflow to the freespace btrees.
774	 */
775	sc->sa.agf_bp = agf_bp;
776	error = xrep_roll_ag_trans(sc);
777	if (error)
778		goto err;
779
780	/* Dump any AGFL overflow. */
781	error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
782			XFS_AG_RESV_AGFL);
783	if (error)
784		goto err;
785
786err:
787	xagb_bitmap_destroy(&agfl_extents);
788	return error;
789}
790
791/* AGI */
792
793/*
794 * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
795 * XFS_BTNUM_ names here to avoid creating a sparse array.
796 */
797enum {
798	XREP_AGI_INOBT = 0,
799	XREP_AGI_FINOBT,
800	XREP_AGI_END,
801	XREP_AGI_MAX
802};
803
804#define XREP_AGI_LOOKUP_BATCH		32
805
806struct xrep_agi {
807	struct xfs_scrub		*sc;
808
809	/* AGI buffer, tracked separately */
810	struct xfs_buf			*agi_bp;
811
812	/* context for finding btree roots */
813	struct xrep_find_ag_btree	fab[XREP_AGI_MAX];
814
815	/* old AGI contents in case we have to revert */
816	struct xfs_agi			old_agi;
817
818	/* bitmap of which inodes are unlinked */
819	struct xagino_bitmap		iunlink_bmp;
820
821	/* heads of the unlinked inode bucket lists */
822	xfs_agino_t			iunlink_heads[XFS_AGI_UNLINKED_BUCKETS];
823
824	/* scratchpad for batched lookups of the radix tree */
825	struct xfs_inode		*lookup_batch[XREP_AGI_LOOKUP_BATCH];
826
827	/* Map of ino -> next_ino for unlinked inode processing. */
828	struct xfarray			*iunlink_next;
829
830	/* Map of ino -> prev_ino for unlinked inode processing. */
831	struct xfarray			*iunlink_prev;
832};
833
834static void
835xrep_agi_buf_cleanup(
836	void		*buf)
837{
838	struct xrep_agi	*ragi = buf;
839
840	xfarray_destroy(ragi->iunlink_prev);
841	xfarray_destroy(ragi->iunlink_next);
842	xagino_bitmap_destroy(&ragi->iunlink_bmp);
843}
844
845/*
846 * Given the inode btree roots described by *fab, find the roots, check them
847 * for sanity, and pass the root data back out via *fab.
848 */
849STATIC int
850xrep_agi_find_btrees(
851	struct xrep_agi			*ragi)
852{
853	struct xfs_scrub		*sc = ragi->sc;
854	struct xrep_find_ag_btree	*fab = ragi->fab;
855	struct xfs_buf			*agf_bp;
856	struct xfs_mount		*mp = sc->mp;
857	int				error;
858
859	/* Read the AGF. */
860	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
861	if (error)
862		return error;
863
864	/* Find the btree roots. */
865	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL);
866	if (error)
867		return error;
868
869	/* We must find the inobt root. */
870	if (!xrep_check_btree_root(sc, &fab[XREP_AGI_INOBT]))
871		return -EFSCORRUPTED;
872
873	/* We must find the finobt root if that feature is enabled. */
874	if (xfs_has_finobt(mp) &&
875	    !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT]))
876		return -EFSCORRUPTED;
877
878	return 0;
879}
880
881/*
882 * Reinitialize the AGI header, making an in-core copy of the old contents so
883 * that we know which in-core state needs to be reinitialized.
884 */
885STATIC void
886xrep_agi_init_header(
887	struct xrep_agi		*ragi)
888{
889	struct xfs_scrub	*sc = ragi->sc;
890	struct xfs_buf		*agi_bp = ragi->agi_bp;
891	struct xfs_agi		*old_agi = &ragi->old_agi;
892	struct xfs_agi		*agi = agi_bp->b_addr;
893	struct xfs_perag	*pag = sc->sa.pag;
894	struct xfs_mount	*mp = sc->mp;
895
896	memcpy(old_agi, agi, sizeof(*old_agi));
897	memset(agi, 0, BBTOB(agi_bp->b_length));
898	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
899	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
900	agi->agi_seqno = cpu_to_be32(pag->pag_agno);
901	agi->agi_length = cpu_to_be32(pag->block_count);
902	agi->agi_newino = cpu_to_be32(NULLAGINO);
903	agi->agi_dirino = cpu_to_be32(NULLAGINO);
904	if (xfs_has_crc(mp))
905		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
906
907	/* Mark the incore AGF data stale until we're done fixing things. */
908	ASSERT(xfs_perag_initialised_agi(pag));
909	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
910}
911
912/* Set btree root information in an AGI. */
913STATIC void
914xrep_agi_set_roots(
915	struct xrep_agi			*ragi)
916{
917	struct xfs_scrub		*sc = ragi->sc;
918	struct xfs_agi			*agi = ragi->agi_bp->b_addr;
919	struct xrep_find_ag_btree	*fab = ragi->fab;
920
921	agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
922	agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
923
924	if (xfs_has_finobt(sc->mp)) {
925		agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root);
926		agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height);
927	}
928}
929
930/* Update the AGI counters. */
931STATIC int
932xrep_agi_calc_from_btrees(
933	struct xrep_agi		*ragi)
934{
935	struct xfs_scrub	*sc = ragi->sc;
936	struct xfs_buf		*agi_bp = ragi->agi_bp;
937	struct xfs_btree_cur	*cur;
938	struct xfs_agi		*agi = agi_bp->b_addr;
939	struct xfs_mount	*mp = sc->mp;
940	xfs_agino_t		count;
941	xfs_agino_t		freecount;
942	int			error;
943
944	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
945	error = xfs_ialloc_count_inodes(cur, &count, &freecount);
946	if (error)
947		goto err;
948	if (xfs_has_inobtcounts(mp)) {
949		xfs_agblock_t	blocks;
950
951		error = xfs_btree_count_blocks(cur, &blocks);
952		if (error)
953			goto err;
954		agi->agi_iblocks = cpu_to_be32(blocks);
955	}
956	xfs_btree_del_cursor(cur, error);
957
958	agi->agi_count = cpu_to_be32(count);
959	agi->agi_freecount = cpu_to_be32(freecount);
960
961	if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
962		xfs_agblock_t	blocks;
963
964		cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
965		error = xfs_btree_count_blocks(cur, &blocks);
966		if (error)
967			goto err;
968		xfs_btree_del_cursor(cur, error);
969		agi->agi_fblocks = cpu_to_be32(blocks);
970	}
971
972	return 0;
973err:
974	xfs_btree_del_cursor(cur, error);
975	return error;
976}
977
978/*
979 * Record a forwards unlinked chain pointer from agino -> next_agino in our
980 * staging information.
981 */
982static inline int
983xrep_iunlink_store_next(
984	struct xrep_agi		*ragi,
985	xfs_agino_t		agino,
986	xfs_agino_t		next_agino)
987{
988	ASSERT(next_agino != 0);
989
990	return xfarray_store(ragi->iunlink_next, agino, &next_agino);
991}
992
993/*
994 * Record a backwards unlinked chain pointer from prev_ino <- agino in our
995 * staging information.
996 */
997static inline int
998xrep_iunlink_store_prev(
999	struct xrep_agi		*ragi,
1000	xfs_agino_t		agino,
1001	xfs_agino_t		prev_agino)
1002{
1003	ASSERT(prev_agino != 0);
1004
1005	return xfarray_store(ragi->iunlink_prev, agino, &prev_agino);
1006}
1007
1008/*
1009 * Given an @agino, look up the next inode in the iunlink bucket.  Returns
1010 * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory
1011 * like it should be, or a per-AG inode number.
1012 */
1013static inline xfs_agino_t
1014xrep_iunlink_next(
1015	struct xfs_scrub	*sc,
1016	xfs_agino_t		agino)
1017{
1018	struct xfs_inode	*ip;
1019
1020	ip = xfs_iunlink_lookup(sc->sa.pag, agino);
1021	if (!ip)
1022		return 0;
1023
1024	return ip->i_next_unlinked;
1025}
1026
1027/*
1028 * Load the inode @agino into memory, set its i_prev_unlinked, and drop the
1029 * inode so it can be inactivated.  Returns NULLAGINO if we're at the end of
1030 * the chain or if we should stop walking the chain due to corruption; or a
1031 * per-AG inode number.
1032 */
1033STATIC xfs_agino_t
1034xrep_iunlink_reload_next(
1035	struct xrep_agi		*ragi,
1036	xfs_agino_t		prev_agino,
1037	xfs_agino_t		agino)
1038{
1039	struct xfs_scrub	*sc = ragi->sc;
1040	struct xfs_inode	*ip;
1041	xfs_ino_t		ino;
1042	xfs_agino_t		ret = NULLAGINO;
1043	int			error;
1044
1045	ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino);
1046	error = xchk_iget(ragi->sc, ino, &ip);
1047	if (error)
1048		return ret;
1049
1050	trace_xrep_iunlink_reload_next(ip, prev_agino);
1051
1052	/* If this is a linked inode, stop processing the chain. */
1053	if (VFS_I(ip)->i_nlink != 0) {
1054		xrep_iunlink_store_next(ragi, agino, NULLAGINO);
1055		goto rele;
1056	}
1057
1058	ip->i_prev_unlinked = prev_agino;
1059	ret = ip->i_next_unlinked;
1060
1061	/*
1062	 * Drop the inode reference that we just took.  We hold the AGI, so
1063	 * this inode cannot move off the unlinked list and hence cannot be
1064	 * reclaimed.
1065	 */
1066rele:
1067	xchk_irele(sc, ip);
1068	return ret;
1069}
1070
1071/*
1072 * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that
1073 * still existed at mount time.  This can happen if iunlink processing fails
1074 * during log recovery.
1075 */
1076STATIC int
1077xrep_iunlink_walk_ondisk_bucket(
1078	struct xrep_agi		*ragi,
1079	unsigned int		bucket)
1080{
1081	struct xfs_scrub	*sc = ragi->sc;
1082	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
1083	xfs_agino_t		prev_agino = NULLAGINO;
1084	xfs_agino_t		next_agino;
1085	int			error = 0;
1086
1087	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
1088	while (next_agino != NULLAGINO) {
1089		xfs_agino_t	agino = next_agino;
1090
1091		if (xchk_should_terminate(ragi->sc, &error))
1092			return error;
1093
1094		trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket,
1095				prev_agino, agino);
1096
1097		if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS)
1098			break;
1099
1100		next_agino = xrep_iunlink_next(sc, agino);
1101		if (!next_agino)
1102			next_agino = xrep_iunlink_reload_next(ragi, prev_agino,
1103					agino);
1104
1105		prev_agino = agino;
1106	}
1107
1108	return 0;
1109}
1110
1111/* Decide if this is an unlinked inode in this AG. */
1112STATIC bool
1113xrep_iunlink_igrab(
1114	struct xfs_perag	*pag,
1115	struct xfs_inode	*ip)
1116{
1117	struct xfs_mount	*mp = pag->pag_mount;
1118
1119	if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1120		return false;
1121
1122	if (!xfs_inode_on_unlinked_list(ip))
1123		return false;
1124
1125	return true;
1126}
1127
1128/*
1129 * Mark the given inode in the lookup batch in our unlinked inode bitmap, and
1130 * remember if this inode is the start of the unlinked chain.
1131 */
1132STATIC int
1133xrep_iunlink_visit(
1134	struct xrep_agi		*ragi,
1135	unsigned int		batch_idx)
1136{
1137	struct xfs_mount	*mp = ragi->sc->mp;
1138	struct xfs_inode	*ip = ragi->lookup_batch[batch_idx];
1139	xfs_agino_t		agino;
1140	unsigned int		bucket;
1141	int			error;
1142
1143	ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno);
1144	ASSERT(xfs_inode_on_unlinked_list(ip));
1145
1146	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1147	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1148
1149	trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket,
1150			ragi->iunlink_heads[bucket], ip);
1151
1152	error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1153	if (error)
1154		return error;
1155
1156	if (ip->i_prev_unlinked == NULLAGINO) {
1157		if (ragi->iunlink_heads[bucket] == NULLAGINO)
1158			ragi->iunlink_heads[bucket] = agino;
1159	}
1160
1161	return 0;
1162}
1163
1164/*
1165 * Find all incore unlinked inodes so that we can rebuild the unlinked buckets.
1166 * We hold the AGI so there should not be any modifications to the unlinked
1167 * list.
1168 */
1169STATIC int
1170xrep_iunlink_mark_incore(
1171	struct xrep_agi		*ragi)
1172{
1173	struct xfs_perag	*pag = ragi->sc->sa.pag;
1174	struct xfs_mount	*mp = pag->pag_mount;
1175	uint32_t		first_index = 0;
1176	bool			done = false;
1177	unsigned int		nr_found = 0;
1178
1179	do {
1180		unsigned int	i;
1181		int		error = 0;
1182
1183		if (xchk_should_terminate(ragi->sc, &error))
1184			return error;
1185
1186		rcu_read_lock();
1187
1188		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1189				(void **)&ragi->lookup_batch, first_index,
1190				XREP_AGI_LOOKUP_BATCH);
1191		if (!nr_found) {
1192			rcu_read_unlock();
1193			return 0;
1194		}
1195
1196		for (i = 0; i < nr_found; i++) {
1197			struct xfs_inode *ip = ragi->lookup_batch[i];
1198
1199			if (done || !xrep_iunlink_igrab(pag, ip))
1200				ragi->lookup_batch[i] = NULL;
1201
1202			/*
1203			 * Update the index for the next lookup. Catch
1204			 * overflows into the next AG range which can occur if
1205			 * we have inodes in the last block of the AG and we
1206			 * are currently pointing to the last inode.
1207			 *
1208			 * Because we may see inodes that are from the wrong AG
1209			 * due to RCU freeing and reallocation, only update the
1210			 * index if it lies in this AG. It was a race that lead
1211			 * us to see this inode, so another lookup from the
1212			 * same index will not find it again.
1213			 */
1214			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1215				continue;
1216			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1217			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1218				done = true;
1219		}
1220
1221		/* unlock now we've grabbed the inodes. */
1222		rcu_read_unlock();
1223
1224		for (i = 0; i < nr_found; i++) {
1225			if (!ragi->lookup_batch[i])
1226				continue;
1227			error = xrep_iunlink_visit(ragi, i);
1228			if (error)
1229				return error;
1230		}
1231	} while (!done);
1232
1233	return 0;
1234}
1235
1236/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */
1237STATIC int
1238xrep_iunlink_mark_ondisk_rec(
1239	struct xfs_btree_cur		*cur,
1240	const union xfs_btree_rec	*rec,
1241	void				*priv)
1242{
1243	struct xfs_inobt_rec_incore	irec;
1244	struct xrep_agi			*ragi = priv;
1245	struct xfs_scrub		*sc = ragi->sc;
1246	struct xfs_mount		*mp = cur->bc_mp;
1247	xfs_agino_t			agino;
1248	unsigned int			i;
1249	int				error = 0;
1250
1251	xfs_inobt_btrec_to_irec(mp, rec, &irec);
1252
1253	for (i = 0, agino = irec.ir_startino;
1254	     i < XFS_INODES_PER_CHUNK;
1255	     i++, agino++) {
1256		struct xfs_inode	*ip;
1257		unsigned int		len = 1;
1258
1259		/* Skip free inodes */
1260		if (XFS_INOBT_MASK(i) & irec.ir_free)
1261			continue;
1262		/* Skip inodes we've seen before */
1263		if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len))
1264			continue;
1265
1266		/*
1267		 * Skip incore inodes; these were already picked up by
1268		 * the _mark_incore step.
1269		 */
1270		rcu_read_lock();
1271		ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino);
1272		rcu_read_unlock();
1273		if (ip)
1274			continue;
1275
1276		/*
1277		 * Try to look up this inode.  If we can't get it, just move
1278		 * on because we haven't actually scrubbed the inobt or the
1279		 * inodes yet.
1280		 */
1281		error = xchk_iget(ragi->sc,
1282				XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno,
1283						 agino),
1284				&ip);
1285		if (error)
1286			continue;
1287
1288		trace_xrep_iunlink_reload_ondisk(ip);
1289
1290		if (VFS_I(ip)->i_nlink == 0)
1291			error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
1292		xchk_irele(sc, ip);
1293		if (error)
1294			break;
1295	}
1296
1297	return error;
1298}
1299
1300/*
1301 * Find ondisk inodes that are unlinked and not in cache, and mark them in
1302 * iunlink_bmp.   We haven't checked the inobt yet, so we don't error out if
1303 * the btree is corrupt.
1304 */
1305STATIC void
1306xrep_iunlink_mark_ondisk(
1307	struct xrep_agi		*ragi)
1308{
1309	struct xfs_scrub	*sc = ragi->sc;
1310	struct xfs_buf		*agi_bp = ragi->agi_bp;
1311	struct xfs_btree_cur	*cur;
1312	int			error;
1313
1314	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
1315	error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
1316	xfs_btree_del_cursor(cur, error);
1317}
1318
1319/*
1320 * Walk an iunlink bucket's inode list.  For each inode that should be on this
1321 * chain, clear its entry in in iunlink_bmp because it's ok and we don't need
1322 * to touch it further.
1323 */
1324STATIC int
1325xrep_iunlink_resolve_bucket(
1326	struct xrep_agi		*ragi,
1327	unsigned int		bucket)
1328{
1329	struct xfs_scrub	*sc = ragi->sc;
1330	struct xfs_inode	*ip;
1331	xfs_agino_t		prev_agino = NULLAGINO;
1332	xfs_agino_t		next_agino = ragi->iunlink_heads[bucket];
1333	int			error = 0;
1334
1335	while (next_agino != NULLAGINO) {
1336		if (xchk_should_terminate(ragi->sc, &error))
1337			return error;
1338
1339		/* Find the next inode in the chain. */
1340		ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
1341		if (!ip) {
1342			/* Inode not incore?  Terminate the chain. */
1343			trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
1344					bucket, prev_agino, next_agino);
1345
1346			next_agino = NULLAGINO;
1347			break;
1348		}
1349
1350		if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
1351			/*
1352			 * Inode is in the wrong bucket.  Advance the list,
1353			 * but pretend we didn't see this inode.
1354			 */
1355			trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
1356					bucket, prev_agino, next_agino);
1357
1358			next_agino = ip->i_next_unlinked;
1359			continue;
1360		}
1361
1362		if (!xfs_inode_on_unlinked_list(ip)) {
1363			/*
1364			 * Incore inode doesn't think this inode is on an
1365			 * unlinked list.  This is probably because we reloaded
1366			 * it from disk.  Advance the list, but pretend we
1367			 * didn't see this inode; we'll fix that later.
1368			 */
1369			trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
1370					bucket, prev_agino, next_agino);
1371			next_agino = ip->i_next_unlinked;
1372			continue;
1373		}
1374
1375		trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
1376				next_agino);
1377
1378		/*
1379		 * Otherwise, this inode's unlinked pointers are ok.  Clear it
1380		 * from the unlinked bitmap since we're done with it, and make
1381		 * sure the chain is still correct.
1382		 */
1383		error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
1384		if (error)
1385			return error;
1386
1387		/* Remember the previous inode's next pointer. */
1388		if (prev_agino != NULLAGINO) {
1389			error = xrep_iunlink_store_next(ragi, prev_agino,
1390					next_agino);
1391			if (error)
1392				return error;
1393		}
1394
1395		/* Remember this inode's previous pointer. */
1396		error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
1397		if (error)
1398			return error;
1399
1400		/* Advance the list and remember this inode. */
1401		prev_agino = next_agino;
1402		next_agino = ip->i_next_unlinked;
1403	}
1404
1405	/* Update the previous inode's next pointer. */
1406	if (prev_agino != NULLAGINO) {
1407		error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
1408		if (error)
1409			return error;
1410	}
1411
1412	return 0;
1413}
1414
1415/* Reinsert this unlinked inode into the head of the staged bucket list. */
1416STATIC int
1417xrep_iunlink_add_to_bucket(
1418	struct xrep_agi		*ragi,
1419	xfs_agino_t		agino)
1420{
1421	xfs_agino_t		current_head;
1422	unsigned int		bucket;
1423	int			error;
1424
1425	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
1426
1427	/* Point this inode at the current head of the bucket list. */
1428	current_head = ragi->iunlink_heads[bucket];
1429
1430	trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
1431			current_head);
1432
1433	error = xrep_iunlink_store_next(ragi, agino, current_head);
1434	if (error)
1435		return error;
1436
1437	/* Remember the head inode's previous pointer. */
1438	if (current_head != NULLAGINO) {
1439		error = xrep_iunlink_store_prev(ragi, current_head, agino);
1440		if (error)
1441			return error;
1442	}
1443
1444	ragi->iunlink_heads[bucket] = agino;
1445	return 0;
1446}
1447
1448/* Reinsert unlinked inodes into the staged iunlink buckets. */
1449STATIC int
1450xrep_iunlink_add_lost_inodes(
1451	uint32_t		start,
1452	uint32_t		len,
1453	void			*priv)
1454{
1455	struct xrep_agi		*ragi = priv;
1456	int			error;
1457
1458	for (; len > 0; start++, len--) {
1459		error = xrep_iunlink_add_to_bucket(ragi, start);
1460		if (error)
1461			return error;
1462	}
1463
1464	return 0;
1465}
1466
1467/*
1468 * Figure out the iunlink bucket values and find inodes that need to be
1469 * reinserted into the list.
1470 */
1471STATIC int
1472xrep_iunlink_rebuild_buckets(
1473	struct xrep_agi		*ragi)
1474{
1475	unsigned int		i;
1476	int			error;
1477
1478	/*
1479	 * Walk the ondisk AGI unlinked list to find inodes that are on the
1480	 * list but aren't in memory.  This can happen if a past log recovery
1481	 * tried to clear the iunlinked list but failed.  Our scan rebuilds the
1482	 * unlinked list using incore inodes, so we must load and link them
1483	 * properly.
1484	 */
1485	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1486		error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
1487		if (error)
1488			return error;
1489	}
1490
1491	/*
1492	 * Record all the incore unlinked inodes in iunlink_bmp that we didn't
1493	 * find by walking the ondisk iunlink buckets.  This shouldn't happen,
1494	 * but we can't risk forgetting an inode somewhere.
1495	 */
1496	error = xrep_iunlink_mark_incore(ragi);
1497	if (error)
1498		return error;
1499
1500	/*
1501	 * If there are ondisk inodes that are unlinked and are not been loaded
1502	 * into cache, record them in iunlink_bmp.
1503	 */
1504	xrep_iunlink_mark_ondisk(ragi);
1505
1506	/*
1507	 * Walk each iunlink bucket to (re)construct as much of the incore list
1508	 * as would be correct.  For each inode that survives this step, mark
1509	 * it clear in iunlink_bmp; we're done with those inodes.
1510	 */
1511	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1512		error = xrep_iunlink_resolve_bucket(ragi, i);
1513		if (error)
1514			return error;
1515	}
1516
1517	/*
1518	 * Any unlinked inodes that we didn't find through the bucket list
1519	 * walk (or was ignored by the walk) must be inserted into the bucket
1520	 * list.  Stage this in memory for now.
1521	 */
1522	return xagino_bitmap_walk(&ragi->iunlink_bmp,
1523			xrep_iunlink_add_lost_inodes, ragi);
1524}
1525
1526/* Update i_next_iunlinked for the inode @agino. */
1527STATIC int
1528xrep_iunlink_relink_next(
1529	struct xrep_agi		*ragi,
1530	xfarray_idx_t		idx,
1531	xfs_agino_t		next_agino)
1532{
1533	struct xfs_scrub	*sc = ragi->sc;
1534	struct xfs_perag	*pag = sc->sa.pag;
1535	struct xfs_inode	*ip;
1536	xfarray_idx_t		agino = idx - 1;
1537	bool			want_rele = false;
1538	int			error = 0;
1539
1540	ip = xfs_iunlink_lookup(pag, agino);
1541	if (!ip) {
1542		xfs_ino_t	ino;
1543		xfs_agino_t	prev_agino;
1544
1545		/*
1546		 * No inode exists in cache.  Load it off the disk so that we
1547		 * can reinsert it into the incore unlinked list.
1548		 */
1549		ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1550		error = xchk_iget(sc, ino, &ip);
1551		if (error)
1552			return -EFSCORRUPTED;
1553
1554		want_rele = true;
1555
1556		/* Set the backward pointer since this just came off disk. */
1557		error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
1558		if (error)
1559			goto out_rele;
1560
1561		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1562		ip->i_prev_unlinked = prev_agino;
1563	}
1564
1565	/* Update the forward pointer. */
1566	if (ip->i_next_unlinked != next_agino) {
1567		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1568		if (error)
1569			goto out_rele;
1570
1571		trace_xrep_iunlink_relink_next(ip, next_agino);
1572		ip->i_next_unlinked = next_agino;
1573	}
1574
1575out_rele:
1576	/*
1577	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1578	 * and the inode cannot be reclaimed.  However, if we used iget to load
1579	 * a missing inode, we must irele it here.
1580	 */
1581	if (want_rele)
1582		xchk_irele(sc, ip);
1583	return error;
1584}
1585
1586/* Update i_prev_iunlinked for the inode @agino. */
1587STATIC int
1588xrep_iunlink_relink_prev(
1589	struct xrep_agi		*ragi,
1590	xfarray_idx_t		idx,
1591	xfs_agino_t		prev_agino)
1592{
1593	struct xfs_scrub	*sc = ragi->sc;
1594	struct xfs_perag	*pag = sc->sa.pag;
1595	struct xfs_inode	*ip;
1596	xfarray_idx_t		agino = idx - 1;
1597	bool			want_rele = false;
1598	int			error = 0;
1599
1600	ASSERT(prev_agino != 0);
1601
1602	ip = xfs_iunlink_lookup(pag, agino);
1603	if (!ip) {
1604		xfs_ino_t	ino;
1605		xfs_agino_t	next_agino;
1606
1607		/*
1608		 * No inode exists in cache.  Load it off the disk so that we
1609		 * can reinsert it into the incore unlinked list.
1610		 */
1611		ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1612		error = xchk_iget(sc, ino, &ip);
1613		if (error)
1614			return -EFSCORRUPTED;
1615
1616		want_rele = true;
1617
1618		/* Set the forward pointer since this just came off disk. */
1619		error = xfarray_load(ragi->iunlink_prev, agino, &next_agino);
1620		if (error)
1621			goto out_rele;
1622
1623		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
1624		if (error)
1625			goto out_rele;
1626
1627		trace_xrep_iunlink_relink_next(ip, next_agino);
1628		ip->i_next_unlinked = next_agino;
1629	}
1630
1631	/* Update the backward pointer. */
1632	if (ip->i_prev_unlinked != prev_agino) {
1633		trace_xrep_iunlink_relink_prev(ip, prev_agino);
1634		ip->i_prev_unlinked = prev_agino;
1635	}
1636
1637out_rele:
1638	/*
1639	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
1640	 * and the inode cannot be reclaimed.  However, if we used iget to load
1641	 * a missing inode, we must irele it here.
1642	 */
1643	if (want_rele)
1644		xchk_irele(sc, ip);
1645	return error;
1646}
1647
1648/* Log all the iunlink updates we need to finish regenerating the AGI. */
1649STATIC int
1650xrep_iunlink_commit(
1651	struct xrep_agi		*ragi)
1652{
1653	struct xfs_agi		*agi = ragi->agi_bp->b_addr;
1654	xfarray_idx_t		idx = XFARRAY_CURSOR_INIT;
1655	xfs_agino_t		agino;
1656	unsigned int		i;
1657	int			error;
1658
1659	/* Fix all the forward links */
1660	while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
1661		error = xrep_iunlink_relink_next(ragi, idx, agino);
1662		if (error)
1663			return error;
1664	}
1665
1666	/* Fix all the back links */
1667	idx = XFARRAY_CURSOR_INIT;
1668	while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
1669		error = xrep_iunlink_relink_prev(ragi, idx, agino);
1670		if (error)
1671			return error;
1672	}
1673
1674	/* Copy the staged iunlink buckets to the new AGI. */
1675	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
1676		trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
1677				be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
1678				ragi->iunlink_heads[i]);
1679
1680		agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
1681	}
1682
1683	return 0;
1684}
1685
1686/* Trigger reinitialization of the in-core data. */
1687STATIC int
1688xrep_agi_commit_new(
1689	struct xrep_agi		*ragi)
1690{
1691	struct xfs_scrub	*sc = ragi->sc;
1692	struct xfs_buf		*agi_bp = ragi->agi_bp;
1693	struct xfs_perag	*pag;
1694	struct xfs_agi		*agi = agi_bp->b_addr;
1695
1696	/* Trigger inode count recalculation */
1697	xfs_force_summary_recalc(sc->mp);
1698
1699	/* Write this to disk. */
1700	xfs_trans_buf_set_type(sc->tp, agi_bp, XFS_BLFT_AGI_BUF);
1701	xfs_trans_log_buf(sc->tp, agi_bp, 0, BBTOB(agi_bp->b_length) - 1);
1702
1703	/* Now reinitialize the in-core counters if necessary. */
1704	pag = sc->sa.pag;
1705	pag->pagi_count = be32_to_cpu(agi->agi_count);
1706	pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1707	set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
1708
1709	return xrep_roll_ag_trans(sc);
1710}
1711
1712/* Repair the AGI. */
1713int
1714xrep_agi(
1715	struct xfs_scrub	*sc)
1716{
1717	struct xrep_agi		*ragi;
1718	struct xfs_mount	*mp = sc->mp;
1719	char			*descr;
1720	unsigned int		i;
1721	int			error;
1722
1723	/* We require the rmapbt to rebuild anything. */
1724	if (!xfs_has_rmapbt(mp))
1725		return -EOPNOTSUPP;
1726
1727	sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS);
1728	if (!sc->buf)
1729		return -ENOMEM;
1730	ragi = sc->buf;
1731	ragi->sc = sc;
1732
1733	ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){
1734		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1735		.buf_ops	= &xfs_inobt_buf_ops,
1736		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1737	};
1738	ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){
1739		.rmap_owner	= XFS_RMAP_OWN_INOBT,
1740		.buf_ops	= &xfs_finobt_buf_ops,
1741		.maxlevels	= M_IGEO(sc->mp)->inobt_maxlevels,
1742	};
1743	ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){
1744		.buf_ops	= NULL,
1745	};
1746
1747	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1748		ragi->iunlink_heads[i] = NULLAGINO;
1749
1750	xagino_bitmap_init(&ragi->iunlink_bmp);
1751	sc->buf_cleanup = xrep_agi_buf_cleanup;
1752
1753	descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers");
1754	error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
1755			&ragi->iunlink_next);
1756	kfree(descr);
1757	if (error)
1758		return error;
1759
1760	descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers");
1761	error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
1762			&ragi->iunlink_prev);
1763	kfree(descr);
1764	if (error)
1765		return error;
1766
1767	/*
1768	 * Make sure we have the AGI buffer, as scrub might have decided it
1769	 * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
1770	 */
1771	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
1772			XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
1773						XFS_AGI_DADDR(mp)),
1774			XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
1775	if (error)
1776		return error;
1777	ragi->agi_bp->b_ops = &xfs_agi_buf_ops;
1778
1779	/* Find the AGI btree roots. */
1780	error = xrep_agi_find_btrees(ragi);
1781	if (error)
1782		return error;
1783
1784	error = xrep_iunlink_rebuild_buckets(ragi);
1785	if (error)
1786		return error;
1787
1788	/* Last chance to abort before we start committing fixes. */
1789	if (xchk_should_terminate(sc, &error))
1790		return error;
1791
1792	/* Start rewriting the header and implant the btrees we found. */
1793	xrep_agi_init_header(ragi);
1794	xrep_agi_set_roots(ragi);
1795	error = xrep_agi_calc_from_btrees(ragi);
1796	if (error)
1797		goto out_revert;
1798	error = xrep_iunlink_commit(ragi);
1799	if (error)
1800		goto out_revert;
1801
1802	/* Reinitialize in-core state. */
1803	return xrep_agi_commit_new(ragi);
1804
1805out_revert:
1806	/* Mark the incore AGI state stale and revert the AGI. */
1807	clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
1808	memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
1809	return error;
1810}
1811