// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"

/*
 * Disposal of Blocks from Old Metadata
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap.  In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk.  The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked.  Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space.  Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block.  If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be a rmap record and everything is ok.  For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal.  If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers/inode for the entire
 * rebuild operation so that nothing else can sneak in and change the incore
 * state while we're not looking.  We must also invalidate any buffers
 * associated with @bitmap.
 */
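
/*
 * Sketch of a typical caller, assuming the repair code has already collected
 * the old blocks in an xagb_bitmap (the bitmap name and owner below are
 * illustrative, not taken from any particular repair function):
 *
 *	struct xagb_bitmap	old_blocks;	// hypothetical collection
 *
 *	error = xrep_reap_agblocks(sc, &old_blocks, &XFS_RMAP_OINFO_AG,
 *			XFS_AG_RESV_NONE);
 */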

/* Information about reaping extents after a repair. */
struct xreap_state {
	struct xfs_scrub		*sc;

	/* Reverse mapping owner and metadata reservation type. */
	const struct xfs_owner_info	*oinfo;
	enum xfs_ag_resv_type		resv;

	/* If true, roll the transaction before reaping the next extent. */
	bool				force_roll;

	/* Number of deferred reaps attached to the current transaction. */
	unsigned int			deferred;

	/* Number of invalidated buffers logged to the current transaction. */
	unsigned int			invalidated;

	/* Number of deferred reaps queued during the whole reap sequence. */
	unsigned long long		total_deferred;
};

/* Put a block back on the AGFL. */
STATIC int
xreap_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	struct xfs_buf		*agfl_bp;
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, 0);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			agfl_bp, agbno, 0);
	if (error)
		return error;
	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/* Are there any uncommitted reap operations? */
static inline bool xreap_dirty(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred)
		return true;
	if (rs->invalidated)
		return true;
	if (rs->total_deferred)
		return true;
	return false;
}

#define XREAP_MAX_BINVAL	(2048)

/*
 * Decide if we want to roll the transaction after reaping an extent.  We don't
 * want to overrun the transaction reservation, so we prohibit more than
 * 128 EFIs per transaction.  For the same reason, we limit the number
 * of buffer invalidations to 2048.
 */
static inline bool xreap_want_roll(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
		return true;
	if (rs->invalidated > XREAP_MAX_BINVAL)
		return true;
	return false;
}

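/* Reset the reap state counters after the caller rolls the transaction. */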
static inline void xreap_reset(struct xreap_state *rs)
{
	rs->total_deferred += rs->deferred;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}

#define XREAP_MAX_DEFER_CHAIN		(2048)

/*
 * Decide if we want to finish the deferred ops that are attached to the scrub
 * transaction.  We don't want to queue huge chains of deferred ops because
 * that can consume a lot of log space and kernel memory.  Hence we trigger an
 * xfs_defer_finish if there are more than 2048 deferred reap operations or the
 * caller did some real work.
 */
static inline bool
xreap_want_defer_finish(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
		return true;
	return false;
}

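/* Reset all of the reap state counters after finishing the deferred ops. */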
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
	rs->total_deferred = 0;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}
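
/*
 * The two want/reset helper pairs above are used by the extent walkers below
 * in a loop of roughly this shape:
 *
 *	if (xreap_want_defer_finish(rs)) {
 *		error = xrep_defer_finish(sc);
 *		...
 *		xreap_defer_finish_reset(rs);
 *	} else if (xreap_want_roll(rs)) {
 *		error = xrep_roll_ag_trans(sc);
 *		...
 *		xreap_reset(rs);
 *	}
 */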

/*
 * Compute the maximum length of a buffer cache scan (in units of sectors),
 * given a quantity of fs blocks.
 */
xfs_daddr_t
xrep_bufscan_max_sectors(
	struct xfs_mount	*mp,
	xfs_extlen_t		fsblocks)
{
	int			max_fsbs;

	/* Remote xattr values are the largest buffers that we support. */
	max_fsbs = xfs_attr3_max_rmt_blocks(mp);

	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
}
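
/*
 * A buffer scan as performed by the binval functions below (a sketch; struct
 * xrep_bufscan and its fields are assumed to be declared in scrub/reap.h):
 *
 *	struct xrep_bufscan	scan = {
 *		.daddr		= XFS_AGB_TO_DADDR(mp, agno, bno),
 *		.max_sectors	= xrep_bufscan_max_sectors(mp, fsblocks),
 *		.daddr_step	= XFS_FSB_TO_BB(mp, 1),
 *	};
 *	struct xfs_buf		*bp;
 *
 *	while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
 *		// invalidate or release bp here
 *	}
 */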

/*
 * Return an incore buffer from a sector scan, or NULL if there are no buffers
 * left to return.
 */
struct xfs_buf *
xrep_bufscan_advance(
	struct xfs_mount	*mp,
	struct xrep_bufscan	*scan)
{
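	/* Resume the scan at the next buffer length to probe at this daddr. */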
	scan->__sector_count += scan->daddr_step;
	while (scan->__sector_count <= scan->max_sectors) {
		struct xfs_buf	*bp = NULL;
		int		error;

		error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
				scan->__sector_count, XBF_LIVESCAN, &bp);
		if (!error)
			return bp;

		scan->__sector_count += scan->daddr_step;
	}

	return NULL;
}

/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_mount	*mp = sc->mp;
	xfs_agnumber_t		agno = sc->sa.pag->pag_agno;
	xfs_agblock_t		agbno_next = agbno + *aglenp;
	xfs_agblock_t		bno = agbno;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them.  We
	 * assume that the lack of any other known owners means that the buffer
	 * can be locked without risk of deadlocking.  The buffer cache cannot
	 * detect aliasing, so employ nested loops to scan for incore buffers
	 * of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= XFS_AGB_TO_DADDR(mp, agno, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							agbno_next - bno),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);
			rs->invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (rs->invalidated > XREAP_MAX_BINVAL) {
				*aglenp -= agbno_next - bno;
				goto out;
			}
		}

		bno++;
	}

out:
	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
}

/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call.  Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed.  AGFL blocks can only be put back one at
 * a time.
 */
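/*
 * For example, if the first block of the extent has another rmap owner but
 * the next block does not, the first call selects a single block with
 * *crosslinked set, and the following call begins a new run at that second
 * block.
 */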
STATIC int
xreap_agextent_select(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_agblock_t		agbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_agblock_t		bno = agbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent.  If so, the block is crosslinked.
	 */
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/* AGFL blocks can only be dealt with one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL)
		goto out_found;

	/*
	 * Figure out how many of the subsequent blocks have the same crosslink
	 * status.
	 */
	while (bno < agbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

out_found:
	*aglenp = len;
	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Dispose of as much of the beginning of this AG extent as possible.  The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	int			error = 0;

	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);

		rs->force_roll = true;

		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
			/*
			 * If we're unmapping CoW staging extents, remove the
			 * records from the refcountbt, which will remove the
			 * rmap record as well.
			 */
			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
			return 0;
		}

		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
				*aglenp, rs->oinfo);
	}

	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);

	/*
	 * Invalidate as many buffers as we can, starting at agbno.  If this
	 * function sets *aglenp to zero, the transaction is full of logged
	 * buffer invalidations, so we need to return early so that we can
	 * roll and retry.
	 */
	xreap_agextent_binval(rs, agbno, aglenp);
	if (*aglenp == 0) {
		ASSERT(xreap_want_roll(rs));
		return 0;
	}

	/*
	 * If we're getting rid of CoW staging extents, use deferred work items
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent.  We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out the
	 * CoW staging extents.
	 */
	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->resv == XFS_AG_RESV_NONE);

		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
				rs->resv, true);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/* Put blocks back on the AGFL one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL) {
		ASSERT(*aglenp == 1);
		error = xreap_put_freelist(sc, agbno);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/*
	 * Use deferred frees to get rid of the old btree blocks to try to
	 * minimize the window in which we could crash and lose the old blocks.
	 * Add a defer ops barrier every other extent to avoid stressing the
	 * system with large EFIs.
	 */
	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
			rs->resv, true);
	if (error)
		return error;

	rs->deferred++;
	if (rs->deferred % 2 == 0)
		xfs_defer_add_barrier(sc->tp);
	return 0;
}

/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
	uint32_t		agbno,
	uint32_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip == NULL);

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			return error;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			return error;

		if (xreap_want_defer_finish(rs)) {
			error = xrep_defer_finish(sc);
			if (error)
				return error;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

	return 0;
}

/* Dispose of every block of every AG metadata extent in the bitmap. */
int
xrep_reap_agblocks(
	struct xfs_scrub		*sc,
	struct xagb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xreap_state		rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= type,
	};
	int				error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip == NULL);

	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}

/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.  The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
	uint64_t		fsbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sa.pag);

	/*
	 * We're reaping blocks after repairing file metadata, which means that
	 * we have to init the xchk_ag structure ourselves.
	 */
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			goto out_agf;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			goto out_agf;

		if (xreap_want_defer_finish(rs)) {
			/*
			 * Hold the AGF buffer across the deferred chain
			 * processing.
			 */
			error = xrep_defer_finish(sc);
			if (error)
				goto out_agf;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			/*
			 * Hold the AGF buffer across the transaction roll so
			 * that we don't have to reattach it to the scrub
			 * context.
			 */
			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
			if (error)
				goto out_agf;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of every block of every fs metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_fsblocks(
	struct xfs_scrub		*sc,
	struct xfsb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state		rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= XFS_AG_RESV_NONE,
	};
	int				error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);

	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}

/*
 * Metadata files are not supposed to share blocks with anything else.
 * If blocks are shared, we remove the reverse mapping (thus reducing the
 * crosslink factor); if blocks are not shared, we also need to free them.
 *
 * This first step determines the longest subset of the passed-in imap
 * (starting at its beginning) that is either crosslinked or not crosslinked.
 * The blockcount will be adjusted down as needed.
 */
STATIC int
xreap_bmapi_select(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap,
	bool			*crosslinked)
{
	struct xfs_owner_info	oinfo;
	struct xfs_btree_cur	*cur;
	xfs_filblks_t		len = 1;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;

	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
	if (error)
		goto out_cur;

	bno = agbno + 1;
	while (bno < agbno_next) {
		bool		also_crosslinked;

		oinfo.oi_offset++;
		error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (also_crosslinked != *crosslinked)
			break;

		len++;
		bno++;
	}

	imap->br_blockcount = len;
	trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Decide if this buffer can be joined to a transaction.  This is true for most
 * buffers, but there are two cases that we want to catch: large remote xattr
 * value buffers are not logged and can overflow the buffer log item dirty
 * bitmap size; and oversized cached buffers if things have really gone
 * haywire.
 */
static inline bool
xreap_buf_loggable(
	const struct xfs_buf	*bp)
{
	int			i;

	for (i = 0; i < bp->b_map_count; i++) {
		int		chunks;
		int		map_size;

		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);
		if (map_size > XFS_BLF_DATAMAP_SIZE)
			return false;
	}

	return true;
}
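
/*
 * Sanity-check of the math above, assuming the usual constants
 * (XFS_BLF_CHUNK = 128 bytes, NBWORD = 32 bits, and a dirty bitmap sized
 * for a 64k XFS_MAX_BLOCKSIZE): a 64k buffer mapping needs 512 chunk bits,
 * or 16 bitmap words, which is the entire bitmap.  Anything larger cannot
 * be logged and must be staled by hand instead.
 */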

/*
 * Invalidate any buffers for this file mapping.  The @imap blockcount may be
 * adjusted downward if we need to roll the transaction.
 */
STATIC int
xreap_bmapi_binval(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	xfs_fileoff_t		off;
	xfs_fileoff_t		max_off;
	xfs_extlen_t		scan_blocks;
	xfs_agnumber_t		agno = sc->sa.pag->pag_agno;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	unsigned int		invalidated = 0;
	int			error;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return 0;

	/*
	 * Buffers for file blocks can span multiple contiguous mappings.  This
	 * means that for each block in the mapping, there could exist an
	 * xfs_buf indexed by that block with any length up to the maximum
	 * buffer size (remote xattr values) or to the next hole in the fork.
	 * To set up our binval scan, first we need to figure out the location
	 * of the next hole.
	 */
	off = imap->br_startoff + imap->br_blockcount;
	max_off = off + xfs_attr3_max_rmt_blocks(mp);
	while (off < max_off) {
		struct xfs_bmbt_irec	hmap;
		int			nhmaps = 1;

		error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
				&nhmaps, bmap_flags);
		if (error)
			return error;
		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		if (!xfs_bmap_is_real_extent(&hmap))
			break;

		off = hmap.br_startoff + hmap.br_blockcount;
	}
	scan_blocks = off - imap->br_startoff;

	trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);

	/*
	 * If there are incore buffers for these blocks, invalidate them.  If
	 * we can't (try)lock the buffer we assume it's owned by someone else
	 * and leave it alone.  The buffer cache cannot detect aliasing, so
	 * employ nested loops to detect incore buffers of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= XFS_AGB_TO_DADDR(mp, agno, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
								scan_blocks),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf		*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			if (xreap_buf_loggable(bp)) {
				xfs_trans_bjoin(sc->tp, bp);
				xfs_trans_binval(sc->tp, bp);
			} else {
				xfs_buf_stale(bp);
				xfs_buf_relse(bp);
			}
			invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * much of the mapping we've seen so far.
			 */
			if (invalidated > XREAP_MAX_BINVAL) {
				imap->br_blockcount = agbno_next - bno;
				goto out;
			}
		}

		bno++;
		scan_blocks--;
	}

out:
	trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
	return 0;
}

/*
 * Dispose of as much of the beginning of this file fork mapping as possible.
 * The number of blocks disposed of is returned in @imap->br_blockcount.
 */
STATIC int
xrep_reap_bmapi_iter(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap,
	bool				crosslinked)
{
	int				error;

	if (crosslinked) {
		/*
		 * If there are other rmappings, this block is cross linked and
		 * must not be freed.  Remove the reverse mapping, leave the
		 * buffer cache in its possibly confused state, and move on.
		 * We don't want to risk discarding valid data buffers from
		 * anybody else who thinks they own the block, even though that
		 * runs the risk of stale buffer warnings in the future.
		 */
		trace_xreap_dispose_unmap_extent(sc->sa.pag,
				XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
				imap->br_blockcount);

		/*
		 * Schedule removal of the mapping from the fork.  We use
		 * deferred log intents in this function to control the exact
		 * sequence of metadata updates.
		 */
		xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
		xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
				-(int64_t)imap->br_blockcount);
		xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
		return 0;
	}

	/*
	 * If the block is not crosslinked, we can invalidate all the incore
	 * buffers for the extent, and then free the extent.  This is a bit of
	 * a mess since we don't detect discontiguous buffers that are indexed
	 * by a block starting before the first block of the extent but overlap
	 * anyway.
	 */
	trace_xreap_dispose_free_extent(sc->sa.pag,
			XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
			imap->br_blockcount);

	/*
	 * Invalidate as many buffers as we can, starting at the beginning of
	 * this mapping.  If this function sets blockcount to zero, the
	 * transaction is full of logged buffer invalidations, so we need to
	 * return early so that we can roll and retry.
	 */
	error = xreap_bmapi_binval(sc, ip, whichfork, imap);
	if (error || imap->br_blockcount == 0)
		return error;

	/*
	 * Schedule removal of the mapping from the fork.  We use deferred log
	 * intents in this function to control the exact sequence of metadata
	 * updates.
	 */
	xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
	xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
			-(int64_t)imap->br_blockcount);
	return xfs_free_extent_later(sc->tp, imap->br_startblock,
			imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true);
}

/*
 * Dispose of as much of this file extent as we can.  Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap)
{
	xfs_agnumber_t			agno;
	bool				crosslinked;
	int				error;

	ASSERT(sc->sa.pag == NULL);

	trace_xreap_ifork_extent(sc, ip, whichfork, imap);

	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	/*
	 * Decide the fate of the blocks at the beginning of the mapping, then
	 * update the mapping to use it with the unmap calls.
	 */
	error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
	if (error)
		goto out_agf;

	error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
	if (error)
		goto out_agf;

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of each block mapped to the given fork of the given file.  Callers
 * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip.  The fork
 * must not have any delalloc reservations.
 */
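/*
 * Example call (a sketch; this assumes the caller already holds ILOCK_EXCL
 * as required above, and is reaping the attr fork of the scrub temporary
 * file):
 *
 *	error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
 */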
int
xrep_reap_ifork(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork)
{
	xfs_fileoff_t		off = 0;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(ip == sc->ip || ip == sc->tempip);
	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));

	while (off < XFS_MAX_FILEOFF) {
		struct xfs_bmbt_irec	imap;
		int			nimaps = 1;

		/* Read the next extent, skip past holes and delalloc. */
		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
				&nimaps, bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		/*
		 * If this is a real space mapping, reap as much of it as we
		 * can in a single transaction.
		 */
		if (xfs_bmap_is_real_extent(&imap)) {
			error = xreap_ifork_extent(sc, ip, whichfork, &imap);
			if (error)
				return error;

			error = xfs_defer_finish(&sc->tp);
			if (error)
				return error;
		}

		off = imap.br_startoff + imap.br_blockcount;
	}

	return 0;
}