1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_defer.h"
13#include "xfs_btree.h"
14#include "xfs_bit.h"
15#include "xfs_log_format.h"
16#include "xfs_trans.h"
17#include "xfs_sb.h"
18#include "xfs_inode.h"
19#include "xfs_icache.h"
20#include "xfs_inode_buf.h"
21#include "xfs_inode_fork.h"
22#include "xfs_ialloc.h"
23#include "xfs_da_format.h"
24#include "xfs_reflink.h"
25#include "xfs_alloc.h"
26#include "xfs_rmap.h"
27#include "xfs_rmap_btree.h"
28#include "xfs_bmap.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_bmap_util.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h"
33#include "xfs_quota_defs.h"
34#include "xfs_quota.h"
35#include "xfs_ag.h"
36#include "xfs_rtbitmap.h"
37#include "xfs_attr_leaf.h"
38#include "xfs_log_priv.h"
39#include "xfs_health.h"
40#include "xfs_symlink_remote.h"
41#include "scrub/xfs_scrub.h"
42#include "scrub/scrub.h"
43#include "scrub/common.h"
44#include "scrub/btree.h"
45#include "scrub/trace.h"
46#include "scrub/repair.h"
47#include "scrub/iscan.h"
48#include "scrub/readdir.h"
49#include "scrub/tempfile.h"
50
51/*
52 * Inode Record Repair
53 * ===================
54 *
55 * Roughly speaking, inode problems can be classified based on whether or not
56 * they trip the dinode verifiers.  If those trip, then we won't be able to
57 * xfs_iget ourselves the inode.
58 *
 * Therefore, the xrep_dinode_* functions fix anything that would cause the
 * inode buffer verifier or the dinode verifier to fail.  The xrep_inode_*
 * functions fix things on live incore inodes.  The inode repair functions make
 * decisions with security and usability implications when reviving a file:
63 *
64 * - Files with zero di_mode or a garbage di_mode are converted to regular file
65 *   that only root can read.  This file may not actually contain user data,
66 *   if the file was not previously a regular file.  Setuid and setgid bits
67 *   are cleared.
68 *
69 * - Zero-size directories can be truncated to look empty.  It is necessary to
70 *   run the bmapbtd and directory repair functions to fully rebuild the
71 *   directory.
72 *
73 * - Zero-size symbolic link targets can be truncated to '?'.  It is necessary
74 *   to run the bmapbtd and symlink repair functions to salvage the symlink.
75 *
76 * - Invalid extent size hints will be removed.
77 *
78 * - Quotacheck will be scheduled if we repaired an inode that was so badly
79 *   damaged that the ondisk inode had to be rebuilt.
80 *
81 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
82 *   Setuid and setgid bits are cleared.
83 *
84 * - Data and attr forks are reset to extents format with zero extents if the
85 *   fork data is inconsistent.  It is necessary to run the bmapbtd or bmapbta
86 *   repair functions to recover the space mapping.
87 *
88 * - ACLs will not be recovered if the attr fork is zapped or the extended
89 *   attribute structure itself requires salvaging.
90 *
91 * - If the attr fork is zapped, the user and group ids are reset to root and
92 *   the setuid and setgid bits are removed.
93 */
94
/*
 * All the information we need to repair the ondisk inode if we can't iget the
 * incore inode.  We don't allocate this buffer unless we're going to perform
 * a repair to the ondisk inode cluster buffer.
 *
 * The block and extent counters below are accumulated from reverse mapping
 * records whose owner is the inode being repaired.
 */
struct xrep_inode {
	/* Inode mapping that we saved from the initial lookup attempt. */
	struct xfs_imap		imap;

	/* Scrub context this repair state is attached to. */
	struct xfs_scrub	*sc;

	/* Blocks in use on the data device by data extents or bmbt blocks. */
	xfs_rfsblock_t		data_blocks;

	/* Blocks in use on the rt device. */
	xfs_rfsblock_t		rt_blocks;

	/* Blocks in use by the attr fork. */
	xfs_rfsblock_t		attr_blocks;

	/* Number of data device extents for the data fork. */
	xfs_extnum_t		data_extents;

	/*
	 * Number of realtime device extents for the data fork.  If
	 * data_extents and rt_extents indicate that the data fork has extents
	 * on both devices, we'll just back away slowly.
	 */
	xfs_extnum_t		rt_extents;

	/* Number of (data device) extents for the attr fork. */
	xfs_aextnum_t		attr_extents;

	/* Sick state to set after zapping parts of the inode. */
	unsigned int		ino_sick_mask;

	/* Must we remove all access from this file? */
	bool			zap_acls;

	/* Inode scanner to see if we can find the ftype from dirents */
	struct xchk_iscan	ftype_iscan;
	/*
	 * File type (XFS_DIR3_FT_*) claimed by the dirents that point at this
	 * inode; XFS_DIR3_FT_UNKNOWN until the scan finds such a dirent.
	 */
	uint8_t			alleged_ftype;
};
138
139/*
140 * Setup function for inode repair.  @imap contains the ondisk inode mapping
141 * information so that we can correct the ondisk inode cluster buffer if
142 * necessary to make iget work.
143 */
144int
145xrep_setup_inode(
146	struct xfs_scrub	*sc,
147	const struct xfs_imap	*imap)
148{
149	struct xrep_inode	*ri;
150
151	sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
152	if (!sc->buf)
153		return -ENOMEM;
154
155	ri = sc->buf;
156	memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
157	ri->sc = sc;
158	return 0;
159}
160
161/*
162 * Make sure this ondisk inode can pass the inode buffer verifier.  This is
163 * not the same as the dinode verifier.
164 */
165STATIC void
166xrep_dinode_buf_core(
167	struct xfs_scrub	*sc,
168	struct xfs_buf		*bp,
169	unsigned int		ioffset)
170{
171	struct xfs_dinode	*dip = xfs_buf_offset(bp, ioffset);
172	struct xfs_trans	*tp = sc->tp;
173	struct xfs_mount	*mp = sc->mp;
174	xfs_agino_t		agino;
175	bool			crc_ok = false;
176	bool			magic_ok = false;
177	bool			unlinked_ok = false;
178
179	agino = be32_to_cpu(dip->di_next_unlinked);
180
181	if (xfs_verify_agino_or_null(bp->b_pag, agino))
182		unlinked_ok = true;
183
184	if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
185	    xfs_dinode_good_version(mp, dip->di_version))
186		magic_ok = true;
187
188	if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
189			XFS_DINODE_CRC_OFF))
190		crc_ok = true;
191
192	if (magic_ok && unlinked_ok && crc_ok)
193		return;
194
195	if (!magic_ok) {
196		dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
197		dip->di_version = 3;
198	}
199	if (!unlinked_ok)
200		dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
201	xfs_dinode_calc_crc(mp, dip);
202	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
203	xfs_trans_log_buf(tp, bp, ioffset,
204				  ioffset + sizeof(struct xfs_dinode) - 1);
205}
206
207/* Make sure this inode cluster buffer can pass the inode buffer verifier. */
208STATIC void
209xrep_dinode_buf(
210	struct xfs_scrub	*sc,
211	struct xfs_buf		*bp)
212{
213	struct xfs_mount	*mp = sc->mp;
214	int			i;
215	int			ni;
216
217	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
218	for (i = 0; i < ni; i++)
219		xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
220}
221
222/* Reinitialize things that never change in an inode. */
223STATIC void
224xrep_dinode_header(
225	struct xfs_scrub	*sc,
226	struct xfs_dinode	*dip)
227{
228	trace_xrep_dinode_header(sc, dip);
229
230	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
231	if (!xfs_dinode_good_version(sc->mp, dip->di_version))
232		dip->di_version = 3;
233	dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
234	uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
235	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
236}
237
238/*
239 * If this directory entry points to the scrub target inode, then the directory
240 * we're scanning is the parent of the scrub target inode.
241 */
242STATIC int
243xrep_dinode_findmode_dirent(
244	struct xfs_scrub		*sc,
245	struct xfs_inode		*dp,
246	xfs_dir2_dataptr_t		dapos,
247	const struct xfs_name		*name,
248	xfs_ino_t			ino,
249	void				*priv)
250{
251	struct xrep_inode		*ri = priv;
252	int				error = 0;
253
254	if (xchk_should_terminate(ri->sc, &error))
255		return error;
256
257	if (ino != sc->sm->sm_ino)
258		return 0;
259
260	/* Ignore garbage directory entry names. */
261	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
262		return -EFSCORRUPTED;
263
264	/* Don't pick up dot or dotdot entries; we only want child dirents. */
265	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
266	    xfs_dir2_samename(name, &xfs_name_dot))
267		return 0;
268
269	/*
270	 * Uhoh, more than one parent for this inode and they don't agree on
271	 * the file type?
272	 */
273	if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
274	    ri->alleged_ftype != name->type) {
275		trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
276				ri->alleged_ftype);
277		return -EFSCORRUPTED;
278	}
279
280	/* We found a potential parent; remember the ftype. */
281	trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
282	ri->alleged_ftype = name->type;
283	return 0;
284}
285
286/* Try to lock a directory, or wait a jiffy. */
287static inline int
288xrep_dinode_ilock_nowait(
289	struct xfs_inode	*dp,
290	unsigned int		lock_mode)
291{
292	if (xfs_ilock_nowait(dp, lock_mode))
293		return true;
294
295	schedule_timeout_killable(1);
296	return false;
297}
298
299/*
300 * Try to lock a directory to look for ftype hints.  Since we already hold the
301 * AGI buffer, we cannot block waiting for the ILOCK because rename can take
302 * the ILOCK and then try to lock AGIs.
303 */
304STATIC int
305xrep_dinode_trylock_directory(
306	struct xrep_inode	*ri,
307	struct xfs_inode	*dp,
308	unsigned int		*lock_modep)
309{
310	unsigned long		deadline = jiffies + msecs_to_jiffies(30000);
311	unsigned int		lock_mode;
312	int			error = 0;
313
314	do {
315		if (xchk_should_terminate(ri->sc, &error))
316			return error;
317
318		if (xfs_need_iread_extents(&dp->i_df))
319			lock_mode = XFS_ILOCK_EXCL;
320		else
321			lock_mode = XFS_ILOCK_SHARED;
322
323		if (xrep_dinode_ilock_nowait(dp, lock_mode)) {
324			*lock_modep = lock_mode;
325			return 0;
326		}
327	} while (!time_is_before_jiffies(deadline));
328	return -EBUSY;
329}
330
331/*
332 * If this is a directory, walk the dirents looking for any that point to the
333 * scrub target inode.
334 */
335STATIC int
336xrep_dinode_findmode_walk_directory(
337	struct xrep_inode	*ri,
338	struct xfs_inode	*dp)
339{
340	struct xfs_scrub	*sc = ri->sc;
341	unsigned int		lock_mode;
342	int			error = 0;
343
344	/* Ignore temporary repair directories. */
345	if (xrep_is_tempfile(dp))
346		return 0;
347
348	/*
349	 * Scan the directory to see if there it contains an entry pointing to
350	 * the directory that we are repairing.
351	 */
352	error = xrep_dinode_trylock_directory(ri, dp, &lock_mode);
353	if (error)
354		return error;
355
356	/*
357	 * If this directory is known to be sick, we cannot scan it reliably
358	 * and must abort.
359	 */
360	if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
361				       XFS_SICK_INO_BMBTD |
362				       XFS_SICK_INO_DIR)) {
363		error = -EFSCORRUPTED;
364		goto out_unlock;
365	}
366
367	/*
368	 * We cannot complete our parent pointer scan if a directory looks as
369	 * though it has been zapped by the inode record repair code.
370	 */
371	if (xchk_dir_looks_zapped(dp)) {
372		error = -EBUSY;
373		goto out_unlock;
374	}
375
376	error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
377	if (error)
378		goto out_unlock;
379
380out_unlock:
381	xfs_iunlock(dp, lock_mode);
382	return error;
383}
384
385/*
386 * Try to find the mode of the inode being repaired by looking for directories
387 * that point down to this file.
388 */
389STATIC int
390xrep_dinode_find_mode(
391	struct xrep_inode	*ri,
392	uint16_t		*mode)
393{
394	struct xfs_scrub	*sc = ri->sc;
395	struct xfs_inode	*dp;
396	int			error;
397
398	/* No ftype means we have no other metadata to consult. */
399	if (!xfs_has_ftype(sc->mp)) {
400		*mode = S_IFREG;
401		return 0;
402	}
403
404	/*
405	 * Scan all directories for parents that might point down to this
406	 * inode.  Skip the inode being repaired during the scan since it
407	 * cannot be its own parent.  Note that we still hold the AGI locked
408	 * so there's a real possibility that _iscan_iter can return EBUSY.
409	 */
410	xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
411	xchk_iscan_set_agi_trylock(&ri->ftype_iscan);
412	ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
413	ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
414	while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
415		if (S_ISDIR(VFS_I(dp)->i_mode))
416			error = xrep_dinode_findmode_walk_directory(ri, dp);
417		xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
418		xchk_irele(sc, dp);
419		if (error < 0)
420			break;
421		if (xchk_should_terminate(sc, &error))
422			break;
423	}
424	xchk_iscan_iter_finish(&ri->ftype_iscan);
425	xchk_iscan_teardown(&ri->ftype_iscan);
426
427	if (error == -EBUSY) {
428		if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
429			/*
430			 * If we got an EBUSY after finding at least one
431			 * dirent, that means the scan found an inode on the
432			 * inactivation list and could not open it.  Accept the
433			 * alleged ftype and install a new mode below.
434			 */
435			error = 0;
436		} else if (!(sc->flags & XCHK_TRY_HARDER)) {
437			/*
438			 * Otherwise, retry the operation one time to see if
439			 * the reason for the delay is an inode from the same
440			 * cluster buffer waiting on the inactivation list.
441			 */
442			error = -EDEADLOCK;
443		}
444	}
445	if (error)
446		return error;
447
448	/*
449	 * Convert the discovered ftype into the file mode.  If all else fails,
450	 * return S_IFREG.
451	 */
452	switch (ri->alleged_ftype) {
453	case XFS_DIR3_FT_DIR:
454		*mode = S_IFDIR;
455		break;
456	case XFS_DIR3_FT_WHT:
457	case XFS_DIR3_FT_CHRDEV:
458		*mode = S_IFCHR;
459		break;
460	case XFS_DIR3_FT_BLKDEV:
461		*mode = S_IFBLK;
462		break;
463	case XFS_DIR3_FT_FIFO:
464		*mode = S_IFIFO;
465		break;
466	case XFS_DIR3_FT_SOCK:
467		*mode = S_IFSOCK;
468		break;
469	case XFS_DIR3_FT_SYMLINK:
470		*mode = S_IFLNK;
471		break;
472	default:
473		*mode = S_IFREG;
474		break;
475	}
476	return 0;
477}
478
479/* Turn di_mode into /something/ recognizable.  Returns true if we succeed. */
480STATIC int
481xrep_dinode_mode(
482	struct xrep_inode	*ri,
483	struct xfs_dinode	*dip)
484{
485	struct xfs_scrub	*sc = ri->sc;
486	uint16_t		mode = be16_to_cpu(dip->di_mode);
487	int			error;
488
489	trace_xrep_dinode_mode(sc, dip);
490
491	if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
492		return 0;
493
494	/* Try to fix the mode.  If we cannot, then leave everything alone. */
495	error = xrep_dinode_find_mode(ri, &mode);
496	switch (error) {
497	case -EINTR:
498	case -EBUSY:
499	case -EDEADLOCK:
500		/* temporary failure or fatal signal */
501		return error;
502	case 0:
503		/* found mode */
504		break;
505	default:
506		/* some other error, assume S_IFREG */
507		mode = S_IFREG;
508		break;
509	}
510
511	/* bad mode, so we set it to a file that only root can read */
512	dip->di_mode = cpu_to_be16(mode);
513	dip->di_uid = 0;
514	dip->di_gid = 0;
515	ri->zap_acls = true;
516	return 0;
517}
518
519/* Fix unused link count fields having nonzero values. */
520STATIC void
521xrep_dinode_nlinks(
522	struct xfs_dinode	*dip)
523{
524	if (dip->di_version > 1)
525		dip->di_onlink = 0;
526	else
527		dip->di_nlink = 0;
528}
529
530/* Fix any conflicting flags that the verifiers complain about. */
531STATIC void
532xrep_dinode_flags(
533	struct xfs_scrub	*sc,
534	struct xfs_dinode	*dip,
535	bool			isrt)
536{
537	struct xfs_mount	*mp = sc->mp;
538	uint64_t		flags2 = be64_to_cpu(dip->di_flags2);
539	uint16_t		flags = be16_to_cpu(dip->di_flags);
540	uint16_t		mode = be16_to_cpu(dip->di_mode);
541
542	trace_xrep_dinode_flags(sc, dip);
543
544	if (isrt)
545		flags |= XFS_DIFLAG_REALTIME;
546	else
547		flags &= ~XFS_DIFLAG_REALTIME;
548
549	/*
550	 * For regular files on a reflink filesystem, set the REFLINK flag to
551	 * protect shared extents.  A later stage will actually check those
552	 * extents and clear the flag if possible.
553	 */
554	if (xfs_has_reflink(mp) && S_ISREG(mode))
555		flags2 |= XFS_DIFLAG2_REFLINK;
556	else
557		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
558	if (flags & XFS_DIFLAG_REALTIME)
559		flags2 &= ~XFS_DIFLAG2_REFLINK;
560	if (!xfs_has_bigtime(mp))
561		flags2 &= ~XFS_DIFLAG2_BIGTIME;
562	if (!xfs_has_large_extent_counts(mp))
563		flags2 &= ~XFS_DIFLAG2_NREXT64;
564	if (flags2 & XFS_DIFLAG2_NREXT64)
565		dip->di_nrext64_pad = 0;
566	else if (dip->di_version >= 3)
567		dip->di_v3_pad = 0;
568	dip->di_flags = cpu_to_be16(flags);
569	dip->di_flags2 = cpu_to_be64(flags2);
570}
571
572/*
573 * Blow out symlink; now it points nowhere.  We don't have to worry about
574 * incore state because this inode is failing the verifiers.
575 */
576STATIC void
577xrep_dinode_zap_symlink(
578	struct xrep_inode	*ri,
579	struct xfs_dinode	*dip)
580{
581	struct xfs_scrub	*sc = ri->sc;
582	char			*p;
583
584	trace_xrep_dinode_zap_symlink(sc, dip);
585
586	dip->di_format = XFS_DINODE_FMT_LOCAL;
587	dip->di_size = cpu_to_be64(1);
588	p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
589	*p = '?';
590	ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
591}
592
593/*
594 * Blow out dir, make the parent point to the root.  In the future repair will
595 * reconstruct this directory for us.  Note that there's no in-core directory
596 * inode because the sf verifier tripped, so we don't have to worry about the
597 * dentry cache.
598 */
599STATIC void
600xrep_dinode_zap_dir(
601	struct xrep_inode	*ri,
602	struct xfs_dinode	*dip)
603{
604	struct xfs_scrub	*sc = ri->sc;
605	struct xfs_mount	*mp = sc->mp;
606	struct xfs_dir2_sf_hdr	*sfp;
607	int			i8count;
608
609	trace_xrep_dinode_zap_dir(sc, dip);
610
611	dip->di_format = XFS_DINODE_FMT_LOCAL;
612	i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
613	sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
614	sfp->count = 0;
615	sfp->i8count = i8count;
616	xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
617	dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
618	ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
619}
620
621/* Make sure we don't have a garbage file size. */
622STATIC void
623xrep_dinode_size(
624	struct xrep_inode	*ri,
625	struct xfs_dinode	*dip)
626{
627	struct xfs_scrub	*sc = ri->sc;
628	uint64_t		size = be64_to_cpu(dip->di_size);
629	uint16_t		mode = be16_to_cpu(dip->di_mode);
630
631	trace_xrep_dinode_size(sc, dip);
632
633	switch (mode & S_IFMT) {
634	case S_IFIFO:
635	case S_IFCHR:
636	case S_IFBLK:
637	case S_IFSOCK:
638		/* di_size can't be nonzero for special files */
639		dip->di_size = 0;
640		break;
641	case S_IFREG:
642		/* Regular files can't be larger than 2^63-1 bytes. */
643		dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
644		break;
645	case S_IFLNK:
646		/*
647		 * Truncate ridiculously oversized symlinks.  If the size is
648		 * zero, reset it to point to the current directory.  Both of
649		 * these conditions trigger dinode verifier errors, so there
650		 * is no in-core state to reset.
651		 */
652		if (size > XFS_SYMLINK_MAXLEN)
653			dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
654		else if (size == 0)
655			xrep_dinode_zap_symlink(ri, dip);
656		break;
657	case S_IFDIR:
658		/*
659		 * Directories can't have a size larger than 32G.  If the size
660		 * is zero, reset it to an empty directory.  Both of these
661		 * conditions trigger dinode verifier errors, so there is no
662		 * in-core state to reset.
663		 */
664		if (size > XFS_DIR2_SPACE_SIZE)
665			dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
666		else if (size == 0)
667			xrep_dinode_zap_dir(ri, dip);
668		break;
669	}
670}
671
672/* Fix extent size hints. */
673STATIC void
674xrep_dinode_extsize_hints(
675	struct xfs_scrub	*sc,
676	struct xfs_dinode	*dip)
677{
678	struct xfs_mount	*mp = sc->mp;
679	uint64_t		flags2 = be64_to_cpu(dip->di_flags2);
680	uint16_t		flags = be16_to_cpu(dip->di_flags);
681	uint16_t		mode = be16_to_cpu(dip->di_mode);
682
683	xfs_failaddr_t		fa;
684
685	trace_xrep_dinode_extsize_hints(sc, dip);
686
687	fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
688			mode, flags);
689	if (fa) {
690		dip->di_extsize = 0;
691		dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
692					      XFS_DIFLAG_EXTSZINHERIT);
693	}
694
695	if (dip->di_version < 3)
696		return;
697
698	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
699			mode, flags, flags2);
700	if (fa) {
701		dip->di_cowextsize = 0;
702		dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
703	}
704}
705
706/* Count extents and blocks for an inode given an rmap. */
707STATIC int
708xrep_dinode_walk_rmap(
709	struct xfs_btree_cur		*cur,
710	const struct xfs_rmap_irec	*rec,
711	void				*priv)
712{
713	struct xrep_inode		*ri = priv;
714	int				error = 0;
715
716	if (xchk_should_terminate(ri->sc, &error))
717		return error;
718
719	/* We only care about this inode. */
720	if (rec->rm_owner != ri->sc->sm->sm_ino)
721		return 0;
722
723	if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
724		ri->attr_blocks += rec->rm_blockcount;
725		if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
726			ri->attr_extents++;
727
728		return 0;
729	}
730
731	ri->data_blocks += rec->rm_blockcount;
732	if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
733		ri->data_extents++;
734
735	return 0;
736}
737
738/* Count extents and blocks for an inode from all AG rmap data. */
739STATIC int
740xrep_dinode_count_ag_rmaps(
741	struct xrep_inode	*ri,
742	struct xfs_perag	*pag)
743{
744	struct xfs_btree_cur	*cur;
745	struct xfs_buf		*agf;
746	int			error;
747
748	error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
749	if (error)
750		return error;
751
752	cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
753	error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
754	xfs_btree_del_cursor(cur, error);
755	xfs_trans_brelse(ri->sc->tp, agf);
756	return error;
757}
758
759/* Count extents and blocks for a given inode from all rmap data. */
760STATIC int
761xrep_dinode_count_rmaps(
762	struct xrep_inode	*ri)
763{
764	struct xfs_perag	*pag;
765	xfs_agnumber_t		agno;
766	int			error;
767
768	if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
769		return -EOPNOTSUPP;
770
771	for_each_perag(ri->sc->mp, agno, pag) {
772		error = xrep_dinode_count_ag_rmaps(ri, pag);
773		if (error) {
774			xfs_perag_rele(pag);
775			return error;
776		}
777	}
778
779	/* Can't have extents on both the rt and the data device. */
780	if (ri->data_extents && ri->rt_extents)
781		return -EFSCORRUPTED;
782
783	trace_xrep_dinode_count_rmaps(ri->sc,
784			ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
785			ri->data_extents, ri->rt_extents, ri->attr_extents);
786	return 0;
787}
788
789/* Return true if this extents-format ifork looks like garbage. */
790STATIC bool
791xrep_dinode_bad_extents_fork(
792	struct xfs_scrub	*sc,
793	struct xfs_dinode	*dip,
794	unsigned int		dfork_size,
795	int			whichfork)
796{
797	struct xfs_bmbt_irec	new;
798	struct xfs_bmbt_rec	*dp;
799	xfs_extnum_t		nex;
800	bool			isrt;
801	unsigned int		i;
802
803	nex = xfs_dfork_nextents(dip, whichfork);
804	if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
805		return true;
806
807	dp = XFS_DFORK_PTR(dip, whichfork);
808
809	isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
810	for (i = 0; i < nex; i++, dp++) {
811		xfs_failaddr_t	fa;
812
813		xfs_bmbt_disk_get_all(dp, &new);
814		fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
815				&new);
816		if (fa)
817			return true;
818	}
819
820	return false;
821}
822
823/* Return true if this btree-format ifork looks like garbage. */
824STATIC bool
825xrep_dinode_bad_bmbt_fork(
826	struct xfs_scrub	*sc,
827	struct xfs_dinode	*dip,
828	unsigned int		dfork_size,
829	int			whichfork)
830{
831	struct xfs_bmdr_block	*dfp;
832	xfs_extnum_t		nex;
833	unsigned int		i;
834	unsigned int		dmxr;
835	unsigned int		nrecs;
836	unsigned int		level;
837
838	nex = xfs_dfork_nextents(dip, whichfork);
839	if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
840		return true;
841
842	if (dfork_size < sizeof(struct xfs_bmdr_block))
843		return true;
844
845	dfp = XFS_DFORK_PTR(dip, whichfork);
846	nrecs = be16_to_cpu(dfp->bb_numrecs);
847	level = be16_to_cpu(dfp->bb_level);
848
849	if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
850		return true;
851	if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
852		return true;
853
854	dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
855	for (i = 1; i <= nrecs; i++) {
856		struct xfs_bmbt_key	*fkp;
857		xfs_bmbt_ptr_t		*fpp;
858		xfs_fileoff_t		fileoff;
859		xfs_fsblock_t		fsbno;
860
861		fkp = XFS_BMDR_KEY_ADDR(dfp, i);
862		fileoff = be64_to_cpu(fkp->br_startoff);
863		if (!xfs_verify_fileoff(sc->mp, fileoff))
864			return true;
865
866		fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
867		fsbno = be64_to_cpu(*fpp);
868		if (!xfs_verify_fsbno(sc->mp, fsbno))
869			return true;
870	}
871
872	return false;
873}
874
/*
 * Check the data fork for things that will fail the ifork verifiers or the
 * ifork formatters.  @mode is the file mode to check the fork against.
 * Returns true if the data fork is bad, or false if it passed all of the
 * checks here.
 */
STATIC bool
xrep_dinode_check_dfork(
	struct xfs_scrub	*sc,
	struct xfs_dinode	*dip,
	uint16_t		mode)
{
	void			*dfork_ptr;
	int64_t			data_size;
	unsigned int		fmt;
	unsigned int		dfork_size;

	/*
	 * Verifier functions take signed int64_t, so check for bogus negative
	 * values first.
	 */
	data_size = be64_to_cpu(dip->di_size);
	if (data_size < 0)
		return true;

	/* First pass: is this fork format allowed for this type of file? */
	fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		/* Special files must be in dev format. */
		if (fmt != XFS_DINODE_FMT_DEV)
			return true;
		break;
	case S_IFREG:
		/* Regular files cannot be local format... */
		if (fmt == XFS_DINODE_FMT_LOCAL)
			return true;
		/* ...but are otherwise held to the symlink/directory rules. */
		fallthrough;
	case S_IFLNK:
	case S_IFDIR:
		switch (fmt) {
		case XFS_DINODE_FMT_LOCAL:
		case XFS_DINODE_FMT_EXTENTS:
		case XFS_DINODE_FMT_BTREE:
			break;
		default:
			return true;
		}
		break;
	default:
		/* Unrecognized file type. */
		return true;
	}

	dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
	dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);

	/* Second pass: do the fork contents make sense for this format? */
	switch (fmt) {
	case XFS_DINODE_FMT_DEV:
		break;
	case XFS_DINODE_FMT_LOCAL:
		/* dir/symlink structure cannot be larger than the fork */
		if (data_size > dfork_size)
			return true;
		/* directory structure must pass verification. */
		if (S_ISDIR(mode) &&
		    xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
			return true;
		/* symlink structure must pass verification. */
		if (S_ISLNK(mode) &&
		    xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
			return true;
		break;
	case XFS_DINODE_FMT_EXTENTS:
		if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
				XFS_DATA_FORK))
			return true;
		break;
	case XFS_DINODE_FMT_BTREE:
		if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
				XFS_DATA_FORK))
			return true;
		break;
	default:
		return true;
	}

	return false;
}
961
962static void
963xrep_dinode_set_data_nextents(
964	struct xfs_dinode	*dip,
965	xfs_extnum_t		nextents)
966{
967	if (xfs_dinode_has_large_extent_counts(dip))
968		dip->di_big_nextents = cpu_to_be64(nextents);
969	else
970		dip->di_nextents = cpu_to_be32(nextents);
971}
972
973static void
974xrep_dinode_set_attr_nextents(
975	struct xfs_dinode	*dip,
976	xfs_extnum_t		nextents)
977{
978	if (xfs_dinode_has_large_extent_counts(dip))
979		dip->di_big_anextents = cpu_to_be32(nextents);
980	else
981		dip->di_anextents = cpu_to_be16(nextents);
982}
983
984/* Reset the data fork to something sane. */
985STATIC void
986xrep_dinode_zap_dfork(
987	struct xrep_inode	*ri,
988	struct xfs_dinode	*dip,
989	uint16_t		mode)
990{
991	struct xfs_scrub	*sc = ri->sc;
992
993	trace_xrep_dinode_zap_dfork(sc, dip);
994
995	ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
996
997	xrep_dinode_set_data_nextents(dip, 0);
998	ri->data_blocks = 0;
999	ri->rt_blocks = 0;
1000
1001	/* Special files always get reset to DEV */
1002	switch (mode & S_IFMT) {
1003	case S_IFIFO:
1004	case S_IFCHR:
1005	case S_IFBLK:
1006	case S_IFSOCK:
1007		dip->di_format = XFS_DINODE_FMT_DEV;
1008		dip->di_size = 0;
1009		return;
1010	}
1011
1012	/*
1013	 * If we have data extents, reset to an empty map and hope the user
1014	 * will run the bmapbtd checker next.
1015	 */
1016	if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
1017		dip->di_format = XFS_DINODE_FMT_EXTENTS;
1018		return;
1019	}
1020
1021	/* Otherwise, reset the local format to the minimum. */
1022	switch (mode & S_IFMT) {
1023	case S_IFLNK:
1024		xrep_dinode_zap_symlink(ri, dip);
1025		break;
1026	case S_IFDIR:
1027		xrep_dinode_zap_dir(ri, dip);
1028		break;
1029	}
1030}
1031
1032/*
1033 * Check the attr fork for things that will fail the ifork verifiers or the
1034 * ifork formatters.
1035 */
1036STATIC bool
1037xrep_dinode_check_afork(
1038	struct xfs_scrub		*sc,
1039	struct xfs_dinode		*dip)
1040{
1041	struct xfs_attr_sf_hdr		*afork_ptr;
1042	size_t				attr_size;
1043	unsigned int			afork_size;
1044
1045	if (XFS_DFORK_BOFF(dip) == 0)
1046		return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
1047		       xfs_dfork_attr_extents(dip) != 0;
1048
1049	afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1050	afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1051
1052	switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
1053	case XFS_DINODE_FMT_LOCAL:
1054		/* Fork has to be large enough to extract the xattr size. */
1055		if (afork_size < sizeof(struct xfs_attr_sf_hdr))
1056			return true;
1057
1058		/* xattr structure cannot be larger than the fork */
1059		attr_size = be16_to_cpu(afork_ptr->totsize);
1060		if (attr_size > afork_size)
1061			return true;
1062
1063		/* xattr structure must pass verification. */
1064		return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
1065	case XFS_DINODE_FMT_EXTENTS:
1066		if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
1067					XFS_ATTR_FORK))
1068			return true;
1069		break;
1070	case XFS_DINODE_FMT_BTREE:
1071		if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
1072					XFS_ATTR_FORK))
1073			return true;
1074		break;
1075	default:
1076		return true;
1077	}
1078
1079	return false;
1080}
1081
1082/*
1083 * Reset the attr fork to empty.  Since the attr fork could have contained
1084 * ACLs, make the file readable only by root.
1085 */
1086STATIC void
1087xrep_dinode_zap_afork(
1088	struct xrep_inode	*ri,
1089	struct xfs_dinode	*dip,
1090	uint16_t		mode)
1091{
1092	struct xfs_scrub	*sc = ri->sc;
1093
1094	trace_xrep_dinode_zap_afork(sc, dip);
1095
1096	ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
1097
1098	dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
1099	xrep_dinode_set_attr_nextents(dip, 0);
1100	ri->attr_blocks = 0;
1101
1102	/*
1103	 * If the data fork is in btree format, removing the attr fork entirely
1104	 * might cause verifier failures if the next level down in the bmbt
1105	 * could now fit in the data fork area.
1106	 */
1107	if (dip->di_format != XFS_DINODE_FMT_BTREE)
1108		dip->di_forkoff = 0;
1109	dip->di_mode = cpu_to_be16(mode & ~0777);
1110	dip->di_uid = 0;
1111	dip->di_gid = 0;
1112}
1113
1114/* Make sure the fork offset is a sensible value. */
1115STATIC void
1116xrep_dinode_ensure_forkoff(
1117	struct xrep_inode	*ri,
1118	struct xfs_dinode	*dip,
1119	uint16_t		mode)
1120{
1121	struct xfs_bmdr_block	*bmdr;
1122	struct xfs_scrub	*sc = ri->sc;
1123	xfs_extnum_t		attr_extents, data_extents;
1124	size_t			bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
1125	unsigned int		lit_sz = XFS_LITINO(sc->mp);
1126	unsigned int		afork_min, dfork_min;
1127
1128	trace_xrep_dinode_ensure_forkoff(sc, dip);
1129
1130	/*
1131	 * Before calling this function, xrep_dinode_core ensured that both
1132	 * forks actually fit inside their respective literal areas.  If this
1133	 * was not the case, the fork was reset to FMT_EXTENTS with zero
1134	 * records.  If the rmapbt scan found attr or data fork blocks, this
1135	 * will be noted in the dinode_stats, and we must leave enough room
1136	 * for the bmap repair code to reconstruct the mapping structure.
1137	 *
1138	 * First, compute the minimum space required for the attr fork.
1139	 */
1140	switch (dip->di_aformat) {
1141	case XFS_DINODE_FMT_LOCAL:
1142		/*
1143		 * If we still have a shortform xattr structure at all, that
1144		 * means the attr fork area was exactly large enough to fit
1145		 * the sf structure.
1146		 */
1147		afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1148		break;
1149	case XFS_DINODE_FMT_EXTENTS:
1150		attr_extents = xfs_dfork_attr_extents(dip);
1151		if (attr_extents) {
1152			/*
1153			 * We must maintain sufficient space to hold the entire
1154			 * extent map array in the data fork.  Note that we
1155			 * previously zapped the fork if it had no chance of
1156			 * fitting in the inode.
1157			 */
1158			afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
1159		} else if (ri->attr_extents > 0) {
1160			/*
1161			 * The attr fork thinks it has zero extents, but we
1162			 * found some xattr extents.  We need to leave enough
1163			 * empty space here so that the incore attr fork will
1164			 * get created (and hence trigger the attr fork bmap
1165			 * repairer).
1166			 */
1167			afork_min = bmdr_minsz;
1168		} else {
1169			/* No extents on disk or found in rmapbt. */
1170			afork_min = 0;
1171		}
1172		break;
1173	case XFS_DINODE_FMT_BTREE:
1174		/* Must have space for btree header and key/pointers. */
1175		bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1176		afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1177		break;
1178	default:
1179		/* We should never see any other formats. */
1180		afork_min = 0;
1181		break;
1182	}
1183
1184	/* Compute the minimum space required for the data fork. */
1185	switch (dip->di_format) {
1186	case XFS_DINODE_FMT_DEV:
1187		dfork_min = sizeof(__be32);
1188		break;
1189	case XFS_DINODE_FMT_UUID:
1190		dfork_min = sizeof(uuid_t);
1191		break;
1192	case XFS_DINODE_FMT_LOCAL:
1193		/*
1194		 * If we still have a shortform data fork at all, that means
1195		 * the data fork area was large enough to fit whatever was in
1196		 * there.
1197		 */
1198		dfork_min = be64_to_cpu(dip->di_size);
1199		break;
1200	case XFS_DINODE_FMT_EXTENTS:
1201		data_extents = xfs_dfork_data_extents(dip);
1202		if (data_extents) {
1203			/*
1204			 * We must maintain sufficient space to hold the entire
1205			 * extent map array in the data fork.  Note that we
1206			 * previously zapped the fork if it had no chance of
1207			 * fitting in the inode.
1208			 */
1209			dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
1210		} else if (ri->data_extents > 0 || ri->rt_extents > 0) {
1211			/*
1212			 * The data fork thinks it has zero extents, but we
1213			 * found some data extents.  We need to leave enough
1214			 * empty space here so that the data fork bmap repair
1215			 * will recover the mappings.
1216			 */
1217			dfork_min = bmdr_minsz;
1218		} else {
1219			/* No extents on disk or found in rmapbt. */
1220			dfork_min = 0;
1221		}
1222		break;
1223	case XFS_DINODE_FMT_BTREE:
1224		/* Must have space for btree header and key/pointers. */
1225		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
1226		dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1227		break;
1228	default:
1229		dfork_min = 0;
1230		break;
1231	}
1232
1233	/*
1234	 * Round all values up to the nearest 8 bytes, because that is the
1235	 * precision of di_forkoff.
1236	 */
1237	afork_min = roundup(afork_min, 8);
1238	dfork_min = roundup(dfork_min, 8);
1239	bmdr_minsz = roundup(bmdr_minsz, 8);
1240
1241	ASSERT(dfork_min <= lit_sz);
1242	ASSERT(afork_min <= lit_sz);
1243
1244	/*
1245	 * If the data fork was zapped and we don't have enough space for the
1246	 * recovery fork, move the attr fork up.
1247	 */
1248	if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
1249	    xfs_dfork_data_extents(dip) == 0 &&
1250	    (ri->data_extents > 0 || ri->rt_extents > 0) &&
1251	    bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
1252		if (bmdr_minsz + afork_min > lit_sz) {
1253			/*
1254			 * The attr for and the stub fork we need to recover
1255			 * the data fork won't both fit.  Zap the attr fork.
1256			 */
1257			xrep_dinode_zap_afork(ri, dip, mode);
1258			afork_min = bmdr_minsz;
1259		} else {
1260			void	*before, *after;
1261
1262			/* Otherwise, just slide the attr fork up. */
1263			before = XFS_DFORK_APTR(dip);
1264			dip->di_forkoff = bmdr_minsz >> 3;
1265			after = XFS_DFORK_APTR(dip);
1266			memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
1267		}
1268	}
1269
1270	/*
1271	 * If the attr fork was zapped and we don't have enough space for the
1272	 * recovery fork, move the attr fork down.
1273	 */
1274	if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
1275	    xfs_dfork_attr_extents(dip) == 0 &&
1276	    ri->attr_extents > 0 &&
1277	    bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
1278		if (dip->di_format == XFS_DINODE_FMT_BTREE) {
1279			/*
1280			 * If the data fork is in btree format then we can't
1281			 * adjust forkoff because that runs the risk of
1282			 * violating the extents/btree format transition rules.
1283			 */
1284		} else if (bmdr_minsz + dfork_min > lit_sz) {
1285			/*
1286			 * If we can't move the attr fork, too bad, we lose the
1287			 * attr fork and leak its blocks.
1288			 */
1289			xrep_dinode_zap_afork(ri, dip, mode);
1290		} else {
1291			/*
1292			 * Otherwise, just slide the attr fork down.  The attr
1293			 * fork is empty, so we don't have any old contents to
1294			 * move here.
1295			 */
1296			dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
1297		}
1298	}
1299}
1300
1301/*
1302 * Zap the data/attr forks if we spot anything that isn't going to pass the
1303 * ifork verifiers or the ifork formatters, because we need to get the inode
1304 * into good enough shape that the higher level repair functions can run.
1305 */
1306STATIC void
1307xrep_dinode_zap_forks(
1308	struct xrep_inode	*ri,
1309	struct xfs_dinode	*dip)
1310{
1311	struct xfs_scrub	*sc = ri->sc;
1312	xfs_extnum_t		data_extents;
1313	xfs_extnum_t		attr_extents;
1314	xfs_filblks_t		nblocks;
1315	uint16_t		mode;
1316	bool			zap_datafork = false;
1317	bool			zap_attrfork = ri->zap_acls;
1318
1319	trace_xrep_dinode_zap_forks(sc, dip);
1320
1321	mode = be16_to_cpu(dip->di_mode);
1322
1323	data_extents = xfs_dfork_data_extents(dip);
1324	attr_extents = xfs_dfork_attr_extents(dip);
1325	nblocks = be64_to_cpu(dip->di_nblocks);
1326
1327	/* Inode counters don't make sense? */
1328	if (data_extents > nblocks)
1329		zap_datafork = true;
1330	if (attr_extents > nblocks)
1331		zap_attrfork = true;
1332	if (data_extents + attr_extents > nblocks)
1333		zap_datafork = zap_attrfork = true;
1334
1335	if (!zap_datafork)
1336		zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
1337	if (!zap_attrfork)
1338		zap_attrfork = xrep_dinode_check_afork(sc, dip);
1339
1340	/* Zap whatever's bad. */
1341	if (zap_attrfork)
1342		xrep_dinode_zap_afork(ri, dip, mode);
1343	if (zap_datafork)
1344		xrep_dinode_zap_dfork(ri, dip, mode);
1345	xrep_dinode_ensure_forkoff(ri, dip, mode);
1346
1347	/*
1348	 * Zero di_nblocks if we don't have any extents at all to satisfy the
1349	 * buffer verifier.
1350	 */
1351	data_extents = xfs_dfork_data_extents(dip);
1352	attr_extents = xfs_dfork_attr_extents(dip);
1353	if (data_extents + attr_extents == 0)
1354		dip->di_nblocks = 0;
1355}
1356
1357/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
1358STATIC int
1359xrep_dinode_core(
1360	struct xrep_inode	*ri)
1361{
1362	struct xfs_scrub	*sc = ri->sc;
1363	struct xfs_buf		*bp;
1364	struct xfs_dinode	*dip;
1365	xfs_ino_t		ino = sc->sm->sm_ino;
1366	int			error;
1367	int			iget_error;
1368
1369	/* Figure out what this inode had mapped in both forks. */
1370	error = xrep_dinode_count_rmaps(ri);
1371	if (error)
1372		return error;
1373
1374	/* Read the inode cluster buffer. */
1375	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
1376			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
1377			NULL);
1378	if (error)
1379		return error;
1380
1381	/* Make sure we can pass the inode buffer verifier. */
1382	xrep_dinode_buf(sc, bp);
1383	bp->b_ops = &xfs_inode_buf_ops;
1384
1385	/* Fix everything the verifier will complain about. */
1386	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
1387	xrep_dinode_header(sc, dip);
1388	iget_error = xrep_dinode_mode(ri, dip);
1389	if (iget_error)
1390		goto write;
1391	xrep_dinode_nlinks(dip);
1392	xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
1393	xrep_dinode_size(ri, dip);
1394	xrep_dinode_extsize_hints(sc, dip);
1395	xrep_dinode_zap_forks(ri, dip);
1396
1397write:
1398	/* Write out the inode. */
1399	trace_xrep_dinode_fixed(sc, dip);
1400	xfs_dinode_calc_crc(sc->mp, dip);
1401	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
1402	xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
1403			ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
1404
1405	/*
1406	 * In theory, we've fixed the ondisk inode record enough that we should
1407	 * be able to load the inode into the cache.  Try to iget that inode
1408	 * now while we hold the AGI and the inode cluster buffer and take the
1409	 * IOLOCK so that we can continue with repairs without anyone else
1410	 * accessing the inode.  If iget fails, we still need to commit the
1411	 * changes.
1412	 */
1413	if (!iget_error)
1414		iget_error = xchk_iget(sc, ino, &sc->ip);
1415	if (!iget_error)
1416		xchk_ilock(sc, XFS_IOLOCK_EXCL);
1417
1418	/*
1419	 * Commit the inode cluster buffer updates and drop the AGI buffer that
1420	 * we've been holding since scrub setup.  From here on out, repairs
1421	 * deal only with the cached inode.
1422	 */
1423	error = xrep_trans_commit(sc);
1424	if (error)
1425		return error;
1426
1427	if (iget_error)
1428		return iget_error;
1429
1430	error = xchk_trans_alloc(sc, 0);
1431	if (error)
1432		return error;
1433
1434	error = xrep_ino_dqattach(sc);
1435	if (error)
1436		return error;
1437
1438	xchk_ilock(sc, XFS_ILOCK_EXCL);
1439	if (ri->ino_sick_mask)
1440		xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
1441	return 0;
1442}
1443
1444/* Fix everything xfs_dinode_verify cares about. */
1445STATIC int
1446xrep_dinode_problems(
1447	struct xrep_inode	*ri)
1448{
1449	struct xfs_scrub	*sc = ri->sc;
1450	int			error;
1451
1452	error = xrep_dinode_core(ri);
1453	if (error)
1454		return error;
1455
1456	/* We had to fix a totally busted inode, schedule quotacheck. */
1457	if (XFS_IS_UQUOTA_ON(sc->mp))
1458		xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1459	if (XFS_IS_GQUOTA_ON(sc->mp))
1460		xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1461	if (XFS_IS_PQUOTA_ON(sc->mp))
1462		xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1463
1464	return 0;
1465}
1466
1467/*
1468 * Fix problems that the verifiers don't care about.  In general these are
1469 * errors that don't cause problems elsewhere in the kernel that we can easily
1470 * detect, so we don't check them all that rigorously.
1471 */
1472
1473/* Make sure block and extent counts are ok. */
1474STATIC int
1475xrep_inode_blockcounts(
1476	struct xfs_scrub	*sc)
1477{
1478	struct xfs_ifork	*ifp;
1479	xfs_filblks_t		count;
1480	xfs_filblks_t		acount;
1481	xfs_extnum_t		nextents;
1482	int			error;
1483
1484	trace_xrep_inode_blockcounts(sc);
1485
1486	/* Set data fork counters from the data fork mappings. */
1487	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
1488			&nextents, &count);
1489	if (error)
1490		return error;
1491	if (xfs_is_reflink_inode(sc->ip)) {
1492		/*
1493		 * data fork blockcount can exceed physical storage if a user
1494		 * reflinks the same block over and over again.
1495		 */
1496		;
1497	} else if (XFS_IS_REALTIME_INODE(sc->ip)) {
1498		if (count >= sc->mp->m_sb.sb_rblocks)
1499			return -EFSCORRUPTED;
1500	} else {
1501		if (count >= sc->mp->m_sb.sb_dblocks)
1502			return -EFSCORRUPTED;
1503	}
1504	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
1505	if (error)
1506		return error;
1507	sc->ip->i_df.if_nextents = nextents;
1508
1509	/* Set attr fork counters from the attr fork mappings. */
1510	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
1511	if (ifp) {
1512		error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
1513				&nextents, &acount);
1514		if (error)
1515			return error;
1516		if (count >= sc->mp->m_sb.sb_dblocks)
1517			return -EFSCORRUPTED;
1518		error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
1519				nextents);
1520		if (error)
1521			return error;
1522		ifp->if_nextents = nextents;
1523	} else {
1524		acount = 0;
1525	}
1526
1527	sc->ip->i_nblocks = count + acount;
1528	return 0;
1529}
1530
1531/* Check for invalid uid/gid/prid. */
1532STATIC void
1533xrep_inode_ids(
1534	struct xfs_scrub	*sc)
1535{
1536	bool			dirty = false;
1537
1538	trace_xrep_inode_ids(sc);
1539
1540	if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
1541		i_uid_write(VFS_I(sc->ip), 0);
1542		dirty = true;
1543		if (XFS_IS_UQUOTA_ON(sc->mp))
1544			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1545	}
1546
1547	if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
1548		i_gid_write(VFS_I(sc->ip), 0);
1549		dirty = true;
1550		if (XFS_IS_GQUOTA_ON(sc->mp))
1551			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1552	}
1553
1554	if (sc->ip->i_projid == -1U) {
1555		sc->ip->i_projid = 0;
1556		dirty = true;
1557		if (XFS_IS_PQUOTA_ON(sc->mp))
1558			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1559	}
1560
1561	/* strip setuid/setgid if we touched any of the ids */
1562	if (dirty)
1563		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
1564}
1565
1566static inline void
1567xrep_clamp_timestamp(
1568	struct xfs_inode	*ip,
1569	struct timespec64	*ts)
1570{
1571	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
1572	*ts = timestamp_truncate(*ts, VFS_I(ip));
1573}
1574
1575/* Nanosecond counters can't have more than 1 billion. */
1576STATIC void
1577xrep_inode_timestamps(
1578	struct xfs_inode	*ip)
1579{
1580	struct timespec64	tstamp;
1581	struct inode		*inode = VFS_I(ip);
1582
1583	tstamp = inode_get_atime(inode);
1584	xrep_clamp_timestamp(ip, &tstamp);
1585	inode_set_atime_to_ts(inode, tstamp);
1586
1587	tstamp = inode_get_mtime(inode);
1588	xrep_clamp_timestamp(ip, &tstamp);
1589	inode_set_mtime_to_ts(inode, tstamp);
1590
1591	tstamp = inode_get_ctime(inode);
1592	xrep_clamp_timestamp(ip, &tstamp);
1593	inode_set_ctime_to_ts(inode, tstamp);
1594
1595	xrep_clamp_timestamp(ip, &ip->i_crtime);
1596}
1597
1598/* Fix inode flags that don't make sense together. */
1599STATIC void
1600xrep_inode_flags(
1601	struct xfs_scrub	*sc)
1602{
1603	uint16_t		mode;
1604
1605	trace_xrep_inode_flags(sc);
1606
1607	mode = VFS_I(sc->ip)->i_mode;
1608
1609	/* Clear junk flags */
1610	if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
1611		sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
1612
1613	/* NEWRTBM only applies to realtime bitmaps */
1614	if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
1615		sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
1616	else
1617		sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
1618
1619	/* These only make sense for directories. */
1620	if (!S_ISDIR(mode))
1621		sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
1622					  XFS_DIFLAG_EXTSZINHERIT |
1623					  XFS_DIFLAG_PROJINHERIT |
1624					  XFS_DIFLAG_NOSYMLINKS);
1625
1626	/* These only make sense for files. */
1627	if (!S_ISREG(mode))
1628		sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
1629					  XFS_DIFLAG_EXTSIZE);
1630
1631	/* These only make sense for non-rt files. */
1632	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1633		sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
1634
1635	/* Immutable and append only?  Drop the append. */
1636	if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
1637	    (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
1638		sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
1639
1640	/* Clear junk flags. */
1641	if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
1642		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
1643
1644	/* No reflink flag unless we support it and it's a file. */
1645	if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
1646		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1647
1648	/* DAX only applies to files and dirs. */
1649	if (!(S_ISREG(mode) || S_ISDIR(mode)))
1650		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
1651
1652	/* No reflink files on the realtime device. */
1653	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1654		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1655}
1656
1657/*
1658 * Fix size problems with block/node format directories.  If we fail to find
1659 * the extent list, just bail out and let the bmapbtd repair functions clean
1660 * up that mess.
1661 */
1662STATIC void
1663xrep_inode_blockdir_size(
1664	struct xfs_scrub	*sc)
1665{
1666	struct xfs_iext_cursor	icur;
1667	struct xfs_bmbt_irec	got;
1668	struct xfs_ifork	*ifp;
1669	xfs_fileoff_t		off;
1670	int			error;
1671
1672	trace_xrep_inode_blockdir_size(sc);
1673
1674	error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
1675	if (error)
1676		return;
1677
1678	/* Find the last block before 32G; this is the dir size. */
1679	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1680	off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
1681	if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
1682		/* zero-extents directory? */
1683		return;
1684	}
1685
1686	off = got.br_startoff + got.br_blockcount;
1687	sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
1688			XFS_FSB_TO_B(sc->mp, off));
1689}
1690
1691/* Fix size problems with short format directories. */
1692STATIC void
1693xrep_inode_sfdir_size(
1694	struct xfs_scrub	*sc)
1695{
1696	struct xfs_ifork	*ifp;
1697
1698	trace_xrep_inode_sfdir_size(sc);
1699
1700	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1701	sc->ip->i_disk_size = ifp->if_bytes;
1702}
1703
1704/*
1705 * Fix any irregularities in a directory inode's size now that we can iterate
1706 * extent maps and access other regular inode data.
1707 */
1708STATIC void
1709xrep_inode_dir_size(
1710	struct xfs_scrub	*sc)
1711{
1712	trace_xrep_inode_dir_size(sc);
1713
1714	switch (sc->ip->i_df.if_format) {
1715	case XFS_DINODE_FMT_EXTENTS:
1716	case XFS_DINODE_FMT_BTREE:
1717		xrep_inode_blockdir_size(sc);
1718		break;
1719	case XFS_DINODE_FMT_LOCAL:
1720		xrep_inode_sfdir_size(sc);
1721		break;
1722	}
1723}
1724
1725/* Fix extent size hint problems. */
1726STATIC void
1727xrep_inode_extsize(
1728	struct xfs_scrub	*sc)
1729{
1730	/* Fix misaligned extent size hints on a directory. */
1731	if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
1732	    (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
1733	    xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
1734		sc->ip->i_extsize = 0;
1735		sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
1736	}
1737}
1738
1739/* Ensure this file has an attr fork if it needs to hold a parent pointer. */
1740STATIC int
1741xrep_inode_pptr(
1742	struct xfs_scrub	*sc)
1743{
1744	struct xfs_mount	*mp = sc->mp;
1745	struct xfs_inode	*ip = sc->ip;
1746	struct inode		*inode = VFS_I(ip);
1747
1748	if (!xfs_has_parent(mp))
1749		return 0;
1750
1751	/*
1752	 * Unlinked inodes that cannot be added to the directory tree will not
1753	 * have a parent pointer.
1754	 */
1755	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
1756		return 0;
1757
1758	/* The root directory doesn't have a parent pointer. */
1759	if (ip == mp->m_rootip)
1760		return 0;
1761
1762	/*
1763	 * Metadata inodes are rooted in the superblock and do not have any
1764	 * parents.
1765	 */
1766	if (xfs_is_metadata_inode(ip))
1767		return 0;
1768
1769	/* Inode already has an attr fork; no further work possible here. */
1770	if (xfs_inode_has_attr_fork(ip))
1771		return 0;
1772
1773	return xfs_bmap_add_attrfork(sc->tp, ip,
1774			sizeof(struct xfs_attr_sf_hdr), true);
1775}
1776
1777/* Fix any irregularities in an inode that the verifiers don't catch. */
1778STATIC int
1779xrep_inode_problems(
1780	struct xfs_scrub	*sc)
1781{
1782	int			error;
1783
1784	error = xrep_inode_blockcounts(sc);
1785	if (error)
1786		return error;
1787	error = xrep_inode_pptr(sc);
1788	if (error)
1789		return error;
1790	xrep_inode_timestamps(sc->ip);
1791	xrep_inode_flags(sc);
1792	xrep_inode_ids(sc);
1793	/*
1794	 * We can now do a better job fixing the size of a directory now that
1795	 * we can scan the data fork extents than we could in xrep_dinode_size.
1796	 */
1797	if (S_ISDIR(VFS_I(sc->ip)->i_mode))
1798		xrep_inode_dir_size(sc);
1799	xrep_inode_extsize(sc);
1800
1801	trace_xrep_inode_fixed(sc);
1802	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1803	return xrep_roll_trans(sc);
1804}
1805
1806/*
1807 * Make sure this inode's unlinked list pointers are consistent with its
1808 * link count.
1809 */
1810STATIC int
1811xrep_inode_unlinked(
1812	struct xfs_scrub	*sc)
1813{
1814	unsigned int		nlink = VFS_I(sc->ip)->i_nlink;
1815	int			error;
1816
1817	/*
1818	 * If this inode is linked from the directory tree and on the unlinked
1819	 * list, remove it from the unlinked list.
1820	 */
1821	if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) {
1822		struct xfs_perag	*pag;
1823		int			error;
1824
1825		pag = xfs_perag_get(sc->mp,
1826				XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
1827		error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
1828		xfs_perag_put(pag);
1829		if (error)
1830			return error;
1831	}
1832
1833	/*
1834	 * If this inode is not linked from the directory tree yet not on the
1835	 * unlinked list, put it on the unlinked list.
1836	 */
1837	if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) {
1838		error = xfs_iunlink(sc->tp, sc->ip);
1839		if (error)
1840			return error;
1841	}
1842
1843	return 0;
1844}
1845
1846/* Repair an inode's fields. */
1847int
1848xrep_inode(
1849	struct xfs_scrub	*sc)
1850{
1851	int			error = 0;
1852
1853	/*
1854	 * No inode?  That means we failed the _iget verifiers.  Repair all
1855	 * the things that the inode verifiers care about, then retry _iget.
1856	 */
1857	if (!sc->ip) {
1858		struct xrep_inode	*ri = sc->buf;
1859
1860		ASSERT(ri != NULL);
1861
1862		error = xrep_dinode_problems(ri);
1863		if (error == -EBUSY) {
1864			/*
1865			 * Directory scan to recover inode mode encountered a
1866			 * busy inode, so we did not continue repairing things.
1867			 */
1868			return 0;
1869		}
1870		if (error)
1871			return error;
1872
1873		/* By this point we had better have a working incore inode. */
1874		if (!sc->ip)
1875			return -EFSCORRUPTED;
1876	}
1877
1878	xfs_trans_ijoin(sc->tp, sc->ip, 0);
1879
1880	/* If we found corruption of any kind, try to fix it. */
1881	if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
1882	    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
1883		error = xrep_inode_problems(sc);
1884		if (error)
1885			return error;
1886	}
1887
1888	/* See if we can clear the reflink flag. */
1889	if (xfs_is_reflink_inode(sc->ip)) {
1890		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1891		if (error)
1892			return error;
1893	}
1894
1895	/* Reconnect incore unlinked list */
1896	error = xrep_inode_unlinked(sc);
1897	if (error)
1898		return error;
1899
1900	return xrep_defer_finish(sc);
1901}
1902