1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <linux/fs.h>
7#include <linux/pagemap.h>
8#include <linux/time.h>
9#include <linux/init.h>
10#include <linux/string.h>
11#include <linux/backing-dev.h>
12#include <linux/falloc.h>
13#include <linux/writeback.h>
14#include <linux/compat.h>
15#include <linux/slab.h>
16#include <linux/btrfs.h>
17#include <linux/uio.h>
18#include <linux/iversion.h>
19#include <linux/fsverity.h>
20#include <linux/iomap.h>
21#include "ctree.h"
22#include "disk-io.h"
23#include "transaction.h"
24#include "btrfs_inode.h"
25#include "tree-log.h"
26#include "locking.h"
27#include "qgroup.h"
28#include "compression.h"
29#include "delalloc-space.h"
30#include "reflink.h"
31#include "subpage.h"
32#include "fs.h"
33#include "accessors.h"
34#include "extent-tree.h"
35#include "file-item.h"
36#include "ioctl.h"
37#include "file.h"
38#include "super.h"
39
/*
 * Simple helper to fault in pages and copy.  This should go away and be
 * replaced with calls into generic code.
 */
43static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
44					 struct page **prepared_pages,
45					 struct iov_iter *i)
46{
47	size_t copied = 0;
48	size_t total_copied = 0;
49	int pg = 0;
50	int offset = offset_in_page(pos);
51
52	while (write_bytes > 0) {
53		size_t count = min_t(size_t,
54				     PAGE_SIZE - offset, write_bytes);
55		struct page *page = prepared_pages[pg];
56		/*
57		 * Copy data from userspace to the current page
58		 */
59		copied = copy_page_from_iter_atomic(page, offset, count, i);
60
61		/* Flush processor's dcache for this page */
62		flush_dcache_page(page);
63
		/*
		 * If we get a partial write, we can end up with partially
		 * uptodate pages.  These add a lot of complexity, so make sure
		 * they don't happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall back to
		 * page-at-a-time copies after we return 0.
		 */
73		if (unlikely(copied < count)) {
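			/*
			 * A short copy into a page that is not uptodate would
			 * leave a partially filled page behind, so revert the
			 * iterator and force a retry in that case.  A short
			 * copy into an already uptodate page is harmless and
			 * is kept.
			 */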
74			if (!PageUptodate(page)) {
75				iov_iter_revert(i, copied);
76				copied = 0;
77			}
78			if (!copied)
79				break;
80		}
81
82		write_bytes -= copied;
83		total_copied += copied;
84		offset += copied;
85		if (offset == PAGE_SIZE) {
86			pg++;
87			offset = 0;
88		}
89	}
90	return total_copied;
91}
92
93/*
94 * unlocks pages after btrfs_file_write is done with them
95 */
96static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
97			     struct page **pages, size_t num_pages,
98			     u64 pos, u64 copied)
99{
100	size_t i;
101	u64 block_start = round_down(pos, fs_info->sectorsize);
102	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
103
104	ASSERT(block_len <= U32_MAX);
105	for (i = 0; i < num_pages; i++) {
		/*
		 * The "checked" flag is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty(); clear it here.  There should be no
		 * need to mark the pages accessed, as prepare_pages() already
		 * marked them accessed via pagecache_get_page() (FGP_ACCESSED).
		 */
112		btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
113						block_start, block_len);
114		unlock_page(pages[i]);
115		put_page(pages[i]);
116	}
117}
118
119/*
120 * After btrfs_copy_from_user(), update the following things for delalloc:
121 * - Mark newly dirtied pages as DELALLOC in the io tree.
122 *   Used to advise which range is to be written back.
123 * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
124 * - Update inode size for past EOF write
125 */
126int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
127		      size_t num_pages, loff_t pos, size_t write_bytes,
128		      struct extent_state **cached, bool noreserve)
129{
130	struct btrfs_fs_info *fs_info = inode->root->fs_info;
131	int ret = 0;
132	int i;
133	u64 num_bytes;
134	u64 start_pos;
135	u64 end_of_last_block;
136	u64 end_pos = pos + write_bytes;
137	loff_t isize = i_size_read(&inode->vfs_inode);
138	unsigned int extra_bits = 0;
139
140	if (write_bytes == 0)
141		return 0;
142
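	/*
	 * The noreserve case is used for NOCOW writes, where no data space has
	 * been reserved for this range; EXTENT_NORESERVE records that so the
	 * delalloc accounting knows there is no data reservation to release
	 * for it.
	 */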
143	if (noreserve)
144		extra_bits |= EXTENT_NORESERVE;
145
146	start_pos = round_down(pos, fs_info->sectorsize);
147	num_bytes = round_up(write_bytes + pos - start_pos,
148			     fs_info->sectorsize);
149	ASSERT(num_bytes <= U32_MAX);
150
151	end_of_last_block = start_pos + num_bytes - 1;
152
	/*
	 * The pages may have already been dirty; clear out the old accounting
	 * so we can set things up properly.
	 */
157	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
158			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
159			 cached);
160
161	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
162					extra_bits, cached);
163	if (ret)
164		return ret;
165
166	for (i = 0; i < num_pages; i++) {
167		struct page *p = pages[i];
168
169		btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
170					       start_pos, num_bytes);
171		btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
172						start_pos, num_bytes);
173		btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
174					    start_pos, num_bytes);
175	}
176
	/*
	 * We've only changed i_size in RAM, and we haven't updated the disk
	 * i_size.  There is no need to log the inode at this time.
	 */
182	if (end_pos > isize)
183		i_size_write(&inode->vfs_inode, end_pos);
184	return 0;
185}
186
187/*
 * This is very complex, but the basic idea is to drop all extents in the
 * range args->start - args->end.
191 *
192 * If an extent intersects the range but is not entirely inside the range
193 * it is either truncated or split.  Anything entirely inside the range
194 * is deleted from the tree.
195 *
196 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
197 * to deal with that. We set the field 'bytes_found' of the arguments structure
198 * with the number of allocated bytes found in the target range, so that the
199 * caller can update the inode's number of bytes in an atomic way when
200 * replacing extents in a range to avoid races with stat(2).
201 */
202int btrfs_drop_extents(struct btrfs_trans_handle *trans,
203		       struct btrfs_root *root, struct btrfs_inode *inode,
204		       struct btrfs_drop_extents_args *args)
205{
206	struct btrfs_fs_info *fs_info = root->fs_info;
207	struct extent_buffer *leaf;
208	struct btrfs_file_extent_item *fi;
209	struct btrfs_key key;
210	struct btrfs_key new_key;
211	u64 ino = btrfs_ino(inode);
212	u64 search_start = args->start;
213	u64 disk_bytenr = 0;
214	u64 num_bytes = 0;
215	u64 extent_offset = 0;
216	u64 extent_end = 0;
217	u64 last_end = args->start;
218	int del_nr = 0;
219	int del_slot = 0;
220	int extent_type;
221	int recow;
222	int ret;
223	int modify_tree = -1;
224	int update_refs;
225	int found = 0;
226	struct btrfs_path *path = args->path;
227
228	args->bytes_found = 0;
229	args->extent_inserted = false;
230
231	/* Must always have a path if ->replace_extent is true */
232	ASSERT(!(args->replace_extent && !args->path));
233
234	if (!path) {
235		path = btrfs_alloc_path();
236		if (!path) {
237			ret = -ENOMEM;
238			goto out;
239		}
240	}
241
242	if (args->drop_cache)
243		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
244
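	/*
	 * If the range starts at or beyond the on-disk i_size and we are not
	 * inserting a replacement extent, we most likely won't modify anything,
	 * so start with a read-only search (modify_tree == 0).  If an extent
	 * that needs changing is found after all, the loop below re-searches
	 * with COW enabled (modify_tree set back to -1).
	 */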
245	if (args->start >= inode->disk_i_size && !args->replace_extent)
246		modify_tree = 0;
247
248	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
249	while (1) {
250		recow = 0;
251		ret = btrfs_lookup_file_extent(trans, root, path, ino,
252					       search_start, modify_tree);
253		if (ret < 0)
254			break;
255		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
256			leaf = path->nodes[0];
257			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
258			if (key.objectid == ino &&
259			    key.type == BTRFS_EXTENT_DATA_KEY)
260				path->slots[0]--;
261		}
262		ret = 0;
263next_slot:
264		leaf = path->nodes[0];
265		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
266			BUG_ON(del_nr > 0);
267			ret = btrfs_next_leaf(root, path);
268			if (ret < 0)
269				break;
270			if (ret > 0) {
271				ret = 0;
272				break;
273			}
274			leaf = path->nodes[0];
275			recow = 1;
276		}
277
278		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
279
280		if (key.objectid > ino)
281			break;
282		if (WARN_ON_ONCE(key.objectid < ino) ||
283		    key.type < BTRFS_EXTENT_DATA_KEY) {
284			ASSERT(del_nr == 0);
285			path->slots[0]++;
286			goto next_slot;
287		}
288		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
289			break;
290
291		fi = btrfs_item_ptr(leaf, path->slots[0],
292				    struct btrfs_file_extent_item);
293		extent_type = btrfs_file_extent_type(leaf, fi);
294
295		if (extent_type == BTRFS_FILE_EXTENT_REG ||
296		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
297			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
298			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
299			extent_offset = btrfs_file_extent_offset(leaf, fi);
300			extent_end = key.offset +
301				btrfs_file_extent_num_bytes(leaf, fi);
302		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
303			extent_end = key.offset +
304				btrfs_file_extent_ram_bytes(leaf, fi);
305		} else {
306			/* can't happen */
307			BUG();
308		}
309
		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) when punching holes hit an -ENOSPC
		 * condition. So if we find one here, just ensure we delete it,
		 * otherwise we would insert a new file extent item with the
		 * same key (offset) as that 0 byte length file extent item in
		 * the call to btrfs_setup_item_for_insert() later in this
		 * function.
		 */
319		if (extent_end == key.offset && extent_end >= search_start) {
320			last_end = extent_end;
321			goto delete_extent_item;
322		}
323
324		if (extent_end <= search_start) {
325			path->slots[0]++;
326			goto next_slot;
327		}
328
329		found = 1;
330		search_start = max(key.offset, args->start);
331		if (recow || !modify_tree) {
332			modify_tree = -1;
333			btrfs_release_path(path);
334			continue;
335		}
336
337		/*
338		 *     | - range to drop - |
339		 *  | -------- extent -------- |
340		 */
341		if (args->start > key.offset && args->end < extent_end) {
342			BUG_ON(del_nr > 0);
343			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
344				ret = -EOPNOTSUPP;
345				break;
346			}
347
348			memcpy(&new_key, &key, sizeof(new_key));
349			new_key.offset = args->start;
350			ret = btrfs_duplicate_item(trans, root, path,
351						   &new_key);
352			if (ret == -EAGAIN) {
353				btrfs_release_path(path);
354				continue;
355			}
356			if (ret < 0)
357				break;
358
359			leaf = path->nodes[0];
360			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
361					    struct btrfs_file_extent_item);
362			btrfs_set_file_extent_num_bytes(leaf, fi,
363							args->start - key.offset);
364
365			fi = btrfs_item_ptr(leaf, path->slots[0],
366					    struct btrfs_file_extent_item);
367
368			extent_offset += args->start - key.offset;
369			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
370			btrfs_set_file_extent_num_bytes(leaf, fi,
371							extent_end - args->start);
372			btrfs_mark_buffer_dirty(trans, leaf);
373
374			if (update_refs && disk_bytenr > 0) {
375				struct btrfs_ref ref = {
376					.action = BTRFS_ADD_DELAYED_REF,
377					.bytenr = disk_bytenr,
378					.num_bytes = num_bytes,
379					.parent = 0,
380					.owning_root = btrfs_root_id(root),
381					.ref_root = btrfs_root_id(root),
382				};
383				btrfs_init_data_ref(&ref, new_key.objectid,
384						    args->start - extent_offset,
385						    0, false);
386				ret = btrfs_inc_extent_ref(trans, &ref);
387				if (ret) {
388					btrfs_abort_transaction(trans, ret);
389					break;
390				}
391			}
392			key.offset = args->start;
393		}
394		/*
395		 * From here on out we will have actually dropped something, so
396		 * last_end can be updated.
397		 */
398		last_end = extent_end;
399
400		/*
401		 *  | ---- range to drop ----- |
402		 *      | -------- extent -------- |
403		 */
404		if (args->start <= key.offset && args->end < extent_end) {
405			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
406				ret = -EOPNOTSUPP;
407				break;
408			}
409
410			memcpy(&new_key, &key, sizeof(new_key));
411			new_key.offset = args->end;
412			btrfs_set_item_key_safe(trans, path, &new_key);
413
414			extent_offset += args->end - key.offset;
415			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
416			btrfs_set_file_extent_num_bytes(leaf, fi,
417							extent_end - args->end);
418			btrfs_mark_buffer_dirty(trans, leaf);
419			if (update_refs && disk_bytenr > 0)
420				args->bytes_found += args->end - key.offset;
421			break;
422		}
423
424		search_start = extent_end;
425		/*
426		 *       | ---- range to drop ----- |
427		 *  | -------- extent -------- |
428		 */
429		if (args->start > key.offset && args->end >= extent_end) {
430			BUG_ON(del_nr > 0);
431			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
432				ret = -EOPNOTSUPP;
433				break;
434			}
435
436			btrfs_set_file_extent_num_bytes(leaf, fi,
437							args->start - key.offset);
438			btrfs_mark_buffer_dirty(trans, leaf);
439			if (update_refs && disk_bytenr > 0)
440				args->bytes_found += extent_end - args->start;
441			if (args->end == extent_end)
442				break;
443
444			path->slots[0]++;
445			goto next_slot;
446		}
447
448		/*
449		 *  | ---- range to drop ----- |
450		 *    | ------ extent ------ |
451		 */
452		if (args->start <= key.offset && args->end >= extent_end) {
453delete_extent_item:
454			if (del_nr == 0) {
455				del_slot = path->slots[0];
456				del_nr = 1;
457			} else {
458				BUG_ON(del_slot + del_nr != path->slots[0]);
459				del_nr++;
460			}
461
462			if (update_refs &&
463			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
464				args->bytes_found += extent_end - key.offset;
465				extent_end = ALIGN(extent_end,
466						   fs_info->sectorsize);
467			} else if (update_refs && disk_bytenr > 0) {
468				struct btrfs_ref ref = {
469					.action = BTRFS_DROP_DELAYED_REF,
470					.bytenr = disk_bytenr,
471					.num_bytes = num_bytes,
472					.parent = 0,
473					.owning_root = btrfs_root_id(root),
474					.ref_root = btrfs_root_id(root),
475				};
476				btrfs_init_data_ref(&ref, key.objectid,
477						    key.offset - extent_offset,
478						    0, false);
479				ret = btrfs_free_extent(trans, &ref);
480				if (ret) {
481					btrfs_abort_transaction(trans, ret);
482					break;
483				}
484				args->bytes_found += extent_end - key.offset;
485			}
486
487			if (args->end == extent_end)
488				break;
489
490			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
491				path->slots[0]++;
492				goto next_slot;
493			}
494
495			ret = btrfs_del_items(trans, root, path, del_slot,
496					      del_nr);
497			if (ret) {
498				btrfs_abort_transaction(trans, ret);
499				break;
500			}
501
502			del_nr = 0;
503			del_slot = 0;
504
505			btrfs_release_path(path);
506			continue;
507		}
508
509		BUG();
510	}
511
512	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to the first slot, so that after the
		 * delete, if items are moved off from our leaf to its
		 * immediate left or right neighbor leaves, we end up with a
		 * correct and adjusted path->slots[0] for our insertion (if
		 * args->replace_extent).
		 */
519		path->slots[0] = del_slot;
520		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
521		if (ret)
522			btrfs_abort_transaction(trans, ret);
523	}
524
525	leaf = path->nodes[0];
526	/*
527	 * If btrfs_del_items() was called, it might have deleted a leaf, in
528	 * which case it unlocked our path, so check path->locks[0] matches a
529	 * write lock.
530	 */
531	if (!ret && args->replace_extent &&
532	    path->locks[0] == BTRFS_WRITE_LOCK &&
533	    btrfs_leaf_free_space(leaf) >=
534	    sizeof(struct btrfs_item) + args->extent_item_size) {
535
536		key.objectid = ino;
537		key.type = BTRFS_EXTENT_DATA_KEY;
538		key.offset = args->start;
539		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
540			struct btrfs_key slot_key;
541
542			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
543			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
544				path->slots[0]++;
545		}
546		btrfs_setup_item_for_insert(trans, root, path, &key,
547					    args->extent_item_size);
548		args->extent_inserted = true;
549	}
550
551	if (!args->path)
552		btrfs_free_path(path);
553	else if (!args->extent_inserted)
554		btrfs_release_path(path);
555out:
556	args->drop_end = found ? min(args->end, last_end) : args->end;
557
558	return ret;
559}
560
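/*
 * Helper for btrfs_mark_extent_written(): check whether the file extent item
 * at @slot refers to the same on-disk extent (same disk bytenr and original
 * offset, regular type, no compression or encryption), so that it can be
 * merged with the extent being marked as written.
 *
 * Returns 1 and fills in @start/@end with the item's range if it is mergeable,
 * 0 otherwise.
 */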
561static int extent_mergeable(struct extent_buffer *leaf, int slot,
562			    u64 objectid, u64 bytenr, u64 orig_offset,
563			    u64 *start, u64 *end)
564{
565	struct btrfs_file_extent_item *fi;
566	struct btrfs_key key;
567	u64 extent_end;
568
569	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
570		return 0;
571
572	btrfs_item_key_to_cpu(leaf, &key, slot);
573	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
574		return 0;
575
576	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
577	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
578	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
579	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
580	    btrfs_file_extent_compression(leaf, fi) ||
581	    btrfs_file_extent_encryption(leaf, fi) ||
582	    btrfs_file_extent_other_encoding(leaf, fi))
583		return 0;
584
585	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
586	if ((*start && *start != key.offset) || (*end && *end != extent_end))
587		return 0;
588
589	*start = key.offset;
590	*end = extent_end;
591	return 1;
592}
593
/*
 * Mark the extent in the range start - end as written.
 *
 * This changes the extent type from 'pre-allocated' to 'regular'. If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
601int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
602			      struct btrfs_inode *inode, u64 start, u64 end)
603{
604	struct btrfs_root *root = inode->root;
605	struct extent_buffer *leaf;
606	struct btrfs_path *path;
607	struct btrfs_file_extent_item *fi;
608	struct btrfs_ref ref = { 0 };
609	struct btrfs_key key;
610	struct btrfs_key new_key;
611	u64 bytenr;
612	u64 num_bytes;
613	u64 extent_end;
614	u64 orig_offset;
615	u64 other_start;
616	u64 other_end;
617	u64 split;
618	int del_nr = 0;
619	int del_slot = 0;
620	int recow;
621	int ret = 0;
622	u64 ino = btrfs_ino(inode);
623
624	path = btrfs_alloc_path();
625	if (!path)
626		return -ENOMEM;
627again:
628	recow = 0;
629	split = start;
630	key.objectid = ino;
631	key.type = BTRFS_EXTENT_DATA_KEY;
632	key.offset = split;
633
634	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
635	if (ret < 0)
636		goto out;
637	if (ret > 0 && path->slots[0] > 0)
638		path->slots[0]--;
639
640	leaf = path->nodes[0];
641	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
642	if (key.objectid != ino ||
643	    key.type != BTRFS_EXTENT_DATA_KEY) {
644		ret = -EINVAL;
645		btrfs_abort_transaction(trans, ret);
646		goto out;
647	}
648	fi = btrfs_item_ptr(leaf, path->slots[0],
649			    struct btrfs_file_extent_item);
650	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
651		ret = -EINVAL;
652		btrfs_abort_transaction(trans, ret);
653		goto out;
654	}
655	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
656	if (key.offset > start || extent_end < end) {
657		ret = -EINVAL;
658		btrfs_abort_transaction(trans, ret);
659		goto out;
660	}
661
662	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
663	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
664	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
665	memcpy(&new_key, &key, sizeof(new_key));
666
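	/*
	 * The written range starts at the beginning of the prealloc extent but
	 * ends before it.  If the previous item maps the same on-disk extent,
	 * grow that item to cover the written part instead of splitting.
	 */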
667	if (start == key.offset && end < extent_end) {
668		other_start = 0;
669		other_end = start;
670		if (extent_mergeable(leaf, path->slots[0] - 1,
671				     ino, bytenr, orig_offset,
672				     &other_start, &other_end)) {
673			new_key.offset = end;
674			btrfs_set_item_key_safe(trans, path, &new_key);
675			fi = btrfs_item_ptr(leaf, path->slots[0],
676					    struct btrfs_file_extent_item);
677			btrfs_set_file_extent_generation(leaf, fi,
678							 trans->transid);
679			btrfs_set_file_extent_num_bytes(leaf, fi,
680							extent_end - end);
681			btrfs_set_file_extent_offset(leaf, fi,
682						     end - orig_offset);
683			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
684					    struct btrfs_file_extent_item);
685			btrfs_set_file_extent_generation(leaf, fi,
686							 trans->transid);
687			btrfs_set_file_extent_num_bytes(leaf, fi,
688							end - other_start);
689			btrfs_mark_buffer_dirty(trans, leaf);
690			goto out;
691		}
692	}
693
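	/*
	 * The written range ends at the end of the prealloc extent but starts
	 * after its beginning.  If the next item maps the same on-disk extent,
	 * extend that item backwards to cover the written part instead of
	 * splitting.
	 */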
694	if (start > key.offset && end == extent_end) {
695		other_start = end;
696		other_end = 0;
697		if (extent_mergeable(leaf, path->slots[0] + 1,
698				     ino, bytenr, orig_offset,
699				     &other_start, &other_end)) {
700			fi = btrfs_item_ptr(leaf, path->slots[0],
701					    struct btrfs_file_extent_item);
702			btrfs_set_file_extent_num_bytes(leaf, fi,
703							start - key.offset);
704			btrfs_set_file_extent_generation(leaf, fi,
705							 trans->transid);
706			path->slots[0]++;
707			new_key.offset = start;
708			btrfs_set_item_key_safe(trans, path, &new_key);
709
710			fi = btrfs_item_ptr(leaf, path->slots[0],
711					    struct btrfs_file_extent_item);
712			btrfs_set_file_extent_generation(leaf, fi,
713							 trans->transid);
714			btrfs_set_file_extent_num_bytes(leaf, fi,
715							other_end - start);
716			btrfs_set_file_extent_offset(leaf, fi,
717						     start - orig_offset);
718			btrfs_mark_buffer_dirty(trans, leaf);
719			goto out;
720		}
721	}
722
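	/*
	 * No merge was possible, so split the prealloc extent at 'start' and/or
	 * 'end' by duplicating the file extent item; each new item takes an
	 * extra data reference on the underlying extent.
	 */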
723	while (start > key.offset || end < extent_end) {
724		if (key.offset == start)
725			split = end;
726
727		new_key.offset = split;
728		ret = btrfs_duplicate_item(trans, root, path, &new_key);
729		if (ret == -EAGAIN) {
730			btrfs_release_path(path);
731			goto again;
732		}
733		if (ret < 0) {
734			btrfs_abort_transaction(trans, ret);
735			goto out;
736		}
737
738		leaf = path->nodes[0];
739		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
740				    struct btrfs_file_extent_item);
741		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
742		btrfs_set_file_extent_num_bytes(leaf, fi,
743						split - key.offset);
744
745		fi = btrfs_item_ptr(leaf, path->slots[0],
746				    struct btrfs_file_extent_item);
747
748		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
749		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
750		btrfs_set_file_extent_num_bytes(leaf, fi,
751						extent_end - split);
752		btrfs_mark_buffer_dirty(trans, leaf);
753
754		ref.action = BTRFS_ADD_DELAYED_REF;
755		ref.bytenr = bytenr;
756		ref.num_bytes = num_bytes;
757		ref.parent = 0;
758		ref.owning_root = btrfs_root_id(root);
759		ref.ref_root = btrfs_root_id(root);
760		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
761		ret = btrfs_inc_extent_ref(trans, &ref);
762		if (ret) {
763			btrfs_abort_transaction(trans, ret);
764			goto out;
765		}
766
767		if (split == start) {
768			key.offset = start;
769		} else {
770			if (start != key.offset) {
771				ret = -EINVAL;
772				btrfs_abort_transaction(trans, ret);
773				goto out;
774			}
775			path->slots[0]--;
776			extent_end = end;
777		}
778		recow = 1;
779	}
780
781	other_start = end;
782	other_end = 0;
783
784	ref.action = BTRFS_DROP_DELAYED_REF;
785	ref.bytenr = bytenr;
786	ref.num_bytes = num_bytes;
787	ref.parent = 0;
788	ref.owning_root = btrfs_root_id(root);
789	ref.ref_root = btrfs_root_id(root);
790	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
791	if (extent_mergeable(leaf, path->slots[0] + 1,
792			     ino, bytenr, orig_offset,
793			     &other_start, &other_end)) {
794		if (recow) {
795			btrfs_release_path(path);
796			goto again;
797		}
798		extent_end = other_end;
799		del_slot = path->slots[0] + 1;
800		del_nr++;
801		ret = btrfs_free_extent(trans, &ref);
802		if (ret) {
803			btrfs_abort_transaction(trans, ret);
804			goto out;
805		}
806	}
807	other_start = 0;
808	other_end = start;
809	if (extent_mergeable(leaf, path->slots[0] - 1,
810			     ino, bytenr, orig_offset,
811			     &other_start, &other_end)) {
812		if (recow) {
813			btrfs_release_path(path);
814			goto again;
815		}
816		key.offset = other_start;
817		del_slot = path->slots[0];
818		del_nr++;
819		ret = btrfs_free_extent(trans, &ref);
820		if (ret) {
821			btrfs_abort_transaction(trans, ret);
822			goto out;
823		}
824	}
825	if (del_nr == 0) {
826		fi = btrfs_item_ptr(leaf, path->slots[0],
827			   struct btrfs_file_extent_item);
828		btrfs_set_file_extent_type(leaf, fi,
829					   BTRFS_FILE_EXTENT_REG);
830		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
831		btrfs_mark_buffer_dirty(trans, leaf);
832	} else {
833		fi = btrfs_item_ptr(leaf, del_slot - 1,
834			   struct btrfs_file_extent_item);
835		btrfs_set_file_extent_type(leaf, fi,
836					   BTRFS_FILE_EXTENT_REG);
837		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
838		btrfs_set_file_extent_num_bytes(leaf, fi,
839						extent_end - key.offset);
840		btrfs_mark_buffer_dirty(trans, leaf);
841
842		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
843		if (ret < 0) {
844			btrfs_abort_transaction(trans, ret);
845			goto out;
846		}
847	}
848out:
849	btrfs_free_path(path);
850	return ret;
851}
852
/*
 * On error we return an unlocked page and the error value.
 * On success we return a locked page and 0.
 */
857static int prepare_uptodate_page(struct inode *inode,
858				 struct page *page, u64 pos,
859				 bool force_uptodate)
860{
861	struct folio *folio = page_folio(page);
862	int ret = 0;
863
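	/*
	 * Only read in the existing page contents when the write does not
	 * cover the whole page (the boundary at @pos is not page aligned) or
	 * when the caller explicitly requires an uptodate page.
	 */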
864	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
865	    !PageUptodate(page)) {
866		ret = btrfs_read_folio(NULL, folio);
867		if (ret)
868			return ret;
869		lock_page(page);
870		if (!PageUptodate(page)) {
871			unlock_page(page);
872			return -EIO;
873		}
874
875		/*
876		 * Since btrfs_read_folio() will unlock the folio before it
877		 * returns, there is a window where btrfs_release_folio() can be
		 * called to release the page.  Here we check both the inode
		 * mapping and the folio private flag to make sure the page
		 * was not released.
881		 *
882		 * The private flag check is essential for subpage as we need
883		 * to store extra bitmap using folio private.
884		 */
885		if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
886			unlock_page(page);
887			return -EAGAIN;
888		}
889	}
890	return 0;
891}
892
893static fgf_t get_prepare_fgp_flags(bool nowait)
894{
895	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
896
897	if (nowait)
898		fgp_flags |= FGP_NOWAIT;
899
900	return fgp_flags;
901}
902
903static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
904{
905	gfp_t gfp;
906
907	gfp = btrfs_alloc_write_mask(inode->i_mapping);
908	if (nowait) {
909		gfp &= ~__GFP_DIRECT_RECLAIM;
910		gfp |= GFP_NOWAIT;
911	}
912
913	return gfp;
914}
915
/*
 * This just gets pages into the page cache and locks them down.
 */
919static noinline int prepare_pages(struct inode *inode, struct page **pages,
920				  size_t num_pages, loff_t pos,
921				  size_t write_bytes, bool force_uptodate,
922				  bool nowait)
923{
924	int i;
925	unsigned long index = pos >> PAGE_SHIFT;
926	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
927	fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
928	int ret = 0;
929	int faili;
930
931	for (i = 0; i < num_pages; i++) {
932again:
933		pages[i] = pagecache_get_page(inode->i_mapping, index + i,
934					      fgp_flags, mask | __GFP_WRITE);
935		if (!pages[i]) {
936			faili = i - 1;
937			if (nowait)
938				ret = -EAGAIN;
939			else
940				ret = -ENOMEM;
941			goto fail;
942		}
943
944		ret = set_page_extent_mapped(pages[i]);
945		if (ret < 0) {
946			faili = i;
947			goto fail;
948		}
949
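		/*
		 * Only the first and the last page of the range can be
		 * partially covered by this write, so only those may need
		 * their existing contents read in; the pages in between will
		 * be fully overwritten.
		 */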
950		if (i == 0)
951			ret = prepare_uptodate_page(inode, pages[i], pos,
952						    force_uptodate);
953		if (!ret && i == num_pages - 1)
954			ret = prepare_uptodate_page(inode, pages[i],
955						    pos + write_bytes, false);
956		if (ret) {
957			put_page(pages[i]);
958			if (!nowait && ret == -EAGAIN) {
959				ret = 0;
960				goto again;
961			}
962			faili = i - 1;
963			goto fail;
964		}
965		wait_on_page_writeback(pages[i]);
966	}
967
968	return 0;
969fail:
970	while (faili >= 0) {
971		unlock_page(pages[faili]);
972		put_page(pages[faili]);
973		faili--;
974	}
975	return ret;
976
977}
978
/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if needed.
 *
 * Return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - the pages need to be re-prepared
 * any other < 0 value - something went wrong
 */
989static noinline int
990lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
991				size_t num_pages, loff_t pos,
992				size_t write_bytes,
993				u64 *lockstart, u64 *lockend, bool nowait,
994				struct extent_state **cached_state)
995{
996	struct btrfs_fs_info *fs_info = inode->root->fs_info;
997	u64 start_pos;
998	u64 last_pos;
999	int i;
1000	int ret = 0;
1001
1002	start_pos = round_down(pos, fs_info->sectorsize);
1003	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1004
1005	if (start_pos < inode->vfs_inode.i_size) {
1006		struct btrfs_ordered_extent *ordered;
1007
1008		if (nowait) {
1009			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
1010					     cached_state)) {
1011				for (i = 0; i < num_pages; i++) {
1012					unlock_page(pages[i]);
1013					put_page(pages[i]);
1014					pages[i] = NULL;
1015				}
1016
1017				return -EAGAIN;
1018			}
1019		} else {
1020			lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
1021		}
1022
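		/*
		 * If an ordered extent overlaps the locked range, drop the
		 * extent lock and the pages, wait for that ordered extent to
		 * complete and return -EAGAIN so the caller re-prepares the
		 * pages.
		 */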
1023		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1024						     last_pos - start_pos + 1);
1025		if (ordered &&
1026		    ordered->file_offset + ordered->num_bytes > start_pos &&
1027		    ordered->file_offset <= last_pos) {
1028			unlock_extent(&inode->io_tree, start_pos, last_pos,
1029				      cached_state);
1030			for (i = 0; i < num_pages; i++) {
1031				unlock_page(pages[i]);
1032				put_page(pages[i]);
1033			}
1034			btrfs_start_ordered_extent(ordered);
1035			btrfs_put_ordered_extent(ordered);
1036			return -EAGAIN;
1037		}
1038		if (ordered)
1039			btrfs_put_ordered_extent(ordered);
1040
1041		*lockstart = start_pos;
1042		*lockend = last_pos;
1043		ret = 1;
1044	}
1045
1046	/*
1047	 * We should be called after prepare_pages() which should have locked
1048	 * all pages in the range.
1049	 */
1050	for (i = 0; i < num_pages; i++)
1051		WARN_ON(!PageLocked(pages[i]));
1052
1053	return ret;
1054}
1055
1056/*
1057 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1058 *
1059 * @pos:         File offset.
1060 * @write_bytes: The length to write, will be updated to the nocow writeable
1061 *               range.
1062 *
1063 * This function will flush ordered extents in the range to ensure proper
1064 * nocow checks.
1065 *
1066 * Return:
1067 * > 0          If we can nocow, and updates @write_bytes.
1068 *  0           If we can't do a nocow write.
 * -EAGAIN      If we can't do a nocow write because snapshotting of the inode's
 *              root is in progress.
1071 * < 0          If an error happened.
1072 *
1073 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
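 *
 * Illustrative call pattern (sketch only; see btrfs_buffered_write() for a
 * real caller):
 *
 *	ret = btrfs_check_nocow_lock(inode, pos, &write_bytes, nowait);
 *	if (ret > 0) {
 *		... do the NOCOW write of up to write_bytes bytes ...
 *		btrfs_check_nocow_unlock(inode);
 *	}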
1074 */
1075int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1076			   size_t *write_bytes, bool nowait)
1077{
1078	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1079	struct btrfs_root *root = inode->root;
1080	struct extent_state *cached_state = NULL;
1081	u64 lockstart, lockend;
1082	u64 num_bytes;
1083	int ret;
1084
1085	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1086		return 0;
1087
1088	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1089		return -EAGAIN;
1090
1091	lockstart = round_down(pos, fs_info->sectorsize);
1092	lockend = round_up(pos + *write_bytes,
1093			   fs_info->sectorsize) - 1;
1094	num_bytes = lockend - lockstart + 1;
1095
1096	if (nowait) {
1097		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1098						  &cached_state)) {
1099			btrfs_drew_write_unlock(&root->snapshot_lock);
1100			return -EAGAIN;
1101		}
1102	} else {
1103		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1104						   &cached_state);
1105	}
1106	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1107			NULL, NULL, NULL, nowait, false);
1108	if (ret <= 0)
1109		btrfs_drew_write_unlock(&root->snapshot_lock);
1110	else
		*write_bytes = min_t(size_t, *write_bytes,
				     num_bytes - pos + lockstart);
1113	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1114
1115	return ret;
1116}
1117
1118void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1119{
1120	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1121}
1122
1123static void update_time_for_write(struct inode *inode)
1124{
1125	struct timespec64 now, ts;
1126
1127	if (IS_NOCMTIME(inode))
1128		return;
1129
1130	now = current_time(inode);
1131	ts = inode_get_mtime(inode);
1132	if (!timespec64_equal(&ts, &now))
1133		inode_set_mtime_to_ts(inode, now);
1134
1135	ts = inode_get_ctime(inode);
1136	if (!timespec64_equal(&ts, &now))
1137		inode_set_ctime_to_ts(inode, now);
1138
1139	if (IS_I_VERSION(inode))
1140		inode_inc_iversion(inode);
1141}
1142
1143static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1144			     size_t count)
1145{
1146	struct file *file = iocb->ki_filp;
1147	struct inode *inode = file_inode(file);
1148	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1149	loff_t pos = iocb->ki_pos;
1150	int ret;
1151	loff_t oldsize;
1152	loff_t start_pos;
1153
1154	/*
1155	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really do a NOCOW write into the target range
	 * (using can_nocow_extent() at btrfs_get_blocks_direct_write()).
1159	 */
1160	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1161	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1162		return -EAGAIN;
1163
1164	ret = file_remove_privs(file);
1165	if (ret)
1166		return ret;
1167
	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so any -ENOSPC is returned there.  We
	 * don't need to start yet another transaction to update the inode as we
	 * will update the inode when we finish writing whatever data we write.
	 */
1174	update_time_for_write(inode);
1175
1176	start_pos = round_down(pos, fs_info->sectorsize);
1177	oldsize = i_size_read(inode);
1178	if (start_pos > oldsize) {
1179		/* Expand hole size to cover write data, preventing empty gap */
1180		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1181
1182		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1183		if (ret)
1184			return ret;
1185	}
1186
1187	return 0;
1188}
1189
1190static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1191					       struct iov_iter *i)
1192{
1193	struct file *file = iocb->ki_filp;
1194	loff_t pos;
1195	struct inode *inode = file_inode(file);
1196	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1197	struct page **pages = NULL;
1198	struct extent_changeset *data_reserved = NULL;
1199	u64 release_bytes = 0;
1200	u64 lockstart;
1201	u64 lockend;
1202	size_t num_written = 0;
1203	int nrptrs;
1204	ssize_t ret;
1205	bool only_release_metadata = false;
1206	bool force_page_uptodate = false;
1207	loff_t old_isize = i_size_read(inode);
1208	unsigned int ilock_flags = 0;
1209	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1210	unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1211
1212	if (nowait)
1213		ilock_flags |= BTRFS_ILOCK_TRY;
1214
1215	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1216	if (ret < 0)
1217		return ret;
1218
1219	ret = generic_write_checks(iocb, i);
1220	if (ret <= 0)
1221		goto out;
1222
1223	ret = btrfs_write_check(iocb, i, ret);
1224	if (ret < 0)
1225		goto out;
1226
1227	pos = iocb->ki_pos;
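	/*
	 * Size the pages array: enough pointers for the whole iov, capped at
	 * one page worth of pointers and at the number of pages we may still
	 * dirty before being throttled, but always at least 8 entries.
	 */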
1228	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1229			PAGE_SIZE / (sizeof(struct page *)));
1230	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1231	nrptrs = max(nrptrs, 8);
1232	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1233	if (!pages) {
1234		ret = -ENOMEM;
1235		goto out;
1236	}
1237
1238	while (iov_iter_count(i) > 0) {
1239		struct extent_state *cached_state = NULL;
1240		size_t offset = offset_in_page(pos);
1241		size_t sector_offset;
1242		size_t write_bytes = min(iov_iter_count(i),
1243					 nrptrs * (size_t)PAGE_SIZE -
1244					 offset);
1245		size_t num_pages;
1246		size_t reserve_bytes;
1247		size_t dirty_pages;
1248		size_t copied;
1249		size_t dirty_sectors;
1250		size_t num_sectors;
1251		int extents_locked;
1252
		/*
		 * Fault in pages before locking them in prepare_pages()
		 * to avoid a recursive lock.
		 */
1257		if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1258			ret = -EFAULT;
1259			break;
1260		}
1261
1262		only_release_metadata = false;
1263		sector_offset = pos & (fs_info->sectorsize - 1);
1264
1265		extent_changeset_release(data_reserved);
1266		ret = btrfs_check_data_free_space(BTRFS_I(inode),
1267						  &data_reserved, pos,
1268						  write_bytes, nowait);
1269		if (ret < 0) {
1270			int can_nocow;
1271
1272			if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
1273				ret = -EAGAIN;
1274				break;
1275			}
1276
1277			/*
1278			 * If we don't have to COW at the offset, reserve
1279			 * metadata only. write_bytes may get smaller than
1280			 * requested here.
1281			 */
1282			can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1283							   &write_bytes, nowait);
1284			if (can_nocow < 0)
1285				ret = can_nocow;
1286			if (can_nocow > 0)
1287				ret = 0;
1288			if (ret)
1289				break;
1290			only_release_metadata = true;
1291		}
1292
1293		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1294		WARN_ON(num_pages > nrptrs);
1295		reserve_bytes = round_up(write_bytes + sector_offset,
1296					 fs_info->sectorsize);
1297		WARN_ON(reserve_bytes == 0);
1298		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1299						      reserve_bytes,
1300						      reserve_bytes, nowait);
1301		if (ret) {
1302			if (!only_release_metadata)
1303				btrfs_free_reserved_data_space(BTRFS_I(inode),
1304						data_reserved, pos,
1305						write_bytes);
1306			else
1307				btrfs_check_nocow_unlock(BTRFS_I(inode));
1308
1309			if (nowait && ret == -ENOSPC)
1310				ret = -EAGAIN;
1311			break;
1312		}
1313
1314		release_bytes = reserve_bytes;
1315again:
1316		ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
1317		if (ret) {
1318			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1319			break;
1320		}
1321
		/*
		 * This is going to set up the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop.
		 */
1327		ret = prepare_pages(inode, pages, num_pages,
1328				    pos, write_bytes, force_page_uptodate, false);
1329		if (ret) {
1330			btrfs_delalloc_release_extents(BTRFS_I(inode),
1331						       reserve_bytes);
1332			break;
1333		}
1334
1335		extents_locked = lock_and_cleanup_extent_if_need(
1336				BTRFS_I(inode), pages,
1337				num_pages, pos, write_bytes, &lockstart,
1338				&lockend, nowait, &cached_state);
1339		if (extents_locked < 0) {
1340			if (!nowait && extents_locked == -EAGAIN)
1341				goto again;
1342
1343			btrfs_delalloc_release_extents(BTRFS_I(inode),
1344						       reserve_bytes);
1345			ret = extents_locked;
1346			break;
1347		}
1348
1349		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1350
1351		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1352		dirty_sectors = round_up(copied + sector_offset,
1353					fs_info->sectorsize);
1354		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1355
		/*
		 * If we have trouble faulting in the pages, fall back to one
		 * page at a time.
		 */
1360		if (copied < write_bytes)
1361			nrptrs = 1;
1362
1363		if (copied == 0) {
1364			force_page_uptodate = true;
1365			dirty_sectors = 0;
1366			dirty_pages = 0;
1367		} else {
1368			force_page_uptodate = false;
1369			dirty_pages = DIV_ROUND_UP(copied + offset,
1370						   PAGE_SIZE);
1371		}
1372
1373		if (num_sectors > dirty_sectors) {
1374			/* release everything except the sectors we dirtied */
1375			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1376			if (only_release_metadata) {
1377				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1378							release_bytes, true);
1379			} else {
1380				u64 __pos;
1381
1382				__pos = round_down(pos,
1383						   fs_info->sectorsize) +
1384					(dirty_pages << PAGE_SHIFT);
1385				btrfs_delalloc_release_space(BTRFS_I(inode),
1386						data_reserved, __pos,
1387						release_bytes, true);
1388			}
1389		}
1390
1391		release_bytes = round_up(copied + sector_offset,
1392					fs_info->sectorsize);
1393
1394		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1395					dirty_pages, pos, copied,
1396					&cached_state, only_release_metadata);
1397
1398		/*
1399		 * If we have not locked the extent range, because the range's
1400		 * start offset is >= i_size, we might still have a non-NULL
1401		 * cached extent state, acquired while marking the extent range
1402		 * as delalloc through btrfs_dirty_pages(). Therefore free any
1403		 * possible cached extent state to avoid a memory leak.
1404		 */
1405		if (extents_locked)
1406			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
1407				      lockend, &cached_state);
1408		else
1409			free_extent_state(cached_state);
1410
1411		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1412		if (ret) {
1413			btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1414			break;
1415		}
1416
1417		release_bytes = 0;
1418		if (only_release_metadata)
1419			btrfs_check_nocow_unlock(BTRFS_I(inode));
1420
1421		btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1422
1423		cond_resched();
1424
1425		pos += copied;
1426		num_written += copied;
1427	}
1428
1429	kfree(pages);
1430
1431	if (release_bytes) {
1432		if (only_release_metadata) {
1433			btrfs_check_nocow_unlock(BTRFS_I(inode));
1434			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1435					release_bytes, true);
1436		} else {
1437			btrfs_delalloc_release_space(BTRFS_I(inode),
1438					data_reserved,
1439					round_down(pos, fs_info->sectorsize),
1440					release_bytes, true);
1441		}
1442	}
1443
1444	extent_changeset_free(data_reserved);
1445	if (num_written > 0) {
1446		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1447		iocb->ki_pos += num_written;
1448	}
1449out:
1450	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1451	return num_written ? num_written : ret;
1452}
1453
1454static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1455			       const struct iov_iter *iter, loff_t offset)
1456{
1457	const u32 blocksize_mask = fs_info->sectorsize - 1;
1458
1459	if (offset & blocksize_mask)
1460		return -EINVAL;
1461
1462	if (iov_iter_alignment(iter) & blocksize_mask)
1463		return -EINVAL;
1464
1465	return 0;
1466}
1467
1468static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1469{
1470	struct file *file = iocb->ki_filp;
1471	struct inode *inode = file_inode(file);
1472	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1473	loff_t pos;
1474	ssize_t written = 0;
1475	ssize_t written_buffered;
1476	size_t prev_left = 0;
1477	loff_t endbyte;
1478	ssize_t ret;
1479	unsigned int ilock_flags = 0;
1480	struct iomap_dio *dio;
1481
1482	if (iocb->ki_flags & IOCB_NOWAIT)
1483		ilock_flags |= BTRFS_ILOCK_TRY;
1484
	/*
	 * If the write DIO is within EOF, use a shared lock, but only if the
	 * security bits will likely not be dropped by file_remove_privs()
	 * called from btrfs_write_check(). Both conditions are rechecked after
	 * the lock is acquired.
	 */
1491	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
1492		ilock_flags |= BTRFS_ILOCK_SHARED;
1493
1494relock:
1495	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1496	if (ret < 0)
1497		return ret;
1498
1499	/* Shared lock cannot be used with security bits set. */
1500	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
1501		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1502		ilock_flags &= ~BTRFS_ILOCK_SHARED;
1503		goto relock;
1504	}
1505
1506	ret = generic_write_checks(iocb, from);
1507	if (ret <= 0) {
1508		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1509		return ret;
1510	}
1511
1512	ret = btrfs_write_check(iocb, from, ret);
1513	if (ret < 0) {
1514		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1515		goto out;
1516	}
1517
1518	pos = iocb->ki_pos;
	/*
	 * Re-check since the file size may have changed just before taking the
	 * lock, or pos may have changed because of O_APPEND in
	 * generic_write_checks().
	 */
1523	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1524	    pos + iov_iter_count(from) > i_size_read(inode)) {
1525		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1526		ilock_flags &= ~BTRFS_ILOCK_SHARED;
1527		goto relock;
1528	}
1529
1530	if (check_direct_IO(fs_info, from, pos)) {
1531		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1532		goto buffered;
1533	}
1534
1535	/*
1536	 * The iov_iter can be mapped to the same file range we are writing to.
1537	 * If that's the case, then we will deadlock in the iomap code, because
1538	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
1539	 * an ordered extent, and after that it will fault in the pages that the
1540	 * iov_iter refers to. During the fault in we end up in the readahead
1541	 * pages code (starting at btrfs_readahead()), which will lock the range,
1542	 * find that ordered extent and then wait for it to complete (at
1543	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1544	 * obviously the ordered extent can never complete as we didn't submit
1545	 * yet the respective bio(s). This always happens when the buffer is
1546	 * memory mapped to the same file range, since the iomap DIO code always
1547	 * invalidates pages in the target file range (after starting and waiting
1548	 * for any writeback).
1549	 *
1550	 * So here we disable page faults in the iov_iter and then retry if we
1551	 * got -EFAULT, faulting in the pages before the retry.
1552	 */
1553	from->nofault = true;
1554	dio = btrfs_dio_write(iocb, from, written);
1555	from->nofault = false;
1556
1557	/*
1558	 * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1559	 * iocb, and that needs to lock the inode. So unlock it before calling
1560	 * iomap_dio_complete() to avoid a deadlock.
1561	 */
1562	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1563
1564	if (IS_ERR_OR_NULL(dio))
1565		ret = PTR_ERR_OR_ZERO(dio);
1566	else
1567		ret = iomap_dio_complete(dio);
1568
1569	/* No increment (+=) because iomap returns a cumulative value. */
1570	if (ret > 0)
1571		written = ret;
1572
1573	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
1574		const size_t left = iov_iter_count(from);
1575		/*
1576		 * We have more data left to write. Try to fault in as many as
1577		 * possible of the remainder pages and retry. We do this without
1578		 * releasing and locking again the inode, to prevent races with
1579		 * truncate.
1580		 *
1581		 * Also, in case the iov refers to pages in the file range of the
1582		 * file we want to write to (due to a mmap), we could enter an
1583		 * infinite loop if we retry after faulting the pages in, since
1584		 * iomap will invalidate any pages in the range early on, before
1585		 * it tries to fault in the pages of the iov. So we keep track of
1586		 * how much was left of iov in the previous EFAULT and fallback
1587		 * to buffered IO in case we haven't made any progress.
1588		 */
1589		if (left == prev_left) {
1590			ret = -ENOTBLK;
1591		} else {
1592			fault_in_iov_iter_readable(from, left);
1593			prev_left = left;
1594			goto relock;
1595		}
1596	}
1597
1598	/*
1599	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fall back to buffered IO.
1601	 */
1602	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
1603		goto out;
1604
1605buffered:
1606	/*
1607	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1608	 * it must retry the operation in a context where blocking is acceptable,
1609	 * because even if we end up not blocking during the buffered IO attempt
1610	 * below, we will block when flushing and waiting for the IO.
1611	 */
1612	if (iocb->ki_flags & IOCB_NOWAIT) {
1613		ret = -EAGAIN;
1614		goto out;
1615	}
1616
1617	pos = iocb->ki_pos;
1618	written_buffered = btrfs_buffered_write(iocb, from);
1619	if (written_buffered < 0) {
1620		ret = written_buffered;
1621		goto out;
1622	}
1623	/*
1624	 * Ensure all data is persisted. We want the next direct IO read to be
1625	 * able to read what was just written.
1626	 */
1627	endbyte = pos + written_buffered - 1;
1628	ret = btrfs_fdatawrite_range(inode, pos, endbyte);
1629	if (ret)
1630		goto out;
1631	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1632	if (ret)
1633		goto out;
1634	written += written_buffered;
1635	iocb->ki_pos = pos + written_buffered;
1636	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1637				 endbyte >> PAGE_SHIFT);
1638out:
1639	return ret < 0 ? ret : written;
1640}
1641
1642static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1643			const struct btrfs_ioctl_encoded_io_args *encoded)
1644{
1645	struct file *file = iocb->ki_filp;
1646	struct inode *inode = file_inode(file);
1647	loff_t count;
1648	ssize_t ret;
1649
1650	btrfs_inode_lock(BTRFS_I(inode), 0);
1651	count = encoded->len;
1652	ret = generic_write_checks_count(iocb, &count);
1653	if (ret == 0 && count != encoded->len) {
1654		/*
1655		 * The write got truncated by generic_write_checks_count(). We
1656		 * can't do a partial encoded write.
1657		 */
1658		ret = -EFBIG;
1659	}
1660	if (ret || encoded->len == 0)
1661		goto out;
1662
1663	ret = btrfs_write_check(iocb, from, encoded->len);
1664	if (ret < 0)
1665		goto out;
1666
1667	ret = btrfs_do_encoded_write(iocb, from, encoded);
1668out:
1669	btrfs_inode_unlock(BTRFS_I(inode), 0);
1670	return ret;
1671}
1672
1673ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1674			    const struct btrfs_ioctl_encoded_io_args *encoded)
1675{
1676	struct file *file = iocb->ki_filp;
1677	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1678	ssize_t num_written, num_sync;
1679
	/*
	 * If the fs flips read-only due to some otherwise impossible error,
	 * then even though we have opened the file as writable, we have to
	 * stop this write operation to ensure consistency.
	 */
1685	if (BTRFS_FS_ERROR(inode->root->fs_info))
1686		return -EROFS;
1687
1688	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1689		return -EOPNOTSUPP;
1690
1691	if (encoded) {
1692		num_written = btrfs_encoded_write(iocb, from, encoded);
1693		num_sync = encoded->len;
1694	} else if (iocb->ki_flags & IOCB_DIRECT) {
1695		num_written = btrfs_direct_write(iocb, from);
1696		num_sync = num_written;
1697	} else {
1698		num_written = btrfs_buffered_write(iocb, from);
1699		num_sync = num_written;
1700	}
1701
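	/*
	 * Record the root's current log transid as the inode's last_sub_trans;
	 * a later fsync uses this (via btrfs_inode_in_log(), see
	 * skip_inode_logging()) to tell whether the inode still has changes
	 * that are not in the log.
	 */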
1702	btrfs_set_inode_last_sub_trans(inode);
1703
1704	if (num_sync > 0) {
1705		num_sync = generic_write_sync(iocb, num_sync);
1706		if (num_sync < 0)
1707			num_written = num_sync;
1708	}
1709
1710	return num_written;
1711}
1712
1713static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1714{
1715	return btrfs_do_write_iter(iocb, from, NULL);
1716}
1717
1718int btrfs_release_file(struct inode *inode, struct file *filp)
1719{
1720	struct btrfs_file_private *private = filp->private_data;
1721
1722	if (private) {
1723		kfree(private->filldir_buf);
1724		free_extent_state(private->llseek_cached_state);
1725		kfree(private);
1726		filp->private_data = NULL;
1727	}
1728
1729	/*
1730	 * Set by setattr when we are about to truncate a file from a non-zero
1731	 * size to a zero size.  This tries to flush down new bytes that may
1732	 * have been written if the application were using truncate to replace
1733	 * a file in place.
1734	 */
1735	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1736			       &BTRFS_I(inode)->runtime_flags))
1737			filemap_flush(inode->i_mapping);
1738	return 0;
1739}
1740
1741static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1742{
1743	int ret;
1744	struct blk_plug plug;
1745
	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible.  Especially in
	 * the case of multiple disks using a RAID profile, a large IO can be
	 * split into several segments of stripe length (currently 64K).
	 */
1752	blk_start_plug(&plug);
1753	ret = btrfs_fdatawrite_range(inode, start, end);
1754	blk_finish_plug(&plug);
1755
1756	return ret;
1757}
1758
1759static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1760{
1761	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
1762	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1763
1764	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1765	    list_empty(&ctx->ordered_extents))
1766		return true;
1767
1768	/*
1769	 * If we are doing a fast fsync we can not bail out if the inode's
	 * last_trans is <= the last committed transaction, because we only
1771	 * update the last_trans of the inode during ordered extent completion,
1772	 * and for a fast fsync we don't wait for that, we only wait for the
1773	 * writeback to complete.
1774	 */
1775	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1776	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1777	     list_empty(&ctx->ordered_extents)))
1778		return true;
1779
1780	return false;
1781}
1782
1783/*
1784 * fsync call for both files and directories.  This logs the inode into
1785 * the tree log instead of forcing full commits whenever possible.
1786 *
 * It needs to call filemap_fdatawait() so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
1789 *
1790 * It drops the inode mutex before doing the tree log commit.  This is an
1791 * important optimization for directories because holding the mutex prevents
1792 * new operations on the dir while we write to disk.
1793 */
1794int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1795{
1796	struct dentry *dentry = file_dentry(file);
1797	struct inode *inode = d_inode(dentry);
1798	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1799	struct btrfs_root *root = BTRFS_I(inode)->root;
1800	struct btrfs_trans_handle *trans;
1801	struct btrfs_log_ctx ctx;
1802	int ret = 0, err;
1803	u64 len;
1804	bool full_sync;
1805
1806	trace_btrfs_sync_file(file, datasync);
1807
1808	btrfs_init_log_ctx(&ctx, inode);
1809
1810	/*
1811	 * Always set the range to a full range, otherwise we can get into
1812	 * several problems, from missing file extent items to represent holes
1813	 * when not using the NO_HOLES feature, to log tree corruption due to
1814	 * races between hole detection during logging and completion of ordered
1815	 * extents outside the range, to missing checksums due to ordered extents
1816	 * for which we flushed only a subset of their pages.
1817	 */
1818	start = 0;
1819	end = LLONG_MAX;
1820	len = (u64)LLONG_MAX + 1;
1821
	/*
	 * We write the dirty pages in the range and wait for the writeback to
	 * complete outside of the ->i_mutex, so that multiple tasks can flush
	 * dirty pages concurrently and improve performance.  See
	 * btrfs_wait_ordered_range() for an explanation of the ASYNC check.
	 */
1828	ret = start_ordered_ops(inode, start, end);
1829	if (ret)
1830		goto out;
1831
1832	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1833
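	/*
	 * log_batch is bumped here and again after the writeback below, so
	 * that the tree-log commit code can notice concurrent fsyncs and batch
	 * them into a single log commit.
	 */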
1834	atomic_inc(&root->log_batch);
1835
1836	/*
1837	 * Before we acquired the inode's lock and the mmap lock, someone may
1838	 * have dirtied more pages in the target range. We need to make sure
1839	 * that writeback for any such pages does not start while we are logging
1840	 * the inode, because if it does, any of the following might happen when
1841	 * we are not doing a full inode sync:
1842	 *
1843	 * 1) We log an extent after its writeback finishes but before its
1844	 *    checksums are added to the csum tree, leading to -EIO errors
1845	 *    when attempting to read the extent after a log replay.
1846	 *
1847	 * 2) We can end up logging an extent before its writeback finishes.
1848	 *    Therefore after the log replay we will have a file extent item
1849	 *    pointing to an unwritten extent (and no data checksums as well).
1850	 *
1851	 * So trigger writeback for any eventual new dirty pages and then we
1852	 * wait for all ordered extents to complete below.
1853	 */
1854	ret = start_ordered_ops(inode, start, end);
1855	if (ret) {
1856		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1857		goto out;
1858	}
1859
1860	/*
1861	 * Always check for the full sync flag while holding the inode's lock,
1862	 * to avoid races with other tasks. The flag must be either set during
1863	 * the whole time we are logging or not set at all during that time.
1864	 * We check the flag here after starting delalloc above, because when
1865	 * running delalloc the full sync flag may be set if we need to drop
1866	 * extra extent map ranges due to temporary memory allocation failures.
1867	 */
1868	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1869			     &BTRFS_I(inode)->runtime_flags);
1870
1871	/*
1872	 * We have to do this here to avoid the priority inversion of waiting on
1873	 * IO of a lower priority task while holding a transaction open.
1874	 *
1875	 * For a full fsync we wait for the ordered extents to complete while
1876	 * for a fast fsync we wait just for writeback to complete, and then
1877	 * attach the ordered extents to the transaction so that a transaction
1878	 * commit waits for their completion, to avoid data loss if we fsync,
1879	 * the current transaction commits before the ordered extents complete
1880	 * and a power failure happens right after that.
1881	 *
1882	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1883	 * logical address recorded in the ordered extent may change. We need
1884	 * to wait for the IO to stabilize the logical address.
1885	 */
1886	if (full_sync || btrfs_is_zoned(fs_info)) {
1887		ret = btrfs_wait_ordered_range(inode, start, len);
1888	} else {
1889		/*
1890		 * Get our ordered extents as soon as possible to avoid doing
1891		 * checksum lookups in the csum tree, and use instead the
1892		 * checksums attached to the ordered extents.
1893		 */
1894		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
1895						      &ctx.ordered_extents);
1896		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
1897	}
1898
1899	if (ret)
1900		goto out_release_extents;
1901
1902	atomic_inc(&root->log_batch);
1903
1904	if (skip_inode_logging(&ctx)) {
1905		/*
1906		 * We've had everything committed since the last time we were
1907		 * modified, so clear this flag in case it was set for whatever
1908		 * reason; it's no longer relevant.
1909		 */
1910		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1911			  &BTRFS_I(inode)->runtime_flags);
1912		/*
1913		 * An ordered extent might have started before and completed
1914		 * already with io errors, in which case the inode was not
1915		 * updated and we end up here. So check the inode's mapping
1916		 * for any errors that might have happened since the last
1917		 * time we called fsync.
1918		 */
1919		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
1920		goto out_release_extents;
1921	}
1922
1923	btrfs_init_log_ctx_scratch_eb(&ctx);
1924
1925	/*
1926	 * We use start here because we will need to wait on the IO to complete
1927	 * in btrfs_sync_log, which could require joining a transaction (for
1928	 * example checking cross references in the nocow path).  If we use join
1929	 * here we could get into a situation where we're waiting on IO to
1930	 * happen that is blocked on a transaction trying to commit.  With start
1931	 * we inc the extwriter counter, so we wait for all extwriters to exit
1932	 * before we start blocking joiners.  This comment is to keep somebody
1933	 * from thinking they are super smart and changing this to
1934	 * btrfs_join_transaction *cough*Josef*cough*.
1935	 */
1936	trans = btrfs_start_transaction(root, 0);
1937	if (IS_ERR(trans)) {
1938		ret = PTR_ERR(trans);
1939		goto out_release_extents;
1940	}
1941	trans->in_fsync = true;
1942
1943	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1944	/*
1945	 * The scratch eb is no longer needed, release it before syncing the
1946	 * log or committing the transaction, to avoid holding unnecessary
1947	 * memory during such long operations.
1948	 */
1949	if (ctx.scratch_eb) {
1950		free_extent_buffer(ctx.scratch_eb);
1951		ctx.scratch_eb = NULL;
1952	}
1953	btrfs_release_log_ctx_extents(&ctx);
1954	if (ret < 0) {
1955		/* Fallthrough and commit/free transaction. */
1956		ret = BTRFS_LOG_FORCE_COMMIT;
1957	}
1958
1959	/* we've logged all the items and now have a consistent
1960	 * version of the file in the log.  It is possible that
1961	 * someone will come in and modify the file, but that's
1962	 * fine because the log is consistent on disk, and we
1963	 * have references to all of the file's extents
1964	 *
1965	 * It is possible that someone will come in and log the
1966	 * file again, but that will end up using the synchronization
1967	 * inside btrfs_sync_log to keep things safe.
1968	 */
1969	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1970
1971	if (ret == BTRFS_NO_LOG_SYNC) {
1972		ret = btrfs_end_transaction(trans);
1973		goto out;
1974	}
1975
1976	/* We successfully logged the inode, attempt to sync the log. */
1977	if (!ret) {
1978		ret = btrfs_sync_log(trans, root, &ctx);
1979		if (!ret) {
1980			ret = btrfs_end_transaction(trans);
1981			goto out;
1982		}
1983	}
1984
1985	/*
1986	 * At this point we need to commit the transaction because we had
1987	 * btrfs_need_log_full_commit() or some other error.
1988	 *
1989	 * If we didn't do a full sync we have to stop the trans handle, wait on
1990	 * the ordered extents, start it again and commit the transaction.  If
1991	 * we attempt to wait on the ordered extents here we could deadlock with
1992	 * something like fallocate() that is holding the extent lock trying to
1993	 * start a transaction while some other thread is trying to commit the
1994	 * transaction while we (fsync) are currently holding the transaction
1995	 * open.
1996	 */
1997	if (!full_sync) {
1998		ret = btrfs_end_transaction(trans);
1999		if (ret)
2000			goto out;
2001		ret = btrfs_wait_ordered_range(inode, start, len);
2002		if (ret)
2003			goto out;
2004
2005		/*
2006		 * This is safe to use here because we're only interested in
2007		 * making sure the transaction that had the ordered extents is
2008		 * committed.  We aren't waiting on anything past this point,
2009		 * we're purely getting the transaction and committing it.
2010		 */
2011		trans = btrfs_attach_transaction_barrier(root);
2012		if (IS_ERR(trans)) {
2013			ret = PTR_ERR(trans);
2014
2015			/*
2016			 * We committed the transaction and there's no currently
2017			 * running transaction, which means everything we care
2018			 * about made it to disk and we are done.
2019			 */
2020			if (ret == -ENOENT)
2021				ret = 0;
2022			goto out;
2023		}
2024	}
2025
2026	ret = btrfs_commit_transaction(trans);
2027out:
2028	free_extent_buffer(ctx.scratch_eb);
2029	ASSERT(list_empty(&ctx.list));
2030	ASSERT(list_empty(&ctx.conflict_inodes));
2031	err = file_check_and_advance_wb_err(file);
2032	if (!ret)
2033		ret = err;
2034	return ret > 0 ? -EIO : ret;
2035
2036out_release_extents:
2037	btrfs_release_log_ctx_extents(&ctx);
2038	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2039	goto out;
2040}
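
/*
 * A rough sketch of the fsync flow above (simplified, error handling and the
 * zoned special cases omitted), to make the ordering of the steps easier to
 * follow:
 *
 *	start_ordered_ops()                    flush delalloc, outside the lock
 *	btrfs_inode_lock(BTRFS_ILOCK_MMAP)
 *	start_ordered_ops()                    catch pages dirtied meanwhile
 *	wait for ordered extents (full sync)
 *	  or for writeback only (fast sync)
 *	if (skip_inode_logging(&ctx))          nothing new since the last sync,
 *		return mapping error, if any   so don't touch the log
 *	btrfs_start_transaction()
 *	btrfs_log_dentry_safe()                copy items into the log tree
 *	btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
 *	btrfs_sync_log()                       or, on failure/full commit,
 *	                                       btrfs_commit_transaction()
 */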
2041
2042/*
2043 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
2044 * called from a page fault handler when a page is first dirtied. Hence we must
2045 * be careful to check for EOF conditions here. We set the page up correctly
2046 * for a written page which means we get ENOSPC checking when writing into
2047 * holes and correct delalloc and unwritten extent mapping on filesystems that
2048 * support these features.
2049 *
2050 * We are not allowed to take the i_mutex here so we have to play games to
2051 * protect against truncate races as the page could now be beyond EOF.  Because
2052 * truncate_setsize() writes the inode size before removing pages, once we have
2053 * the page lock we can determine safely if the page is beyond EOF. If it is not
2054 * beyond EOF, then the page is guaranteed safe against truncation until we
2055 * unlock the page.
2056 */
2057static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
2058{
2059	struct page *page = vmf->page;
2060	struct folio *folio = page_folio(page);
2061	struct inode *inode = file_inode(vmf->vma->vm_file);
2062	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2063	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2064	struct btrfs_ordered_extent *ordered;
2065	struct extent_state *cached_state = NULL;
2066	struct extent_changeset *data_reserved = NULL;
2067	unsigned long zero_start;
2068	loff_t size;
2069	vm_fault_t ret;
2070	int ret2;
2071	int reserved = 0;
2072	u64 reserved_space;
2073	u64 page_start;
2074	u64 page_end;
2075	u64 end;
2076
2077	ASSERT(folio_order(folio) == 0);
2078
2079	reserved_space = PAGE_SIZE;
2080
2081	sb_start_pagefault(inode->i_sb);
2082	page_start = page_offset(page);
2083	page_end = page_start + PAGE_SIZE - 1;
2084	end = page_end;
2085
2086	/*
2087	 * Reserving delalloc space after obtaining the page lock can lead to
2088	 * deadlock. For example, if a dirty page is locked by this function
2089	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
2090	 * dirty page write out, then the btrfs_writepages() function could
2091	 * end up waiting indefinitely to get a lock on the page currently
2092	 * being processed by btrfs_page_mkwrite() function.
2093	 */
2094	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
2095					    page_start, reserved_space);
2096	if (!ret2) {
2097		ret2 = file_update_time(vmf->vma->vm_file);
2098		reserved = 1;
2099	}
2100	if (ret2) {
2101		ret = vmf_error(ret2);
2102		if (reserved)
2103			goto out;
2104		goto out_noreserve;
2105	}
2106
2107	/* Make the VM retry the fault. */
2108	ret = VM_FAULT_NOPAGE;
2109again:
2110	down_read(&BTRFS_I(inode)->i_mmap_lock);
2111	lock_page(page);
2112	size = i_size_read(inode);
2113
2114	if ((page->mapping != inode->i_mapping) ||
2115	    (page_start >= size)) {
2116		/* Page got truncated out from underneath us. */
2117		goto out_unlock;
2118	}
2119	wait_on_page_writeback(page);
2120
2121	lock_extent(io_tree, page_start, page_end, &cached_state);
2122	ret2 = set_page_extent_mapped(page);
2123	if (ret2 < 0) {
2124		ret = vmf_error(ret2);
2125		unlock_extent(io_tree, page_start, page_end, &cached_state);
2126		goto out_unlock;
2127	}
2128
2129	/*
2130	 * We can't set the delalloc bits if there are pending ordered
2131	 * extents.  Drop our locks and wait for them to finish.
2132	 */
2133	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
2134	if (ordered) {
2135		unlock_extent(io_tree, page_start, page_end, &cached_state);
2136		unlock_page(page);
2137		up_read(&BTRFS_I(inode)->i_mmap_lock);
2138		btrfs_start_ordered_extent(ordered);
2139		btrfs_put_ordered_extent(ordered);
2140		goto again;
2141	}
2142
2143	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
2144		reserved_space = round_up(size - page_start, fs_info->sectorsize);
2145		if (reserved_space < PAGE_SIZE) {
2146			end = page_start + reserved_space - 1;
2147			btrfs_delalloc_release_space(BTRFS_I(inode),
2148					data_reserved, page_start,
2149					PAGE_SIZE - reserved_space, true);
2150		}
2151	}
2152
2153	/*
2154	 * page_mkwrite gets called when the page is first dirtied after it's
2155	 * faulted in, but write(2) can also dirty a page and set delalloc
2156	 * bits. Thus, for space accounting reasons, we still need to clear
2157	 * any delalloc bits within this page range, since we had to reserve
2158	 * data and metadata space before lock_page() (see the comments above).
2159	 */
2160	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
2161			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
2162			  EXTENT_DEFRAG, &cached_state);
2163
2164	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
2165					&cached_state);
2166	if (ret2) {
2167		unlock_extent(io_tree, page_start, page_end, &cached_state);
2168		ret = VM_FAULT_SIGBUS;
2169		goto out_unlock;
2170	}
2171
2172	/* Page is wholly or partially inside EOF. */
2173	if (page_start + PAGE_SIZE > size)
2174		zero_start = offset_in_page(size);
2175	else
2176		zero_start = PAGE_SIZE;
2177
2178	if (zero_start != PAGE_SIZE)
2179		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
2180
2181	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
2182	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
2183	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
2184
2185	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
2186
2187	unlock_extent(io_tree, page_start, page_end, &cached_state);
2188	up_read(&BTRFS_I(inode)->i_mmap_lock);
2189
2190	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2191	sb_end_pagefault(inode->i_sb);
2192	extent_changeset_free(data_reserved);
2193	return VM_FAULT_LOCKED;
2194
2195out_unlock:
2196	unlock_page(page);
2197	up_read(&BTRFS_I(inode)->i_mmap_lock);
2198out:
2199	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2200	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
2201				     reserved_space, (ret != 0));
2202out_noreserve:
2203	sb_end_pagefault(inode->i_sb);
2204	extent_changeset_free(data_reserved);
2205	return ret;
2206}
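
/*
 * Minimal sketch of the sequence implemented above (simplified, no error
 * handling, single page): reserve data and metadata space for one page first,
 * then dirty it while holding the mmap lock, the page lock and the extent
 * range lock:
 *
 *	btrfs_delalloc_reserve_space(inode, &reserved, page_start, PAGE_SIZE);
 *	lock_page(page);
 *	lock_extent(io_tree, page_start, page_end, &cached);
 *	... wait for any ordered extent in the range and retry ...
 *	btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached);
 *	btrfs_folio_set_dirty(...);  btrfs_folio_set_uptodate(...);
 *	unlock_extent(io_tree, page_start, page_end, &cached);
 *	return VM_FAULT_LOCKED;        the page stays locked for the VM
 */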
2207
2208static const struct vm_operations_struct btrfs_file_vm_ops = {
2209	.fault		= filemap_fault,
2210	.map_pages	= filemap_map_pages,
2211	.page_mkwrite	= btrfs_page_mkwrite,
2212};
2213
2214static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
2215{
2216	struct address_space *mapping = filp->f_mapping;
2217
2218	if (!mapping->a_ops->read_folio)
2219		return -ENOEXEC;
2220
2221	file_accessed(filp);
2222	vma->vm_ops = &btrfs_file_vm_ops;
2223
2224	return 0;
2225}
2226
2227static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2228			  int slot, u64 start, u64 end)
2229{
2230	struct btrfs_file_extent_item *fi;
2231	struct btrfs_key key;
2232
2233	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2234		return 0;
2235
2236	btrfs_item_key_to_cpu(leaf, &key, slot);
2237	if (key.objectid != btrfs_ino(inode) ||
2238	    key.type != BTRFS_EXTENT_DATA_KEY)
2239		return 0;
2240
2241	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2242
2243	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2244		return 0;
2245
2246	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2247		return 0;
2248
2249	if (key.offset == end)
2250		return 1;
2251	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2252		return 1;
2253	return 0;
2254}
2255
2256static int fill_holes(struct btrfs_trans_handle *trans,
2257		struct btrfs_inode *inode,
2258		struct btrfs_path *path, u64 offset, u64 end)
2259{
2260	struct btrfs_fs_info *fs_info = trans->fs_info;
2261	struct btrfs_root *root = inode->root;
2262	struct extent_buffer *leaf;
2263	struct btrfs_file_extent_item *fi;
2264	struct extent_map *hole_em;
2265	struct btrfs_key key;
2266	int ret;
2267
2268	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2269		goto out;
2270
2271	key.objectid = btrfs_ino(inode);
2272	key.type = BTRFS_EXTENT_DATA_KEY;
2273	key.offset = offset;
2274
2275	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2276	if (ret <= 0) {
2277		/*
2278		 * We should have dropped this offset, so if we find it then
2279		 * something has gone horribly wrong.
2280		 */
2281		if (ret == 0)
2282			ret = -EINVAL;
2283		return ret;
2284	}
2285
2286	leaf = path->nodes[0];
2287	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2288		u64 num_bytes;
2289
2290		path->slots[0]--;
2291		fi = btrfs_item_ptr(leaf, path->slots[0],
2292				    struct btrfs_file_extent_item);
2293		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2294			end - offset;
2295		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2296		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2297		btrfs_set_file_extent_offset(leaf, fi, 0);
2298		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2299		btrfs_mark_buffer_dirty(trans, leaf);
2300		goto out;
2301	}
2302
2303	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2304		u64 num_bytes;
2305
2306		key.offset = offset;
2307		btrfs_set_item_key_safe(trans, path, &key);
2308		fi = btrfs_item_ptr(leaf, path->slots[0],
2309				    struct btrfs_file_extent_item);
2310		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2311			offset;
2312		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2313		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2314		btrfs_set_file_extent_offset(leaf, fi, 0);
2315		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2316		btrfs_mark_buffer_dirty(trans, leaf);
2317		goto out;
2318	}
2319	btrfs_release_path(path);
2320
2321	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2322				       end - offset);
2323	if (ret)
2324		return ret;
2325
2326out:
2327	btrfs_release_path(path);
2328
2329	hole_em = alloc_extent_map();
2330	if (!hole_em) {
2331		btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2332		btrfs_set_inode_full_sync(inode);
2333	} else {
2334		hole_em->start = offset;
2335		hole_em->len = end - offset;
2336		hole_em->ram_bytes = hole_em->len;
2337		hole_em->orig_start = offset;
2338
2339		hole_em->block_start = EXTENT_MAP_HOLE;
2340		hole_em->block_len = 0;
2341		hole_em->orig_block_len = 0;
2342		hole_em->generation = trans->transid;
2343
2344		ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2345		free_extent_map(hole_em);
2346		if (ret)
2347			btrfs_set_inode_full_sync(inode);
2348	}
2349
2350	return 0;
2351}
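
/*
 * Example of what fill_holes() does (made-up offsets): when NO_HOLES is not
 * enabled and a hole file extent item already covers [0, 64K), punching the
 * adjacent range [64K, 128K) does not insert a second hole item, it extends
 * the existing one to cover [0, 128K) and bumps its generation.  Only when
 * there is no mergeable neighbour is a new hole item inserted via
 * btrfs_insert_hole_extent().  In all cases (including NO_HOLES, where no
 * items are needed) a hole extent map is inserted so the fast fsync path
 * knows about the hole.
 */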
2352
2353/*
2354 * Find a hole extent on the given inode and change start/len to the end of
2355 * the hole extent (a hole/vacuum extent whose em->start <= start &&
2356 * em->start + em->len > start).
2357 * When a hole extent is found, return 1 and modify start/len.
2358 */
2359static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2360{
2361	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2362	struct extent_map *em;
2363	int ret = 0;
2364
2365	em = btrfs_get_extent(inode, NULL,
2366			      round_down(*start, fs_info->sectorsize),
2367			      round_up(*len, fs_info->sectorsize));
2368	if (IS_ERR(em))
2369		return PTR_ERR(em);
2370
2371	/* Hole or vacuum extent (only exists in no-holes mode) */
2372	if (em->block_start == EXTENT_MAP_HOLE) {
2373		ret = 1;
2374		*len = em->start + em->len > *start + *len ?
2375		       0 : *start + *len - em->start - em->len;
2376		*start = em->start + em->len;
2377	}
2378	free_extent_map(em);
2379	return ret;
2380}
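
/*
 * Worked example for find_first_non_hole(), with made-up numbers and a 4K
 * sector size: given an extent map for a hole covering [0, 16K) and a caller
 * passing *start = 4K, *len = 32K, the function returns 1 and adjusts the
 * range to *start = 16K, *len = 20K (the part of the original range that lies
 * past the hole).  If the hole extended beyond *start + *len, *len would
 * become 0.
 */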
2381
2382static void btrfs_punch_hole_lock_range(struct inode *inode,
2383					const u64 lockstart,
2384					const u64 lockend,
2385					struct extent_state **cached_state)
2386{
2387	/*
2388	 * For the subpage case, if the range is not at a page boundary, we
2389	 * could have pages at the leading/trailing part of the range.
2390	 * This could lead to an endless loop since filemap_range_has_page()
2391	 * will always return true.
2392	 * So here we need to do extra page alignment for
2393	 * filemap_range_has_page().
2394	 */
2395	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2396	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2397
2398	while (1) {
2399		truncate_pagecache_range(inode, lockstart, lockend);
2400
2401		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2402			    cached_state);
2403		/*
2404		 * We can't have ordered extents in the range, nor dirty/writeback
2405		 * pages, because we have locked the inode's VFS lock in exclusive
2406		 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2407		 * we have flushed all delalloc in the range and we have waited
2408		 * for any ordered extents in the range to complete.
2409		 * We can race with anyone reading pages from this range, so after
2410		 * locking the range check if we have pages in the range, and if
2411		 * we do, unlock the range and retry.
2412		 */
2413		if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2414					    page_lockend))
2415			break;
2416
2417		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2418			      cached_state);
2419	}
2420
2421	btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2422}
2423
2424static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2425				     struct btrfs_inode *inode,
2426				     struct btrfs_path *path,
2427				     struct btrfs_replace_extent_info *extent_info,
2428				     const u64 replace_len,
2429				     const u64 bytes_to_drop)
2430{
2431	struct btrfs_fs_info *fs_info = trans->fs_info;
2432	struct btrfs_root *root = inode->root;
2433	struct btrfs_file_extent_item *extent;
2434	struct extent_buffer *leaf;
2435	struct btrfs_key key;
2436	int slot;
2437	int ret;
2438
2439	if (replace_len == 0)
2440		return 0;
2441
2442	if (extent_info->disk_offset == 0 &&
2443	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
2444		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2445		return 0;
2446	}
2447
2448	key.objectid = btrfs_ino(inode);
2449	key.type = BTRFS_EXTENT_DATA_KEY;
2450	key.offset = extent_info->file_offset;
2451	ret = btrfs_insert_empty_item(trans, root, path, &key,
2452				      sizeof(struct btrfs_file_extent_item));
2453	if (ret)
2454		return ret;
2455	leaf = path->nodes[0];
2456	slot = path->slots[0];
2457	write_extent_buffer(leaf, extent_info->extent_buf,
2458			    btrfs_item_ptr_offset(leaf, slot),
2459			    sizeof(struct btrfs_file_extent_item));
2460	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2461	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2462	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2463	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2464	if (extent_info->is_new_extent)
2465		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2466	btrfs_mark_buffer_dirty(trans, leaf);
2467	btrfs_release_path(path);
2468
2469	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2470						replace_len);
2471	if (ret)
2472		return ret;
2473
2474	/* If it's a hole, nothing more needs to be done. */
2475	if (extent_info->disk_offset == 0) {
2476		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2477		return 0;
2478	}
2479
2480	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2481
2482	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2483		key.objectid = extent_info->disk_offset;
2484		key.type = BTRFS_EXTENT_ITEM_KEY;
2485		key.offset = extent_info->disk_len;
2486		ret = btrfs_alloc_reserved_file_extent(trans, root,
2487						       btrfs_ino(inode),
2488						       extent_info->file_offset,
2489						       extent_info->qgroup_reserved,
2490						       &key);
2491	} else {
2492		struct btrfs_ref ref = {
2493			.action = BTRFS_ADD_DELAYED_REF,
2494			.bytenr = extent_info->disk_offset,
2495			.num_bytes = extent_info->disk_len,
2496			.owning_root = btrfs_root_id(root),
2497			.ref_root = btrfs_root_id(root),
2498		};
2499		u64 ref_offset;
2500
2501		ref_offset = extent_info->file_offset - extent_info->data_offset;
2502		btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2503		ret = btrfs_inc_extent_ref(trans, &ref);
2504	}
2505
2506	extent_info->insertions++;
2507
2508	return ret;
2509}
2510
2511/*
2512 * The respective range must have been previously locked, as well as the inode.
2513 * The end offset is inclusive (last byte of the range).
2514 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2515 * the file range with an extent.
2516 * When not punching a hole, we don't want to end up in a state where we dropped
2517 * extents without inserting a new one, so we must abort the transaction to avoid
2518 * a corruption.
2519 */
2520int btrfs_replace_file_extents(struct btrfs_inode *inode,
2521			       struct btrfs_path *path, const u64 start,
2522			       const u64 end,
2523			       struct btrfs_replace_extent_info *extent_info,
2524			       struct btrfs_trans_handle **trans_out)
2525{
2526	struct btrfs_drop_extents_args drop_args = { 0 };
2527	struct btrfs_root *root = inode->root;
2528	struct btrfs_fs_info *fs_info = root->fs_info;
2529	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2530	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2531	struct btrfs_trans_handle *trans = NULL;
2532	struct btrfs_block_rsv *rsv;
2533	unsigned int rsv_count;
2534	u64 cur_offset;
2535	u64 len = end - start;
2536	int ret = 0;
2537
2538	if (end <= start)
2539		return -EINVAL;
2540
2541	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2542	if (!rsv) {
2543		ret = -ENOMEM;
2544		goto out;
2545	}
2546	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2547	rsv->failfast = true;
2548
2549	/*
2550	 * 1 - updating the inode
2551	 * 1 - removing the extents in the range
2552	 * 1 - adding the hole extent if no_holes isn't set or if we are
2553	 *     replacing the range with a new extent
2554	 */
2555	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2556		rsv_count = 3;
2557	else
2558		rsv_count = 2;
2559
2560	trans = btrfs_start_transaction(root, rsv_count);
2561	if (IS_ERR(trans)) {
2562		ret = PTR_ERR(trans);
2563		trans = NULL;
2564		goto out_free;
2565	}
2566
2567	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2568				      min_size, false);
2569	if (WARN_ON(ret))
2570		goto out_trans;
2571	trans->block_rsv = rsv;
2572
2573	cur_offset = start;
2574	drop_args.path = path;
2575	drop_args.end = end + 1;
2576	drop_args.drop_cache = true;
2577	while (cur_offset < end) {
2578		drop_args.start = cur_offset;
2579		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2580		/* If we are punching a hole decrement the inode's byte count */
2581		if (!extent_info)
2582			btrfs_update_inode_bytes(inode, 0,
2583						 drop_args.bytes_found);
2584		if (ret != -ENOSPC) {
2585			/*
2586			 * The only time we don't want to abort is if we are
2587			 * attempting to clone a partial inline extent, in which
2588			 * case we'll get EOPNOTSUPP.  However if we aren't
2589			 * cloning we need to abort no matter what, because if we
2590			 * got EOPNOTSUPP via prealloc then we messed up and
2591			 * need to abort.
2592			 */
2593			if (ret &&
2594			    (ret != -EOPNOTSUPP ||
2595			     (extent_info && extent_info->is_new_extent)))
2596				btrfs_abort_transaction(trans, ret);
2597			break;
2598		}
2599
2600		trans->block_rsv = &fs_info->trans_block_rsv;
2601
2602		if (!extent_info && cur_offset < drop_args.drop_end &&
2603		    cur_offset < ino_size) {
2604			ret = fill_holes(trans, inode, path, cur_offset,
2605					 drop_args.drop_end);
2606			if (ret) {
2607				/*
2608				 * If we failed then we didn't insert our hole
2609				 * entries for the area we dropped, so now the
2610				 * fs is corrupted and we must abort the
2611				 * transaction.
2612				 */
2613				btrfs_abort_transaction(trans, ret);
2614				break;
2615			}
2616		} else if (!extent_info && cur_offset < drop_args.drop_end) {
2617			/*
2618			 * We are past the i_size here, but since we didn't
2619			 * insert holes we need to clear the mapped area so we
2620			 * know to not set disk_i_size in this area until a new
2621			 * file extent is inserted here.
2622			 */
2623			ret = btrfs_inode_clear_file_extent_range(inode,
2624					cur_offset,
2625					drop_args.drop_end - cur_offset);
2626			if (ret) {
2627				/*
2628				 * We couldn't clear our area, so we could
2629				 * presumably adjust up and corrupt the fs, so
2630				 * we need to abort.
2631				 */
2632				btrfs_abort_transaction(trans, ret);
2633				break;
2634			}
2635		}
2636
2637		if (extent_info &&
2638		    drop_args.drop_end > extent_info->file_offset) {
2639			u64 replace_len = drop_args.drop_end -
2640					  extent_info->file_offset;
2641
2642			ret = btrfs_insert_replace_extent(trans, inode,	path,
2643					extent_info, replace_len,
2644					drop_args.bytes_found);
2645			if (ret) {
2646				btrfs_abort_transaction(trans, ret);
2647				break;
2648			}
2649			extent_info->data_len -= replace_len;
2650			extent_info->data_offset += replace_len;
2651			extent_info->file_offset += replace_len;
2652		}
2653
2654		/*
2655		 * We are releasing our handle on the transaction, balance the
2656		 * dirty pages of the btree inode and flush delayed items, and
2657		 * then get a new transaction handle, which may now point to a
2658		 * new transaction in case someone else has committed the
2659		 * transaction we used to replace/drop file extent items. So
2660		 * bump the inode's iversion and update mtime and ctime except
2661		 * if we are called from a dedupe context. This is because a
2662		 * power failure/crash may happen after the transaction is
2663		 * committed and before we finish replacing/dropping all the
2664		 * file extent items we need.
2665		 */
2666		inode_inc_iversion(&inode->vfs_inode);
2667
2668		if (!extent_info || extent_info->update_times)
2669			inode_set_mtime_to_ts(&inode->vfs_inode,
2670					      inode_set_ctime_current(&inode->vfs_inode));
2671
2672		ret = btrfs_update_inode(trans, inode);
2673		if (ret)
2674			break;
2675
2676		btrfs_end_transaction(trans);
2677		btrfs_btree_balance_dirty(fs_info);
2678
2679		trans = btrfs_start_transaction(root, rsv_count);
2680		if (IS_ERR(trans)) {
2681			ret = PTR_ERR(trans);
2682			trans = NULL;
2683			break;
2684		}
2685
2686		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2687					      rsv, min_size, false);
2688		if (WARN_ON(ret))
2689			break;
2690		trans->block_rsv = rsv;
2691
2692		cur_offset = drop_args.drop_end;
2693		len = end - cur_offset;
2694		if (!extent_info && len) {
2695			ret = find_first_non_hole(inode, &cur_offset, &len);
2696			if (unlikely(ret < 0))
2697				break;
2698			if (ret && !len) {
2699				ret = 0;
2700				break;
2701			}
2702		}
2703	}
2704
2705	/*
2706	 * If we were cloning, force the next fsync to be a full one since we
2707	 * replaced (or just dropped in the case of cloning holes when
2708	 * NO_HOLES is enabled) file extent items and did not set up new extent
2709	 * maps for the replacement extents (or holes).
2710	 */
2711	if (extent_info && !extent_info->is_new_extent)
2712		btrfs_set_inode_full_sync(inode);
2713
2714	if (ret)
2715		goto out_trans;
2716
2717	trans->block_rsv = &fs_info->trans_block_rsv;
2718	/*
2719	 * If we are using the NO_HOLES feature we might already have had a
2720	 * hole that overlaps a part of the region [lockstart, lockend] and
2721	 * ends at (or beyond) lockend. Since we have no file extent items to
2722	 * represent holes, drop_end can be less than lockend and so we must
2723	 * make sure we have an extent map representing the existing hole (the
2724	 * call to __btrfs_drop_extents() might have dropped the existing extent
2725	 * map representing the existing hole), otherwise the fast fsync path
2726	 * will not record the existence of the hole region
2727	 * [existing_hole_start, lockend].
2728	 */
2729	if (drop_args.drop_end <= end)
2730		drop_args.drop_end = end + 1;
2731	/*
2732	 * Don't insert a file hole extent item if it's for a range beyond EOF
2733	 * (because it's useless) or if it represents a zero-byte range (when
2734	 * cur_offset == drop_end).
2735	 */
2736	if (!extent_info && cur_offset < ino_size &&
2737	    cur_offset < drop_args.drop_end) {
2738		ret = fill_holes(trans, inode, path, cur_offset,
2739				 drop_args.drop_end);
2740		if (ret) {
2741			/* Same comment as above. */
2742			btrfs_abort_transaction(trans, ret);
2743			goto out_trans;
2744		}
2745	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2746		/* See the comment in the loop above for the reasoning here. */
2747		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2748					drop_args.drop_end - cur_offset);
2749		if (ret) {
2750			btrfs_abort_transaction(trans, ret);
2751			goto out_trans;
2752		}
2753
2754	}
2755	if (extent_info) {
2756		ret = btrfs_insert_replace_extent(trans, inode, path,
2757				extent_info, extent_info->data_len,
2758				drop_args.bytes_found);
2759		if (ret) {
2760			btrfs_abort_transaction(trans, ret);
2761			goto out_trans;
2762		}
2763	}
2764
2765out_trans:
2766	if (!trans)
2767		goto out_free;
2768
2769	trans->block_rsv = &fs_info->trans_block_rsv;
2770	if (ret)
2771		btrfs_end_transaction(trans);
2772	else
2773		*trans_out = trans;
2774out_free:
2775	btrfs_free_block_rsv(fs_info, rsv);
2776out:
2777	return ret;
2778}
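
/*
 * High level sketch of the main loop above (simplified, no error handling),
 * showing how the range is processed in chunks while cycling the transaction:
 *
 *	while (cur_offset < end) {
 *		btrfs_drop_extents()                  drops as much as the
 *		                                      reservation allows
 *		fill_holes() /                        hole punching without
 *		btrfs_inode_clear_file_extent_range()   NO_HOLES
 *		btrfs_insert_replace_extent()         clone/dedupe replacement
 *		btrfs_update_inode()
 *		btrfs_end_transaction()
 *		btrfs_start_transaction()             fresh handle and rsv for
 *		                                      the next chunk
 *	}
 *
 * Cycling the transaction on every iteration keeps each handle (and its block
 * reservation) short lived even when the range to drop is very large.
 */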
2779
2780static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2781{
2782	struct inode *inode = file_inode(file);
2783	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2784	struct btrfs_root *root = BTRFS_I(inode)->root;
2785	struct extent_state *cached_state = NULL;
2786	struct btrfs_path *path;
2787	struct btrfs_trans_handle *trans = NULL;
2788	u64 lockstart;
2789	u64 lockend;
2790	u64 tail_start;
2791	u64 tail_len;
2792	u64 orig_start = offset;
2793	int ret = 0;
2794	bool same_block;
2795	u64 ino_size;
2796	bool truncated_block = false;
2797	bool updated_inode = false;
2798
2799	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2800
2801	ret = btrfs_wait_ordered_range(inode, offset, len);
2802	if (ret)
2803		goto out_only_mutex;
2804
2805	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2806	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2807	if (ret < 0)
2808		goto out_only_mutex;
2809	if (ret && !len) {
2810		/* Already in a large hole */
2811		ret = 0;
2812		goto out_only_mutex;
2813	}
2814
2815	ret = file_modified(file);
2816	if (ret)
2817		goto out_only_mutex;
2818
2819	lockstart = round_up(offset, fs_info->sectorsize);
2820	lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2821	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2822		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2823	/*
2824	 * We needn't truncate any block which is beyond the end of the file
2825	 * because we are sure there is no data there.
2826	 */
2827	/*
2828	 * Only do this if we are in the same block and we aren't doing the
2829	 * entire block.
2830	 */
2831	if (same_block && len < fs_info->sectorsize) {
2832		if (offset < ino_size) {
2833			truncated_block = true;
2834			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2835						   0);
2836		} else {
2837			ret = 0;
2838		}
2839		goto out_only_mutex;
2840	}
2841
2842	/* zero back part of the first block */
2843	if (offset < ino_size) {
2844		truncated_block = true;
2845		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2846		if (ret) {
2847			btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2848			return ret;
2849		}
2850	}
2851
2852	/* Check the aligned pages after the first unaligned page.
2853	 * If offset != orig_start, the first unaligned page and several
2854	 * following pages are already inside holes, so the extra check
2855	 * can be skipped. */
2856	if (offset == orig_start) {
2857		/* after truncate page, check hole again */
2858		len = offset + len - lockstart;
2859		offset = lockstart;
2860		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2861		if (ret < 0)
2862			goto out_only_mutex;
2863		if (ret && !len) {
2864			ret = 0;
2865			goto out_only_mutex;
2866		}
2867		lockstart = offset;
2868	}
2869
2870	/* Check the tail unaligned part is in a hole */
2871	tail_start = lockend + 1;
2872	tail_len = offset + len - tail_start;
2873	if (tail_len) {
2874		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2875		if (unlikely(ret < 0))
2876			goto out_only_mutex;
2877		if (!ret) {
2878			/* zero the front end of the last page */
2879			if (tail_start + tail_len < ino_size) {
2880				truncated_block = true;
2881				ret = btrfs_truncate_block(BTRFS_I(inode),
2882							tail_start + tail_len,
2883							0, 1);
2884				if (ret)
2885					goto out_only_mutex;
2886			}
2887		}
2888	}
2889
2890	if (lockend < lockstart) {
2891		ret = 0;
2892		goto out_only_mutex;
2893	}
2894
2895	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2896
2897	path = btrfs_alloc_path();
2898	if (!path) {
2899		ret = -ENOMEM;
2900		goto out;
2901	}
2902
2903	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2904					 lockend, NULL, &trans);
2905	btrfs_free_path(path);
2906	if (ret)
2907		goto out;
2908
2909	ASSERT(trans != NULL);
2910	inode_inc_iversion(inode);
2911	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2912	ret = btrfs_update_inode(trans, BTRFS_I(inode));
2913	updated_inode = true;
2914	btrfs_end_transaction(trans);
2915	btrfs_btree_balance_dirty(fs_info);
2916out:
2917	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2918		      &cached_state);
2919out_only_mutex:
2920	if (!updated_inode && truncated_block && !ret) {
2921		/*
2922		 * If we only end up zeroing part of a page, we still need to
2923		 * update the inode item, so that all the time fields are
2924		 * updated, as well as the necessary in-memory btrfs inode fields
2925		 * for detecting, at fsync time, if the inode isn't yet in the
2926		 * log tree or it's there but not up to date.
2927		 */
2928		struct timespec64 now = inode_set_ctime_current(inode);
2929
2930		inode_inc_iversion(inode);
2931		inode_set_mtime_to_ts(inode, now);
2932		trans = btrfs_start_transaction(root, 1);
2933		if (IS_ERR(trans)) {
2934			ret = PTR_ERR(trans);
2935		} else {
2936			int ret2;
2937
2938			ret = btrfs_update_inode(trans, BTRFS_I(inode));
2939			ret2 = btrfs_end_transaction(trans);
2940			if (!ret)
2941				ret = ret2;
2942		}
2943	}
2944	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2945	return ret;
2946}
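
/*
 * Example of the alignment handling above, with made-up offsets, a 4K sector
 * size and a file larger than the punched range: punching [5K, 13K) zeroes
 * the tail of the first block ([5K, 8K)) and the head of the last block
 * ([12K, 13K)) with btrfs_truncate_block(), while only the fully aligned
 * middle part [8K, 12K) is handed to btrfs_replace_file_extents() to actually
 * drop the extents and insert a hole.
 */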
2947
2948/* Helper structure to record which range is already reserved */
2949struct falloc_range {
2950	struct list_head list;
2951	u64 start;
2952	u64 len;
2953};
2954
2955/*
2956 * Helper function to add falloc range
2957 *
2958 * The caller should have locked a larger extent range containing
2959 * [start, start + len)
2960 */
2961static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2962{
2963	struct falloc_range *range = NULL;
2964
2965	if (!list_empty(head)) {
2966		/*
2967		 * As fallocate iterates in increasing file offset order, we
2968		 * only need to check the last range.
2969		 */
2970		range = list_last_entry(head, struct falloc_range, list);
2971		if (range->start + range->len == start) {
2972			range->len += len;
2973			return 0;
2974		}
2975	}
2976
2977	range = kmalloc(sizeof(*range), GFP_KERNEL);
2978	if (!range)
2979		return -ENOMEM;
2980	range->start = start;
2981	range->len = len;
2982	list_add_tail(&range->list, head);
2983	return 0;
2984}
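
/*
 * Example for add_falloc_range(), with hypothetical offsets: adding the range
 * start=0/len=64K and then start=64K/len=32K results in a single list entry
 * covering start=0/len=96K, because the second range begins exactly where the
 * last recorded one ends.  A non-contiguous range such as start=256K/len=4K
 * gets its own entry.
 */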
2985
2986static int btrfs_fallocate_update_isize(struct inode *inode,
2987					const u64 end,
2988					const int mode)
2989{
2990	struct btrfs_trans_handle *trans;
2991	struct btrfs_root *root = BTRFS_I(inode)->root;
2992	int ret;
2993	int ret2;
2994
2995	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2996		return 0;
2997
2998	trans = btrfs_start_transaction(root, 1);
2999	if (IS_ERR(trans))
3000		return PTR_ERR(trans);
3001
3002	inode_set_ctime_current(inode);
3003	i_size_write(inode, end);
3004	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
3005	ret = btrfs_update_inode(trans, BTRFS_I(inode));
3006	ret2 = btrfs_end_transaction(trans);
3007
3008	return ret ? ret : ret2;
3009}
3010
3011enum {
3012	RANGE_BOUNDARY_WRITTEN_EXTENT,
3013	RANGE_BOUNDARY_PREALLOC_EXTENT,
3014	RANGE_BOUNDARY_HOLE,
3015};
3016
3017static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3018						 u64 offset)
3019{
3020	const u64 sectorsize = inode->root->fs_info->sectorsize;
3021	struct extent_map *em;
3022	int ret;
3023
3024	offset = round_down(offset, sectorsize);
3025	em = btrfs_get_extent(inode, NULL, offset, sectorsize);
3026	if (IS_ERR(em))
3027		return PTR_ERR(em);
3028
3029	if (em->block_start == EXTENT_MAP_HOLE)
3030		ret = RANGE_BOUNDARY_HOLE;
3031	else if (em->flags & EXTENT_FLAG_PREALLOC)
3032		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3033	else
3034		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3035
3036	free_extent_map(em);
3037	return ret;
3038}
3039
3040static int btrfs_zero_range(struct inode *inode,
3041			    loff_t offset,
3042			    loff_t len,
3043			    const int mode)
3044{
3045	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3046	struct extent_map *em;
3047	struct extent_changeset *data_reserved = NULL;
3048	int ret;
3049	u64 alloc_hint = 0;
3050	const u64 sectorsize = fs_info->sectorsize;
3051	u64 alloc_start = round_down(offset, sectorsize);
3052	u64 alloc_end = round_up(offset + len, sectorsize);
3053	u64 bytes_to_reserve = 0;
3054	bool space_reserved = false;
3055
3056	em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
3057			      alloc_end - alloc_start);
3058	if (IS_ERR(em)) {
3059		ret = PTR_ERR(em);
3060		goto out;
3061	}
3062
3063	/*
3064	 * Avoid hole punching and extent allocation for some cases. More cases
3065	 * could be considered, but these are unlikely to be common and we keep
3066	 * things as simple as possible for now. Also, intentionally, if the target
3067	 * range contains one or more prealloc extents together with regular
3068	 * extents and holes, we drop all the existing extents and allocate a
3069	 * new prealloc extent, so that we get a larger contiguous disk extent.
3070	 */
3071	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
3072		const u64 em_end = em->start + em->len;
3073
3074		if (em_end >= offset + len) {
3075			/*
3076			 * The whole range is already a prealloc extent,
3077			 * do nothing except updating the inode's i_size if
3078			 * needed.
3079			 */
3080			free_extent_map(em);
3081			ret = btrfs_fallocate_update_isize(inode, offset + len,
3082							   mode);
3083			goto out;
3084		}
3085		/*
3086		 * Part of the range is already a prealloc extent, so operate
3087		 * only on the remaining part of the range.
3088		 */
3089		alloc_start = em_end;
3090		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3091		len = offset + len - alloc_start;
3092		offset = alloc_start;
3093		alloc_hint = em->block_start + em->len;
3094	}
3095	free_extent_map(em);
3096
3097	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3098	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3099		em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
3100		if (IS_ERR(em)) {
3101			ret = PTR_ERR(em);
3102			goto out;
3103		}
3104
3105		if (em->flags & EXTENT_FLAG_PREALLOC) {
3106			free_extent_map(em);
3107			ret = btrfs_fallocate_update_isize(inode, offset + len,
3108							   mode);
3109			goto out;
3110		}
3111		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3112			free_extent_map(em);
3113			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
3114						   0);
3115			if (!ret)
3116				ret = btrfs_fallocate_update_isize(inode,
3117								   offset + len,
3118								   mode);
3119			return ret;
3120		}
3121		free_extent_map(em);
3122		alloc_start = round_down(offset, sectorsize);
3123		alloc_end = alloc_start + sectorsize;
3124		goto reserve_space;
3125	}
3126
3127	alloc_start = round_up(offset, sectorsize);
3128	alloc_end = round_down(offset + len, sectorsize);
3129
3130	/*
3131	 * For unaligned ranges, check the pages at the boundaries: they might
3132	 * map to an extent, in which case we need to partially zero them, or
3133	 * they might map to a hole, in which case we need our allocation range
3134	 * to cover them.
3135	 */
3136	if (!IS_ALIGNED(offset, sectorsize)) {
3137		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3138							    offset);
3139		if (ret < 0)
3140			goto out;
3141		if (ret == RANGE_BOUNDARY_HOLE) {
3142			alloc_start = round_down(offset, sectorsize);
3143			ret = 0;
3144		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3145			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
3146			if (ret)
3147				goto out;
3148		} else {
3149			ret = 0;
3150		}
3151	}
3152
3153	if (!IS_ALIGNED(offset + len, sectorsize)) {
3154		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3155							    offset + len);
3156		if (ret < 0)
3157			goto out;
3158		if (ret == RANGE_BOUNDARY_HOLE) {
3159			alloc_end = round_up(offset + len, sectorsize);
3160			ret = 0;
3161		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3162			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
3163						   0, 1);
3164			if (ret)
3165				goto out;
3166		} else {
3167			ret = 0;
3168		}
3169	}
3170
3171reserve_space:
3172	if (alloc_start < alloc_end) {
3173		struct extent_state *cached_state = NULL;
3174		const u64 lockstart = alloc_start;
3175		const u64 lockend = alloc_end - 1;
3176
3177		bytes_to_reserve = alloc_end - alloc_start;
3178		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3179						      bytes_to_reserve);
3180		if (ret < 0)
3181			goto out;
3182		space_reserved = true;
3183		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3184					    &cached_state);
3185		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3186						alloc_start, bytes_to_reserve);
3187		if (ret) {
3188			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
3189				      lockend, &cached_state);
3190			goto out;
3191		}
3192		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3193						alloc_end - alloc_start,
3194						fs_info->sectorsize,
3195						offset + len, &alloc_hint);
3196		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3197			      &cached_state);
3198		/* btrfs_prealloc_file_range releases reserved space on error */
3199		if (ret) {
3200			space_reserved = false;
3201			goto out;
3202		}
3203	}
3204	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3205 out:
3206	if (ret && space_reserved)
3207		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3208					       alloc_start, bytes_to_reserve);
3209	extent_changeset_free(data_reserved);
3210
3211	return ret;
3212}
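
/*
 * Sketch of the decisions made above (simplified): if the whole range is
 * already covered by a prealloc extent, only i_size may need updating; if the
 * range fits inside a single block backed by a written extent, the block is
 * zeroed in place with btrfs_truncate_block(); otherwise the unaligned
 * head/tail blocks are either zeroed (written extents) or pulled into the
 * allocation (holes), and the remaining aligned part is preallocated with
 * btrfs_prealloc_file_range() while holding the punch-hole style range lock.
 */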
3213
3214static long btrfs_fallocate(struct file *file, int mode,
3215			    loff_t offset, loff_t len)
3216{
3217	struct inode *inode = file_inode(file);
3218	struct extent_state *cached_state = NULL;
3219	struct extent_changeset *data_reserved = NULL;
3220	struct falloc_range *range;
3221	struct falloc_range *tmp;
3222	LIST_HEAD(reserve_list);
3223	u64 cur_offset;
3224	u64 last_byte;
3225	u64 alloc_start;
3226	u64 alloc_end;
3227	u64 alloc_hint = 0;
3228	u64 locked_end;
3229	u64 actual_end = 0;
3230	u64 data_space_needed = 0;
3231	u64 data_space_reserved = 0;
3232	u64 qgroup_reserved = 0;
3233	struct extent_map *em;
3234	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3235	int ret;
3236
3237	/* Do not allow fallocate in ZONED mode */
3238	if (btrfs_is_zoned(inode_to_fs_info(inode)))
3239		return -EOPNOTSUPP;
3240
3241	alloc_start = round_down(offset, blocksize);
3242	alloc_end = round_up(offset + len, blocksize);
3243	cur_offset = alloc_start;
3244
3245	/* Make sure we aren't being given some crap mode */
3246	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3247		     FALLOC_FL_ZERO_RANGE))
3248		return -EOPNOTSUPP;
3249
3250	if (mode & FALLOC_FL_PUNCH_HOLE)
3251		return btrfs_punch_hole(file, offset, len);
3252
3253	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3254
3255	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3256		ret = inode_newsize_ok(inode, offset + len);
3257		if (ret)
3258			goto out;
3259	}
3260
3261	ret = file_modified(file);
3262	if (ret)
3263		goto out;
3264
3265	/*
3266	 * TODO: Move these two operations after we have checked
3267	 * accurate reserved space, or fallocate can still fail but
3268	 * with the page truncated or the size expanded.
3269	 *
3270	 * But that's a minor problem and won't do much harm.
3271	 */
3272	if (alloc_start > inode->i_size) {
3273		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3274					alloc_start);
3275		if (ret)
3276			goto out;
3277	} else if (offset + len > inode->i_size) {
3278		/*
3279		 * If we are fallocating from the end of the file onward we
3280		 * need to zero out the end of the block if i_size lands in the
3281		 * middle of a block.
3282		 */
3283		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3284		if (ret)
3285			goto out;
3286	}
3287
3288	/*
3289	 * We have locked the inode at the VFS level (in exclusive mode) and we
3290	 * have locked the i_mmap_lock (in exclusive mode). Now, before
3291	 * locking the file range, flush all delalloc in the range and wait for
3292	 * all ordered extents in the range to complete. After this we can lock
3293	 * the file range and, due to the previous locking we did, we know there
3294	 * can't be more delalloc or ordered extents in the range.
3295	 */
3296	ret = btrfs_wait_ordered_range(inode, alloc_start,
3297				       alloc_end - alloc_start);
3298	if (ret)
3299		goto out;
3300
3301	if (mode & FALLOC_FL_ZERO_RANGE) {
3302		ret = btrfs_zero_range(inode, offset, len, mode);
3303		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3304		return ret;
3305	}
3306
3307	locked_end = alloc_end - 1;
3308	lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3309		    &cached_state);
3310
3311	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3312
3313	/* First, check if we exceed the qgroup limit */
3314	while (cur_offset < alloc_end) {
3315		em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
3316				      alloc_end - cur_offset);
3317		if (IS_ERR(em)) {
3318			ret = PTR_ERR(em);
3319			break;
3320		}
3321		last_byte = min(extent_map_end(em), alloc_end);
3322		actual_end = min_t(u64, extent_map_end(em), offset + len);
3323		last_byte = ALIGN(last_byte, blocksize);
3324		if (em->block_start == EXTENT_MAP_HOLE ||
3325		    (cur_offset >= inode->i_size &&
3326		     !(em->flags & EXTENT_FLAG_PREALLOC))) {
3327			const u64 range_len = last_byte - cur_offset;
3328
3329			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3330			if (ret < 0) {
3331				free_extent_map(em);
3332				break;
3333			}
3334			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3335					&data_reserved, cur_offset, range_len);
3336			if (ret < 0) {
3337				free_extent_map(em);
3338				break;
3339			}
3340			qgroup_reserved += range_len;
3341			data_space_needed += range_len;
3342		}
3343		free_extent_map(em);
3344		cur_offset = last_byte;
3345	}
3346
3347	if (!ret && data_space_needed > 0) {
3348		/*
3349		 * We are safe to reserve space here as we can't have delalloc
3350		 * in the range, see above.
3351		 */
3352		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3353						      data_space_needed);
3354		if (!ret)
3355			data_space_reserved = data_space_needed;
3356	}
3357
3358	/*
3359	 * If ret is still 0, it means we're OK to fallocate.
3360	 * Otherwise just clean up the list and exit.
3361	 */
3362	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3363		if (!ret) {
3364			ret = btrfs_prealloc_file_range(inode, mode,
3365					range->start,
3366					range->len, blocksize,
3367					offset + len, &alloc_hint);
3368			/*
3369			 * btrfs_prealloc_file_range() releases space even
3370			 * if it returns an error.
3371			 */
3372			data_space_reserved -= range->len;
3373			qgroup_reserved -= range->len;
3374		} else if (data_space_reserved > 0) {
3375			btrfs_free_reserved_data_space(BTRFS_I(inode),
3376					       data_reserved, range->start,
3377					       range->len);
3378			data_space_reserved -= range->len;
3379			qgroup_reserved -= range->len;
3380		} else if (qgroup_reserved > 0) {
3381			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3382					       range->start, range->len, NULL);
3383			qgroup_reserved -= range->len;
3384		}
3385		list_del(&range->list);
3386		kfree(range);
3387	}
3388	if (ret < 0)
3389		goto out_unlock;
3390
3391	/*
3392	 * We didn't need to allocate any more space, but we still extended the
3393	 * size of the file so we need to update i_size and the inode item.
3394	 */
3395	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3396out_unlock:
3397	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3398		      &cached_state);
3399out:
3400	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3401	extent_changeset_free(data_reserved);
3402	return ret;
3403}
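
/*
 * Sketch of the non ZERO_RANGE fallocate flow above (simplified, error paths
 * omitted): after taking the inode and mmap locks, flushing delalloc and
 * locking the extent range, we walk the extent maps and collect the subranges
 * that actually need allocation (holes, or anything beyond i_size that is not
 * already preallocated) into reserve_list, reserving qgroup space per
 * subrange.  Data space is then reserved once for the total, each collected
 * subrange is preallocated with btrfs_prealloc_file_range(), and finally
 * i_size is updated if the file grew.
 */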
3404
3405/*
3406 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3407 * that has unflushed and/or flushing delalloc. There might be other adjacent
3408 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3409 * looping while it gets adjacent subranges, merging them together.
3410 */
3411static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3412				   struct extent_state **cached_state,
3413				   bool *search_io_tree,
3414				   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3415{
3416	u64 len = end + 1 - start;
3417	u64 delalloc_len = 0;
3418	struct btrfs_ordered_extent *oe;
3419	u64 oe_start;
3420	u64 oe_end;
3421
3422	/*
3423	 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3424	 * means we have delalloc (dirty pages) for which writeback has not
3425	 * started yet.
3426	 */
3427	if (*search_io_tree) {
3428		spin_lock(&inode->lock);
3429		if (inode->delalloc_bytes > 0) {
3430			spin_unlock(&inode->lock);
3431			*delalloc_start_ret = start;
3432			delalloc_len = count_range_bits(&inode->io_tree,
3433							delalloc_start_ret, end,
3434							len, EXTENT_DELALLOC, 1,
3435							cached_state);
3436		} else {
3437			spin_unlock(&inode->lock);
3438		}
3439	}
3440
3441	if (delalloc_len > 0) {
3442		/*
3443		 * If delalloc was found then *delalloc_start_ret has a sector size
3444		 * aligned value (rounded down).
3445		 */
3446		*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3447
3448		if (*delalloc_start_ret == start) {
3449			/* Delalloc for the whole range, nothing more to do. */
3450			if (*delalloc_end_ret == end)
3451				return true;
3452			/* Else trim our search range for ordered extents. */
3453			start = *delalloc_end_ret + 1;
3454			len = end + 1 - start;
3455		}
3456	} else {
3457		/* No delalloc, future calls don't need to search again. */
3458		*search_io_tree = false;
3459	}
3460
3461	/*
3462	 * Now also check if there's any ordered extent in the range.
3463	 * We do this because:
3464	 *
3465	 * 1) When delalloc is flushed, the file range is locked, we clear the
3466	 *    EXTENT_DELALLOC bit from the io tree and create an extent map and
3467	 *    an ordered extent for the write. So we might just have been called
3468	 *    after delalloc is flushed and before the ordered extent completes
3469	 *    and inserts the new file extent item in the subvolume's btree;
3470	 *
3471	 * 2) We may have an ordered extent created by flushing delalloc for a
3472	 *    subrange that starts before the subrange we found marked with
3473	 *    EXTENT_DELALLOC in the io tree.
3474	 *
3475	 * We could also use the extent map tree to find such delalloc that is
3476	 * being flushed, but using the ordered extents tree is more efficient
3477	 * because it's usually much smaller as ordered extents are removed from
3478	 * the tree once they complete. With the extent maps, we may have them
3479	 * in the extent map tree for a very long time, and they were either
3480	 * created by previous writes or loaded by read operations.
3481	 */
3482	oe = btrfs_lookup_first_ordered_range(inode, start, len);
3483	if (!oe)
3484		return (delalloc_len > 0);
3485
3486	/* The ordered extent may span beyond our search range. */
3487	oe_start = max(oe->file_offset, start);
3488	oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3489
3490	btrfs_put_ordered_extent(oe);
3491
3492	/* Don't have unflushed delalloc, return the ordered extent range. */
3493	if (delalloc_len == 0) {
3494		*delalloc_start_ret = oe_start;
3495		*delalloc_end_ret = oe_end;
3496		return true;
3497	}
3498
3499	/*
3500	 * We have both unflushed delalloc (io_tree) and an ordered extent.
3501	 * If the ranges are adjacent, return a combined range; otherwise
3502	 * return the leftmost range.
3503	 */
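	/*
	 * For example, if the io tree gave us delalloc for [64K, 128K - 1] and
	 * the ordered extent covers [128K, 192K - 1], the two ranges are
	 * adjacent and we return the combined range [64K, 192K - 1].
	 */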
3504	if (oe_start < *delalloc_start_ret) {
3505		if (oe_end < *delalloc_start_ret)
3506			*delalloc_end_ret = oe_end;
3507		*delalloc_start_ret = oe_start;
3508	} else if (*delalloc_end_ret + 1 == oe_start) {
3509		*delalloc_end_ret = oe_end;
3510	}
3511
3512	return true;
3513}
3514
3515/*
3516 * Check if there's delalloc in a given range.
3517 *
3518 * @inode:               The inode.
3519 * @start:               The start offset of the range. It does not need to be
3520 *                       sector size aligned.
3521 * @end:                 The end offset (inclusive value) of the search range.
3522 *                       It does not need to be sector size aligned.
3523 * @cached_state:        Extent state record used for speeding up delalloc
3524 *                       searches in the inode's io_tree. Can be NULL.
3525 * @delalloc_start_ret:  Output argument, set to the start offset of the
3526 *                       subrange found with delalloc (may not be sector size
3527 *                       aligned).
3528 * @delalloc_end_ret:    Output argument, set to the end offset (inclusive value)
3529 *                       of the subrange found with delalloc.
3530 *
3531 * Returns true if a subrange with delalloc is found within the given range, and
3532 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3533 * end offsets of the subrange.
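 *
 * Adjacent subranges, for example unflushed delalloc immediately followed by
 * a range covered by an ordered extent, are merged and reported as a single
 * subrange.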
3534 */
3535bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3536				  struct extent_state **cached_state,
3537				  u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3538{
3539	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3540	u64 prev_delalloc_end = 0;
3541	bool search_io_tree = true;
3542	bool ret = false;
3543
3544	while (cur_offset <= end) {
3545		u64 delalloc_start;
3546		u64 delalloc_end;
3547		bool delalloc;
3548
3549		delalloc = find_delalloc_subrange(inode, cur_offset, end,
3550						  cached_state, &search_io_tree,
3551						  &delalloc_start,
3552						  &delalloc_end);
3553		if (!delalloc)
3554			break;
3555
3556		if (prev_delalloc_end == 0) {
3557			/* First subrange found. */
3558			*delalloc_start_ret = max(delalloc_start, start);
3559			*delalloc_end_ret = delalloc_end;
3560			ret = true;
3561		} else if (delalloc_start == prev_delalloc_end + 1) {
3562			/* Subrange adjacent to the previous one, merge them. */
3563			*delalloc_end_ret = delalloc_end;
3564		} else {
3565			/* Subrange not adjacent to the previous one, exit. */
3566			break;
3567		}
3568
3569		prev_delalloc_end = delalloc_end;
3570		cur_offset = delalloc_end + 1;
3571		cond_resched();
3572	}
3573
3574	return ret;
3575}
3576
3577/*
3578 * Check if there's a hole or delalloc range in a range representing a hole (or
3579 * prealloc extent) found in the inode's subvolume btree.
3580 *
3581 * @inode:      The inode.
3582 * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
3583 * @start:      Start offset of the hole region. It does not need to be sector
3584 *              size aligned.
3585 * @end:        End offset (inclusive value) of the hole region. It does not
3586 *              need to be sector size aligned.
3587 * @start_ret:  Return parameter, used to set the start of the subrange in the
3588 *              hole that matches the search criteria (seek mode), if such
3589 *              subrange is found (return value of the function is true).
3590 *              The value returned here may not be sector size aligned.
3591 *
3592 * Returns true if a subrange matching the given seek mode is found, and if one
3593 * is found, it updates @start_ret with the start of the subrange.
3594 */
3595static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3596					struct extent_state **cached_state,
3597					u64 start, u64 end, u64 *start_ret)
3598{
3599	u64 delalloc_start;
3600	u64 delalloc_end;
3601	bool delalloc;
3602
3603	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3604						&delalloc_start, &delalloc_end);
3605	if (delalloc && whence == SEEK_DATA) {
3606		*start_ret = delalloc_start;
3607		return true;
3608	}
3609
3610	if (delalloc && whence == SEEK_HOLE) {
3611		/*
3612		 * We found delalloc but it starts after our start offset. So we
3613		 * have a hole between our start offset and the delalloc start.
3614		 */
3615		if (start < delalloc_start) {
3616			*start_ret = start;
3617			return true;
3618		}
3619		/*
3620		 * Delalloc range starts at our start offset.
3621		 * If the delalloc range's length is smaller than our range,
3622		 * then it means we have a hole that starts where the delalloc
3623		 * subrange ends.
3624		 */
3625		if (delalloc_end < end) {
3626			*start_ret = delalloc_end + 1;
3627			return true;
3628		}
3629
3630		/* There's delalloc for the whole range. */
3631		return false;
3632	}
3633
3634	if (!delalloc && whence == SEEK_HOLE) {
3635		*start_ret = start;
3636		return true;
3637	}
3638
3639	/*
3640	 * No delalloc in the range and we are seeking for data. The caller has
3641	 * to iterate to the next extent item in the subvolume btree.
3642	 */
3643	return false;
3644}
3645
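/*
 * Find the offset where the next data or hole starts for an lseek() with
 * SEEK_DATA or SEEK_HOLE. This walks the inode's file extent items in the
 * subvolume btree and, for every hole or prealloc extent found, checks for
 * delalloc (unflushed or in flight) with find_desired_extent_in_hole().
 *
 * Returns the resulting file offset or a negative errno (e.g. -ENXIO if
 * @offset is beyond the end of the file).
 */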
3646static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3647{
3648	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3649	struct btrfs_file_private *private = file->private_data;
3650	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3651	struct extent_state *cached_state = NULL;
3652	struct extent_state **delalloc_cached_state;
3653	const loff_t i_size = i_size_read(&inode->vfs_inode);
3654	const u64 ino = btrfs_ino(inode);
3655	struct btrfs_root *root = inode->root;
3656	struct btrfs_path *path;
3657	struct btrfs_key key;
3658	u64 last_extent_end;
3659	u64 lockstart;
3660	u64 lockend;
3661	u64 start;
3662	int ret;
3663	bool found = false;
3664
3665	if (i_size == 0 || offset >= i_size)
3666		return -ENXIO;
3667
3668	/*
3669	 * Quick path. If the inode has no prealloc extents and its number of
3670	 * bytes used matches its i_size, then it can not have holes.
3671	 */
3672	if (whence == SEEK_HOLE &&
3673	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
3674	    inode_get_bytes(&inode->vfs_inode) == i_size)
3675		return i_size;
3676
3677	if (!private) {
3678		private = kzalloc(sizeof(*private), GFP_KERNEL);
3679		/*
3680		 * No worries if memory allocation failed.
3681		 * The private structure is used only for speeding up multiple
3682		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3683		 * so everything will still be correct.
3684		 */
3685		file->private_data = private;
3686	}
3687
3688	if (private)
3689		delalloc_cached_state = &private->llseek_cached_state;
3690	else
3691		delalloc_cached_state = NULL;
3692
3693	/*
3694	 * offset can be negative; in this case we start finding DATA/HOLE from
3695	 * the very start of the file.
3696	 */
3697	start = max_t(loff_t, 0, offset);
3698
3699	lockstart = round_down(start, fs_info->sectorsize);
3700	lockend = round_up(i_size, fs_info->sectorsize);
3701	if (lockend <= lockstart)
3702		lockend = lockstart + fs_info->sectorsize;
3703	lockend--;
3704
3705	path = btrfs_alloc_path();
3706	if (!path)
3707		return -ENOMEM;
3708	path->reada = READA_FORWARD;
3709
3710	key.objectid = ino;
3711	key.type = BTRFS_EXTENT_DATA_KEY;
3712	key.offset = start;
3713
3714	last_extent_end = lockstart;
3715
3716	lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3717
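	/*
	 * If we don't get an exact key match, the item in the previous slot
	 * may be a file extent item of our inode that covers the start offset,
	 * so step back to it in that case.
	 */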
3718	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3719	if (ret < 0) {
3720		goto out;
3721	} else if (ret > 0 && path->slots[0] > 0) {
3722		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3723		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3724			path->slots[0]--;
3725	}
3726
3727	while (start < i_size) {
3728		struct extent_buffer *leaf = path->nodes[0];
3729		struct btrfs_file_extent_item *extent;
3730		u64 extent_end;
3731		u8 type;
3732
3733		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3734			ret = btrfs_next_leaf(root, path);
3735			if (ret < 0)
3736				goto out;
3737			else if (ret > 0)
3738				break;
3739
3740			leaf = path->nodes[0];
3741		}
3742
3743		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3744		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3745			break;
3746
3747		extent_end = btrfs_file_extent_end(path);
3748
3749		/*
3750		 * In the first iteration we may have a slot that points to an
3751		 * extent that ends before our start offset, so skip it.
3752		 */
3753		if (extent_end <= start) {
3754			path->slots[0]++;
3755			continue;
3756		}
3757
3758		/* We have an implicit hole, NO_HOLES feature is likely set. */
3759		if (last_extent_end < key.offset) {
3760			u64 search_start = last_extent_end;
3761			u64 found_start;
3762
3763			/*
3764			 * First iteration, @start matches @offset and it's
3765			 * within the hole.
3766			 */
3767			if (start == offset)
3768				search_start = offset;
3769
3770			found = find_desired_extent_in_hole(inode, whence,
3771							    delalloc_cached_state,
3772							    search_start,
3773							    key.offset - 1,
3774							    &found_start);
3775			if (found) {
3776				start = found_start;
3777				break;
3778			}
3779			/*
3780			 * Didn't find data or a hole (due to delalloc) in the
3781			 * implicit hole range, so we need to analyze the extent.
3782			 */
3783		}
3784
3785		extent = btrfs_item_ptr(leaf, path->slots[0],
3786					struct btrfs_file_extent_item);
3787		type = btrfs_file_extent_type(leaf, extent);
3788
3789		/*
3790		 * Can't access the extent's disk_bytenr field if this is an
3791		 * inline extent, since at that offset, it's where the extent
3792		 * data starts.
3793		 */
3794		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3795		    (type == BTRFS_FILE_EXTENT_REG &&
3796		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3797			/*
3798			 * Explicit hole or prealloc extent, search for delalloc.
3799			 * A prealloc extent is treated like a hole.
3800			 */
3801			u64 search_start = key.offset;
3802			u64 found_start;
3803
3804			/*
3805			 * First iteration, @start matches @offset and it's
3806			 * within the hole.
3807			 */
3808			if (start == offset)
3809				search_start = offset;
3810
3811			found = find_desired_extent_in_hole(inode, whence,
3812							    delalloc_cached_state,
3813							    search_start,
3814							    extent_end - 1,
3815							    &found_start);
3816			if (found) {
3817				start = found_start;
3818				break;
3819			}
3820			/*
3821			 * Didn't find data or a hole (due to delalloc) in this
3822			 * hole or prealloc extent's range, so we need to move on
3823			 * to the next extent item.
3824			 */
3825		} else {
3826			/*
3827			 * Found a regular or inline extent.
3828			 * If we are seeking for data, adjust the start offset
3829			 * and stop, we're done.
3830			 */
3831			if (whence == SEEK_DATA) {
3832				start = max_t(u64, key.offset, offset);
3833				found = true;
3834				break;
3835			}
3836			/*
3837			 * Else, we are seeking for a hole, check the next file
3838			 * extent item.
3839			 */
3840		}
3841
3842		start = extent_end;
3843		last_extent_end = extent_end;
3844		path->slots[0]++;
3845		if (fatal_signal_pending(current)) {
3846			ret = -EINTR;
3847			goto out;
3848		}
3849		cond_resched();
3850	}
3851
3852	/* We have an implicit hole from the last extent found up to i_size. */
3853	if (!found && start < i_size) {
3854		found = find_desired_extent_in_hole(inode, whence,
3855						    delalloc_cached_state, start,
3856						    i_size - 1, &start);
3857		if (!found)
3858			start = i_size;
3859	}
3860
3861out:
3862	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3863	btrfs_free_path(path);
3864
3865	if (ret < 0)
3866		return ret;
3867
3868	if (whence == SEEK_DATA && start >= i_size)
3869		return -ENXIO;
3870
3871	return min_t(loff_t, start, i_size);
3872}
3873
3874static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3875{
3876	struct inode *inode = file->f_mapping->host;
3877
3878	switch (whence) {
3879	default:
3880		return generic_file_llseek(file, offset, whence);
3881	case SEEK_DATA:
3882	case SEEK_HOLE:
3883		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3884		offset = find_desired_extent(file, offset, whence);
3885		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3886		break;
3887	}
3888
3889	if (offset < 0)
3890		return offset;
3891
3892	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3893}
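
/*
 * Illustrative user space usage (not btrfs specific): finding the first data
 * region of a sparse file with lseek(), which ends up in the SEEK_DATA and
 * SEEK_HOLE cases above. Assuming both calls succeed:
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);
 *	off_t hole = lseek(fd, data, SEEK_HOLE);
 *
 * [data, hole) is then the first data region; lseek() failing with ENXIO
 * means there is no data at or after the given offset.
 */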
3894
3895static int btrfs_file_open(struct inode *inode, struct file *filp)
3896{
3897	int ret;
3898
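	/* This file supports RWF_NOWAIT and O_DIRECT IO. */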
3899	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3900
3901	ret = fsverity_file_open(inode, filp);
3902	if (ret)
3903		return ret;
3904	return generic_file_open(inode, filp);
3905}
3906
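/*
 * Besides the checks done by check_direct_IO(), reject iovecs where two
 * segments point to the same base address.
 */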
3907static int check_direct_read(struct btrfs_fs_info *fs_info,
3908			     const struct iov_iter *iter, loff_t offset)
3909{
3910	int ret;
3911	int i, seg;
3912
3913	ret = check_direct_IO(fs_info, iter, offset);
3914	if (ret < 0)
3915		return ret;
3916
3917	if (!iter_is_iovec(iter))
3918		return 0;
3919
3920	for (seg = 0; seg < iter->nr_segs; seg++) {
3921		for (i = seg + 1; i < iter->nr_segs; i++) {
3922			const struct iovec *iov1 = iter_iov(iter) + seg;
3923			const struct iovec *iov2 = iter_iov(iter) + i;
3924
3925			if (iov1->iov_base == iov2->iov_base)
3926				return -EINVAL;
3927		}
3928	}
3929	return 0;
3930}
3931
3932static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3933{
3934	struct inode *inode = file_inode(iocb->ki_filp);
3935	size_t prev_left = 0;
3936	ssize_t read = 0;
3937	ssize_t ret;
3938
3939	if (fsverity_active(inode))
3940		return 0;
3941
3942	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
3943		return 0;
3944
3945	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3946again:
3947	/*
3948	 * This is similar to what we do for direct IO writes, see the comment
3949	 * at btrfs_direct_write(), but we also disable page faults in addition
3950	 * to disabling them only at the iov_iter level. This is because when
3951	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3952	 * which can still trigger page faults despite having set ->nofault
3953	 * to true for our 'to' iov_iter.
3954	 *
3955	 * The difference to direct IO writes is that we deadlock when trying
3956	 * to lock the extent range in the inode's tree during the page reads
3957	 * triggered by the fault in (while for writes it is due to waiting for
3958	 * our own ordered extent). This is because for direct IO reads,
3959	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
3960	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
3961	 */
3962	pagefault_disable();
3963	to->nofault = true;
3964	ret = btrfs_dio_read(iocb, to, read);
3965	to->nofault = false;
3966	pagefault_enable();
3967
3968	/* No increment (+=) because iomap returns a cumulative value. */
3969	if (ret > 0)
3970		read = ret;
3971
3972	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
3973		const size_t left = iov_iter_count(to);
3974
3975		if (left == prev_left) {
3976			/*
3977			 * We didn't make any progress since the last attempt,
3978			 * fall back to a buffered read for the remainder of the
3979			 * range. This is just to avoid any possibility of looping
3980			 * for too long.
3981			 */
3982			ret = read;
3983		} else {
3984			/*
3985			 * We made some progress since the last retry or this is
3986			 * the first time we are retrying. Fault in as many pages
3987			 * as possible and retry.
3988			 */
3989			fault_in_iov_iter_writeable(to, left);
3990			prev_left = left;
3991			goto again;
3992		}
3993	}
3994	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3995	return ret < 0 ? ret : read;
3996}
3997
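/*
 * Read entry point: do a direct IO read when requested, and fall back to a
 * buffered read with filemap_read() for whatever is left, passing it the
 * number of bytes already read so that the returned total is cumulative.
 */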
3998static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3999{
4000	ssize_t ret = 0;
4001
4002	if (iocb->ki_flags & IOCB_DIRECT) {
4003		ret = btrfs_direct_read(iocb, to);
4004		if (ret < 0 || !iov_iter_count(to) ||
4005		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
4006			return ret;
4007	}
4008
4009	return filemap_read(iocb, to, ret);
4010}
4011
4012const struct file_operations btrfs_file_operations = {
4013	.llseek		= btrfs_file_llseek,
4014	.read_iter      = btrfs_file_read_iter,
4015	.splice_read	= filemap_splice_read,
4016	.write_iter	= btrfs_file_write_iter,
4017	.splice_write	= iter_file_splice_write,
4018	.mmap		= btrfs_file_mmap,
4019	.open		= btrfs_file_open,
4020	.release	= btrfs_release_file,
4021	.get_unmapped_area = thp_get_unmapped_area,
4022	.fsync		= btrfs_sync_file,
4023	.fallocate	= btrfs_fallocate,
4024	.unlocked_ioctl	= btrfs_ioctl,
4025#ifdef CONFIG_COMPAT
4026	.compat_ioctl	= btrfs_compat_ioctl,
4027#endif
4028	.remap_file_range = btrfs_remap_file_range,
4029	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
4030};
4031
4032int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
4033{
4034	int ret;
4035
4036	/*
4037	 * So with compression we will find and lock a dirty page and clear the
4038	 * first one as dirty, setup an async extent, and immediately return
4039	 * with the entire range locked but with nobody actually marked with
4040	 * writeback.  So we can't just filemap_write_and_wait_range() and
4041	 * expect it to work since it will just kick off a thread to do the
4042	 * actual work.  So we need to call filemap_fdatawrite_range _again_
4043	 * since it will wait on the page lock, which won't be unlocked until
4044	 * after the pages have been marked as writeback and so we're good to go
4045	 * from there.  We have to do this otherwise we'll miss the ordered
4046	 * extents and that results in badness.  Please Josef, do not think you
4047	 * know better and pull this out at some point in the future, it is
4048	 * right and you are wrong.
4049	 */
4050	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
4051	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
4052			     &BTRFS_I(inode)->runtime_flags))
4053		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
4054
4055	return ret;
4056}
4057