/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2007-2009 Google Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following disclaimer
 *   in the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of Google Inc. nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Copyright (C) 2005 Csaba Henk.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/fs/fuse/fuse_io.c 330897 2018-03-14 03:19:51Z eadler $");

#include <sys/types.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#include "fuse.h"
#include "fuse_file.h"
#include "fuse_node.h"
#include "fuse_internal.h"
#include "fuse_ipc.h"
#include "fuse_io.h"

#define FUSE_DEBUG_MODULE IO
#include "fuse_debug.h"

static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh);
static int
fuse_read_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh);
static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);

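/*
 * Dispatch a read or write to the appropriate backend: the buffer cache
 * backends (fuse_{read,write}_biobackend) for cached I/O, or the direct
 * backends (fuse_{read,write}_directbackend), which talk to the daemon
 * without caching.  A sketch of a typical call site (hypothetical, for
 * illustration; the real callers are the read/write vnops):
 *
 *	err = fuse_io_dispatch(vp, uio, ioflag, cred);
 */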
int
fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
    struct ucred *cred)
{
	struct fuse_filehandle *fufh;
	int err, directio;

	MPASS(vp->v_type == VREG || vp->v_type == VDIR);

	err = fuse_filehandle_getrw(vp,
	    (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
	if (err) {
		printf("FUSE: io dispatch: filehandles are closed\n");
		return (err);
	}
	/*
	 * Ideally, when the daemon asks for direct I/O at open time, the
	 * standard file flag would be set accordingly; that would merely
	 * change the default mode, which could later be changed via
	 * fcntl(2).  But this doesn't work: the O_DIRECT flag gets cleared
	 * at some point (it is unclear where).  So to make any use of the
	 * FUSE direct_io option, we hardwire it into the file's private
	 * data (similarly to Linux, btw.).
	 */
	directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp));

	switch (uio->uio_rw) {
	case UIO_READ:
		if (directio) {
			FS_DEBUG("direct read of vnode %ju via file handle %ju\n",
			    (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id);
			err = fuse_read_directbackend(vp, uio, cred, fufh);
		} else {
			FS_DEBUG("buffered read of vnode %ju\n",
			      (uintmax_t)VTOILLU(vp));
			err = fuse_read_biobackend(vp, uio, cred, fufh);
		}
		break;
	case UIO_WRITE:
		if (directio) {
			FS_DEBUG("direct write of vnode %ju via file handle %ju\n",
			    (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id);
			err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag);
		} else {
			FS_DEBUG("buffered write of vnode %ju\n",
			      (uintmax_t)VTOILLU(vp));
			err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag);
		}
		break;
	default:
		panic("uninterpreted mode passed to fuse_io_dispatch");
	}

	return (err);
}

static int
fuse_read_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh)
{
	struct buf *bp;
	daddr_t lbn;
	int bcount;
	int err = 0, n = 0, on = 0;
	off_t filesize;

	const int biosize = fuse_iosize(vp);

	FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n",
	    uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize);

	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);

	bcount = MIN(MAXBSIZE, biosize);
	filesize = VTOFUD(vp)->filesize;

	do {
		if (fuse_isdeadfs(vp)) {
			err = ENXIO;
			break;
		}
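		/*
		 * Split the request at buffer-cache block granularity: lbn
		 * is the logical block number, on is the offset within that
		 * block.  The mask below relies on biosize being a power of
		 * two.
		 */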
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);

		FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on);

		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */
		if ((off_t)lbn * biosize >= filesize) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > filesize) {
			bcount = filesize - (off_t)lbn * biosize;
		}
		bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);

		if (!bp)
			return (EINTR);

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			err = fuse_io_strategy(vp, bp);
			if (err) {
				brelse(bp);
				return (err);
			}
		}
		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */

		n = 0;
		if (on < bcount)
			n = MIN((unsigned)(bcount - on), uio->uio_resid);
		if (n > 0) {
			FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p,"
				" saying %d was asked for\n",
				n, bp->b_data + on, n + (int)bp->b_resid);
			err = uiomove(bp->b_data + on, n, uio);
		}
		brelse(bp);
		FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n",
		    err, uio->uio_resid, n);
	} while (err == 0 && uio->uio_resid > 0 && n > 0);

	return (err);
}

static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh)
{
	struct fuse_dispatcher fdi;
	struct fuse_read_in *fri;
	int err = 0;

	if (uio->uio_resid == 0)
		return (0);

	fdisp_init(&fdi, 0);

	/*
	 * XXX In the "normal" case we use an intermediate kernel buffer for
	 * transmitting data from the daemon's context to ours.  Eventually
	 * we should get rid of this.  If the target uio lives in sysspace
	 * (i.e., we are called from the pageops) and the input data doesn't
	 * need kernel-side processing (i.e., we are not called from readdir),
	 * we could already invoke an optimized, "peer-to-peer" I/O routine.
	 */
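	/*
	 * Loop over the uio, issuing one FUSE_READ per iteration.  Each
	 * request is capped at the mount's negotiated max_read; a reply
	 * shorter than requested terminates the loop (end of file from the
	 * daemon's point of view).
	 */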
	while (uio->uio_resid > 0) {
		fdi.iosize = sizeof(*fri);
		fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred);
		fri = fdi.indata;
		fri->fh = fufh->fh_id;
		fri->offset = uio->uio_offset;
		fri->size = MIN(uio->uio_resid,
		    fuse_get_mpdata(vp->v_mount)->max_read);

		FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n",
			(uintmax_t)fri->fh, (uintmax_t)fri->offset,
			(uintmax_t)fri->size);

		if ((err = fdisp_wait_answ(&fdi)))
			goto out;

		FS_DEBUG2G("complete: got iosize=%zd, requested fri.size=%d; "
			"resid=%zd offset=%ju\n",
			fdi.iosize, fri->size, uio->uio_resid,
			(uintmax_t)uio->uio_offset);

		if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio)))
			break;
		if (fdi.iosize < fri->size)
			break;
	}

out:
	fdisp_destroy(&fdi);
	return (err);
}

static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
{
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct fuse_write_in *fwi;
	struct fuse_dispatcher fdi;
	size_t chunksize;
	int diff;
	int err = 0;

	if (uio->uio_resid == 0)
		return (0);
	if (ioflag & IO_APPEND)
		uio_setoffset(uio, fvdat->filesize);

	fdisp_init(&fdi, 0);

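	/*
	 * Loop over the uio, issuing one FUSE_WRITE per iteration.  Each
	 * request carries at most max_write bytes of payload, copied in
	 * right behind the fuse_write_in header.
	 */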
	while (uio->uio_resid > 0) {
		chunksize = MIN(uio->uio_resid,
		    fuse_get_mpdata(vp->v_mount)->max_write);

		fdi.iosize = sizeof(*fwi) + chunksize;
		fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred);

		fwi = fdi.indata;
		fwi->fh = fufh->fh_id;
		fwi->offset = uio->uio_offset;
		fwi->size = chunksize;

		if ((err = uiomove((char *)fdi.indata + sizeof(*fwi),
		    chunksize, uio)))
			break;

		if ((err = fdisp_wait_answ(&fdi)))
			break;

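		/*
		 * The daemon may accept fewer bytes than we sent (a short
		 * write).  Rewind the uio by the shortfall so that the
		 * unwritten tail is retried on the next iteration; a daemon
		 * claiming to have written more than we asked for is bogus.
		 */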
		diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size;
		if (diff < 0) {
			err = EINVAL;
			break;
		}
		uio->uio_resid += diff;
		uio->uio_offset -= diff;
		if (uio->uio_offset > fvdat->filesize)
			fuse_vnode_setsize(vp, cred, uio->uio_offset);
	}

	fdisp_destroy(&fdi);

	return (err);
}

static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
{
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct buf *bp;
	daddr_t lbn;
	int bcount;
	int n, on, err = 0;

	const int biosize = fuse_iosize(vp);

	KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode"));
	FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n",
	    uio->uio_resid, uio->uio_offset, fvdat->filesize);
	if (vp->v_type != VREG)
		return (EIO);
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	if (ioflag & IO_APPEND)
		uio_setoffset(uio, fvdat->filesize);

	/*
	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
	 * would exceed the local maximum per-file write commit size when
	 * combined with those, we must decide whether to flush,
	 * go synchronous, or return an error.  We don't bother checking
	 * IO_UNIT -- we just make all writes atomic anyway, as there's
	 * no point optimizing for something that really won't ever happen.
	 */
	do {
		if (fuse_isdeadfs(vp)) {
			err = ENXIO;
			break;
		}
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = MIN((unsigned)(biosize - on), uio->uio_resid);

		FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n",
			(uintmax_t)lbn, on, n,
			(uintmax_t)uio->uio_offset, uio->uio_resid);

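		/*
		 * We may jump back here after flushing a discontiguous
		 * dirty region below, to redo the write against a clean
		 * buffer.
		 */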
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		if (uio->uio_offset == fvdat->filesize && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * fuse node after we have locked the buffer to
			 * prevent readers from reading garbage.
			 */
			bcount = on;
			FS_DEBUG("getting block from OS, bcount %d\n", bcount);
			bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);

			if (bp != NULL) {
				long save;

				err = fuse_vnode_setsize(vp, cred,
							 uio->uio_offset + n);
				if (err) {
					brelse(bp);
					break;
				}
				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < fvdat->filesize) {
				if ((off_t)(lbn + 1) * biosize < fvdat->filesize)
					bcount = biosize;
				else
					bcount = fvdat->filesize -
					    (off_t)lbn * biosize;
			}
			FS_DEBUG("getting block from OS, bcount %d\n", bcount);
			bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
			if (bp && uio->uio_offset + n > fvdat->filesize) {
				err = fuse_vnode_setsize(vp, cred,
							 uio->uio_offset + n);
				if (err) {
					brelse(bp);
					break;
				}
			}
		}

		if (!bp) {
			err = EINTR;
			break;
		}
		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			fuse_io_strategy(vp, bp);
			if ((err = bp->b_error)) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			FS_DEBUG("FUSE append race @%lx:%d\n",
			    (long)bp->b_blkno * biosize,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}
		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer (and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontiguous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			/*
			 * Yes, we mean it.  Write out everything to "storage"
			 * immediately, without hesitation.  (Apart from other
			 * reasons: the only way to know if a write is valid
			 * is if it's actually written out.)
			 */
			bwrite(bp);
			if (bp->b_error == EINTR) {
				err = EINTR;
				break;
			}
			goto again;
		}
		err = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (err) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = err;
			brelse(bp);
			break;
		}
		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = MIN(on, bp->b_dirtyoff);
				bp->b_dirtyend = MAX((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_valid(bp, on, n);
		}
		err = bwrite(bp);
		if (err)
			break;
	} while (uio->uio_resid > 0 && n > 0);

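	/*
	 * If the loop above changed the file size (FN_SIZECHANGE set) and
	 * fuse_sync_resize is enabled, push the new size to the daemon
	 * before returning.
	 */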
	if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0)
		fuse_vnode_savesize(vp, cred);

	return (err);
}

int
fuse_io_strategy(struct vnode *vp, struct buf *bp)
{
	struct fuse_filehandle *fufh;
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct ucred *cred;
	struct uio *uiop;
	struct uio uio;
	struct iovec io;
	int error = 0;

	const int biosize = fuse_iosize(vp);

	MPASS(vp->v_type == VREG || vp->v_type == VDIR);
	MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE);
	FS_DEBUG("inode=%ju offset=%jd resid=%ld\n",
	    (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize),
	    bp->b_bcount);

	error = fuse_filehandle_getrw(vp,
	    (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
	if (error) {
		printf("FUSE: strategy: filehandles are closed\n");
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
		return (error);
	}
	cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;

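	/*
	 * Wrap the buffer in a single-segment, kernel-space uio so that the
	 * direct backends above can service this buffer-cache I/O.
	 */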
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = curthread;

	/*
	 * Clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE),
	    ("fuse_io_strategy: bp %p already marked done", bp));
	if (bp->b_iocmd == BIO_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;

		uiop->uio_offset = ((off_t)bp->b_blkno) * biosize;
		error = fuse_read_directbackend(vp, uiop, cred, fufh);

		if ((!error && uiop->uio_resid) ||
		    (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO &&
		    uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 &&
		    uiop->uio_offset >= fvdat->cached_attrs.va_size)) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left = uiop->uio_resid;

			if (error != 0) {
				printf("FUSE: Fix broken io: offset %ju, "
				    "resid %zd, file size %ju/%ju\n",
				    (uintmax_t)uiop->uio_offset,
				    uiop->uio_resid, fvdat->filesize,
				    fvdat->cached_attrs.va_size);
				error = 0;
			}
			if (left > 0)
				bzero((char *)bp->b_data + nread, left);
			uiop->uio_resid = 0;
		}
		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If we only need to commit, try to commit.
		 */
		if (bp->b_flags & B_NEEDCOMMIT) {
			FS_DEBUG("write: B_NEEDCOMMIT flags set\n");
		}
		/*
		 * Setup for actual write.
		 */
		if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend >
		    fvdat->filesize)
			bp->b_dirtyend = fvdat->filesize -
				(off_t)bp->b_blkno * biosize;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
			    - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * biosize
			    + bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			error = fuse_write_directbackend(vp, uiop, cred, fufh, 0);

			if (error == EINTR || error == ETIMEDOUT
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				bp->b_flags &= ~(B_INVAL | B_NOCACHE);
				if ((bp->b_flags & B_PAGING) == 0) {
					bdirty(bp);
					bp->b_flags &= ~B_DONE;
				}
				if ((error == EINTR || error == ETIMEDOUT) &&
				    (bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
			} else {
				if (error) {
					bp->b_ioflags |= BIO_ERROR;
					bp->b_flags |= B_INVAL;
					bp->b_error = error;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			bufdone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	bufdone(bp);
	return (error);
}

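/*
 * Flush this vnode's dirty buffers by handing a synthesized VOP_FSYNC
 * argument block to vop_stdfsync(), which writes them out (waiting for
 * completion if waitfor says so).
 */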
int
fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td)
{
	struct vop_fsync_args a = {
		.a_vp = vp,
		.a_waitfor = waitfor,
		.a_td = td,
	};

	return (vop_stdfsync(&a));
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
fuse_io_invalbuf(struct vnode *vp, struct thread *td)
{
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	int error = 0;

	if (vp->v_iflag & VI_DOOMED)
		return (0);

	ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf");

	while (fvdat->flag & FN_FLUSHINPROG) {
		struct proc *p = td->td_proc;

		if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF)
			return (EIO);
		fvdat->flag |= FN_FLUSHWANT;
		tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz);
		error = 0;
		if (p != NULL) {
			PROC_LOCK(p);
			if (SIGNOTEMPTY(p->p_siglist) ||
			    SIGNOTEMPTY(td->td_siglist))
				error = EINTR;
			PROC_UNLOCK(p);
		}
		if (error == EINTR)
			return (EINTR);
	}
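	/*
	 * No flush is in progress, so it is ours now.  Mark it so that
	 * concurrent callers block in the loop above until we are done.
	 */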
	fvdat->flag |= FN_FLUSHINPROG;

	if (vp->v_bufobj.bo_object != NULL) {
		VM_OBJECT_WLOCK(vp->v_bufobj.bo_object);
		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
		VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object);
	}
	error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
	while (error) {
		if (error == ERESTART || error == EINTR) {
			fvdat->flag &= ~FN_FLUSHINPROG;
			if (fvdat->flag & FN_FLUSHWANT) {
				fvdat->flag &= ~FN_FLUSHWANT;
				wakeup(&fvdat->flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
	}
	fvdat->flag &= ~FN_FLUSHINPROG;
	if (fvdat->flag & FN_FLUSHWANT) {
		fvdat->flag &= ~FN_FLUSHWANT;
		wakeup(&fvdat->flag);
	}
	return (error);
}