ffs_balloc.c revision 331017
1/*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 
36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 4. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 
 *
 * @(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_balloc.c 331017 2018-03-15 19:08:33Z kevans $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 *
 * vp		vnode of the file needing backing store
 * startoffset	byte offset of the write that must be backed
 * size		number of bytes at startoffset to allocate
 * cred		credentials charged for the allocation (quota)
 * flags	BA_* / IO_* control flags (BA_CLRBUF, BA_METAONLY,
 *		BA_UNMAPPED, BA_SEQMASK, IO_SYNC, IO_EXT, ...)
 * bpp		on success *bpp holds the buffer for the allocated block
 *
 * Returns 0 on success with *bpp set, or an errno (EOPNOTSUPP for
 * IO_EXT on UFS1, EFBIG for negative lbn, or an allocation error).
 * On a partial failure, every block allocated by this call is
 * unwound and freed before returning (see the "fail:" path).
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	/* allociblk[]/lbns[] record blocks allocated here, for unwinding. */
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* size now spans from the start of the block to the end of the write. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	/* UFS1 has no extended-attribute data blocks. */
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * NOTE(review): UFS_LOCK is taken here and not
			 * released on this path — it appears to be consumed
			 * (dropped) by ffs_realloccg()/ffs_alloc(); confirm
			 * against their definitions.
			 */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and entirely within the file. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment already big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: allocate a fragment or full block. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * On the first failure under soft updates, ask
			 * softdep to flush and retry the whole walk once.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		/*
		 * NOTE(review): final getblk argument is 0 here while the
		 * UFS2 counterpart passes GB_UNMAPPED — confirm whether
		 * gbflags/GB_UNMAPPED was intended here as well.
		 */
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		/* Remember the first parent entry to clear on unwind. */
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write, try to do some read-ahead in
	 * the sequential case to reduce the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): the brelse() below is unreachable —
			 * panic() does not return.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		/* Any surviving buffer for a freed block is a bug. */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 *
 * vp		vnode of the file needing backing store
 * startoffset	byte offset of the write that must be backed
 * size		number of bytes at startoffset to allocate
 * cred		credentials charged for the allocation (quota)
 * flags	BA_* / IO_* control flags; IO_EXT selects the
 *		extended-attribute (external data) block area
 * bpp		on success *bpp holds the buffer for the allocated block
 *
 * Returns 0 on success with *bpp set, or an errno (EFBIG for a
 * negative or out-of-range lbn, or an allocation error).  On a
 * partial failure every block allocated by this call is unwound
 * and freed before returning (see the "fail:" path).
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	/* allociblk[]/lbns[] record blocks allocated here, for unwinding. */
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* size now spans from the start of the block to the end of the write. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				/*
				 * NOTE(review): UFS_LOCK is taken here and
				 * not released on this path — it appears to
				 * be consumed by ffs_realloccg()/ffs_alloc();
				 * confirm against their definitions.
				 */
				UFS_LOCK(ump);
				/* Ext blocks use negative lbns: -1 - lbn. */
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, flags, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment already big enough. */
				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: allocate a fragment or full block. */
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
			    flags, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and entirely within the file. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread_gb(vp, lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
			    &dp->di_db[0]), nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
		    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
			    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * On the first failure under soft updates, ask
			 * softdep to flush and retry the whole walk once.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		/* Remember the first parent entry to clear on unwind. */
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
			    NOCRED, gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch2 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): the brelse() below is unreachable —
			 * panic() does not return.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		/* Any surviving buffer for a freed block is a bug. */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie2 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}