/* rrwlock.c — revision 269419 */
120253Sjoerg/* 220302Sjoerg * CDDL HEADER START 320302Sjoerg * 420253Sjoerg * The contents of this file are subject to the terms of the 520253Sjoerg * Common Development and Distribution License (the "License"). 620253Sjoerg * You may not use this file except in compliance with the License. 720253Sjoerg * 820253Sjoerg * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 920302Sjoerg * or http://www.opensolaris.org/os/licensing. 1020253Sjoerg * See the License for the specific language governing permissions 1120253Sjoerg * and limitations under the License. 1220253Sjoerg * 1320253Sjoerg * When distributing Covered Code, include this CDDL HEADER in each 1420302Sjoerg * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 1520253Sjoerg * If applicable, add the following below this CDDL HEADER, with the 1620253Sjoerg * fields enclosed by brackets "[]" replaced with your own identifying 1720302Sjoerg * information: Portions Copyright [yyyy] [name of copyright owner] 1820253Sjoerg * 1920253Sjoerg * CDDL HEADER END 2020253Sjoerg */ 2120253Sjoerg/* 2220253Sjoerg * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 2320253Sjoerg * Use is subject to license terms. 2420253Sjoerg */ 2520253Sjoerg/* 2620253Sjoerg * Copyright (c) 2012 by Delphix. All rights reserved. 2730259Scharnier */ 2830259Scharnier 2950479Speter#include <sys/refcount.h> 3030259Scharnier#include <sys/rrwlock.h> 3130259Scharnier 3220253Sjoerg/* 3330259Scharnier * This file contains the implementation of a re-entrant read 3420253Sjoerg * reader/writer lock (aka "rrwlock"). 3530259Scharnier * 3620253Sjoerg * This is a normal reader/writer lock with the additional feature 3720253Sjoerg * of allowing threads who have already obtained a read lock to 3820253Sjoerg * re-enter another read lock (re-entrant read) - even if there are 3920253Sjoerg * waiting writers. 4020253Sjoerg * 4120253Sjoerg * Callers who have not obtained a read lock give waiting writers priority. 
4220253Sjoerg * 4320253Sjoerg * The rrwlock_t lock does not allow re-entrant writers, nor does it 4420253Sjoerg * allow a re-entrant mix of reads and writes (that is, it does not 4520253Sjoerg * allow a caller who has already obtained a read lock to be able to 4620253Sjoerg * then grab a write lock without first dropping all read locks, and 4752502Sdavidn * vice versa). 4820253Sjoerg * 4920253Sjoerg * The rrwlock_t uses tsd (thread specific data) to keep a list of 5020253Sjoerg * nodes (rrw_node_t), where each node keeps track of which specific 5120253Sjoerg * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering 5220747Sdavidn * should be rare, a thread that grabs multiple reads on the same rrwlock_t 5352502Sdavidn * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the 5420253Sjoerg * tsd list can represent a different rrwlock_t. This allows a thread 5520253Sjoerg * to enter multiple and unique rrwlock_ts for read locks at the same time. 5620253Sjoerg * 5720253Sjoerg * Since using tsd exposes some overhead, the rrwlock_t only needs to 5820253Sjoerg * keep tsd data when writers are waiting. If no writers are waiting, then 5920253Sjoerg * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd 6020253Sjoerg * is needed. Once a writer attempts to grab the lock, readers then 6120253Sjoerg * keep tsd data and bump the linked readers count (rr_linked_rcount). 6220253Sjoerg * 6352512Sdavidn * If there are waiting writers and there are anonymous readers, then a 6452512Sdavidn * reader doesn't know if it is a re-entrant lock. But since it may be one, 6552512Sdavidn * we allow the read to proceed (otherwise it could deadlock). Since once 6620267Sjoerg * waiting writers are active, readers no longer bump the anonymous count, 6720267Sjoerg * the anonymous readers will eventually flush themselves out. 
At this point, 6820267Sjoerg * readers will be able to tell if they are a re-entrant lock (have a 6920267Sjoerg * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then 7052512Sdavidn * we must let the proceed. If they are not, then the reader blocks for the 7120267Sjoerg * waiting writers. Hence, we do not starve writers. 7220267Sjoerg */ 7320267Sjoerg 7420267Sjoerg/* global key for TSD */ 7520267Sjoerguint_t rrw_tsd_key; 7620267Sjoerg 7720267Sjoergtypedef struct rrw_node { 7820253Sjoerg struct rrw_node *rn_next; 7920267Sjoerg rrwlock_t *rn_rrl; 8020253Sjoerg void *rn_tag; 8144229Sdavidn} rrw_node_t; 8244229Sdavidn 8320253Sjoergstatic rrw_node_t * 8444229Sdavidnrrn_find(rrwlock_t *rrl) 8520267Sjoerg{ 8620253Sjoerg rrw_node_t *rn; 8720253Sjoerg 8820253Sjoerg if (refcount_count(&rrl->rr_linked_rcount) == 0) 8930259Scharnier return (NULL); 9020253Sjoerg 9161957Sache for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { 9220253Sjoerg if (rn->rn_rrl == rrl) 9320253Sjoerg return (rn); 9420253Sjoerg } 9520253Sjoerg return (NULL); 9644229Sdavidn} 9720253Sjoerg 9820253Sjoerg/* 9920253Sjoerg * Add a node to the head of the singly linked list. 10044229Sdavidn */ 10120253Sjoergstatic void 10220253Sjoergrrn_add(rrwlock_t *rrl, void *tag) 10320253Sjoerg{ 10420747Sdavidn rrw_node_t *rn; 10520747Sdavidn 10620253Sjoerg rn = kmem_alloc(sizeof (*rn), KM_SLEEP); 10720253Sjoerg rn->rn_rrl = rrl; 10820747Sdavidn rn->rn_next = tsd_get(rrw_tsd_key); 10920267Sjoerg rn->rn_tag = tag; 11020253Sjoerg VERIFY(tsd_set(rrw_tsd_key, rn) == 0); 11130259Scharnier} 11220253Sjoerg 11320253Sjoerg/* 11420253Sjoerg * If a node is found for 'rrl', then remove the node from this 11520253Sjoerg * thread's list and return TRUE; otherwise return FALSE. 
11620253Sjoerg */ 11720253Sjoergstatic boolean_t 11820253Sjoergrrn_find_and_remove(rrwlock_t *rrl, void *tag) 11920253Sjoerg{ 12020253Sjoerg rrw_node_t *rn; 12120253Sjoerg rrw_node_t *prev = NULL; 12252502Sdavidn 12352502Sdavidn if (refcount_count(&rrl->rr_linked_rcount) == 0) 12452502Sdavidn return (B_FALSE); 12552502Sdavidn 12656000Sdavidn for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { 12752502Sdavidn if (rn->rn_rrl == rrl && rn->rn_tag == tag) { 12852502Sdavidn if (prev) 12920253Sjoerg prev->rn_next = rn->rn_next; 13020267Sjoerg else 13120253Sjoerg VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); 13220267Sjoerg kmem_free(rn, sizeof (*rn)); 13320253Sjoerg return (B_TRUE); 13420253Sjoerg } 13520253Sjoerg prev = rn; 13620253Sjoerg } 13720253Sjoerg return (B_FALSE); 13820679Sdavidn} 13920253Sjoerg 14020253Sjoergvoid 14130259Scharnierrrw_init(rrwlock_t *rrl, boolean_t track_all) 14220253Sjoerg{ 14330259Scharnier mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); 14420253Sjoerg cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); 14520747Sdavidn rrl->rr_writer = NULL; 14620747Sdavidn refcount_create(&rrl->rr_anon_rcount); 14720253Sjoerg refcount_create(&rrl->rr_linked_rcount); 14820679Sdavidn rrl->rr_writer_wanted = B_FALSE; 14920253Sjoerg rrl->rr_track_all = track_all; 15020253Sjoerg} 15120253Sjoerg 15220253Sjoergvoid 15320253Sjoergrrw_destroy(rrwlock_t *rrl) 15420253Sjoerg{ 15520253Sjoerg mutex_destroy(&rrl->rr_lock); 15620253Sjoerg cv_destroy(&rrl->rr_cv); 15720253Sjoerg ASSERT(rrl->rr_writer == NULL); 15820253Sjoerg refcount_destroy(&rrl->rr_anon_rcount); 15920253Sjoerg refcount_destroy(&rrl->rr_linked_rcount); 16020253Sjoerg} 161124382Siedowse 162124382Siedowsevoid 16320253Sjoergrrw_enter_read(rrwlock_t *rrl, void *tag) 16420253Sjoerg{ 16520253Sjoerg mutex_enter(&rrl->rr_lock); 16620253Sjoerg#if !defined(DEBUG) && defined(_KERNEL) 167124382Siedowse if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && 16820253Sjoerg !rrl->rr_track_all) { 
16920253Sjoerg rrl->rr_anon_rcount.rc_count++; 17020253Sjoerg mutex_exit(&rrl->rr_lock); 17120253Sjoerg return; 17220253Sjoerg } 17320253Sjoerg DTRACE_PROBE(zfs__rrwfastpath__rdmiss); 17420253Sjoerg#endif 17520253Sjoerg ASSERT(rrl->rr_writer != curthread); 17620253Sjoerg ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); 17720253Sjoerg 17820253Sjoerg while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && 17920253Sjoerg refcount_is_zero(&rrl->rr_anon_rcount) && 18020253Sjoerg rrn_find(rrl) == NULL)) 18120253Sjoerg cv_wait(&rrl->rr_cv, &rrl->rr_lock); 18220253Sjoerg 18320253Sjoerg if (rrl->rr_writer_wanted || rrl->rr_track_all) { 18420253Sjoerg /* may or may not be a re-entrant enter */ 18520253Sjoerg rrn_add(rrl, tag); 18620253Sjoerg (void) refcount_add(&rrl->rr_linked_rcount, tag); 18720253Sjoerg } else { 18820253Sjoerg (void) refcount_add(&rrl->rr_anon_rcount, tag); 18920253Sjoerg } 19020253Sjoerg ASSERT(rrl->rr_writer == NULL); 19120253Sjoerg mutex_exit(&rrl->rr_lock); 19220253Sjoerg} 19330259Scharnier 19420267Sjoergvoid 19520253Sjoergrrw_enter_write(rrwlock_t *rrl) 19620253Sjoerg{ 19720253Sjoerg mutex_enter(&rrl->rr_lock); 19820253Sjoerg ASSERT(rrl->rr_writer != curthread); 19920253Sjoerg 20030259Scharnier while (refcount_count(&rrl->rr_anon_rcount) > 0 || 201124382Siedowse refcount_count(&rrl->rr_linked_rcount) > 0 || 202124382Siedowse rrl->rr_writer != NULL) { 203124382Siedowse rrl->rr_writer_wanted = B_TRUE; 204124382Siedowse cv_wait(&rrl->rr_cv, &rrl->rr_lock); 205124382Siedowse } 206124382Siedowse rrl->rr_writer_wanted = B_FALSE; 20720253Sjoerg rrl->rr_writer = curthread; 20820253Sjoerg mutex_exit(&rrl->rr_lock); 20920267Sjoerg} 21020267Sjoerg 21120267Sjoergvoid 21220267Sjoergrrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) 21320267Sjoerg{ 21420267Sjoerg if (rw == RW_READER) 21520747Sdavidn rrw_enter_read(rrl, tag); 21620747Sdavidn else 21720267Sjoerg rrw_enter_write(rrl); 21820747Sdavidn} 21920747Sdavidn 22020747Sdavidnvoid 
22120747Sdavidnrrw_exit(rrwlock_t *rrl, void *tag) 22220747Sdavidn{ 22320747Sdavidn mutex_enter(&rrl->rr_lock); 22420747Sdavidn#if !defined(DEBUG) && defined(_KERNEL) 22520267Sjoerg if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { 22620267Sjoerg rrl->rr_anon_rcount.rc_count--; 22720747Sdavidn if (rrl->rr_anon_rcount.rc_count == 0) 22820267Sjoerg cv_broadcast(&rrl->rr_cv); 22944229Sdavidn mutex_exit(&rrl->rr_lock); 23061957Sache return; 23130259Scharnier } 23220267Sjoerg DTRACE_PROBE(zfs__rrwfastpath__exitmiss); 23320267Sjoerg#endif 23420267Sjoerg ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || 23520267Sjoerg !refcount_is_zero(&rrl->rr_linked_rcount) || 23620267Sjoerg rrl->rr_writer != NULL); 23720267Sjoerg 23820747Sdavidn if (rrl->rr_writer == NULL) { 23920267Sjoerg int64_t count; 24020267Sjoerg if (rrn_find_and_remove(rrl, tag)) { 24120747Sdavidn count = refcount_remove(&rrl->rr_linked_rcount, tag); 24220267Sjoerg } else { 24320267Sjoerg ASSERT(!rrl->rr_track_all); 24420267Sjoerg count = refcount_remove(&rrl->rr_anon_rcount, tag); 24520267Sjoerg } 24620267Sjoerg if (count == 0) 24720267Sjoerg cv_broadcast(&rrl->rr_cv); 24820267Sjoerg } else { 24952502Sdavidn ASSERT(rrl->rr_writer == curthread); 25052502Sdavidn ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && 25152502Sdavidn refcount_is_zero(&rrl->rr_linked_rcount)); 25252502Sdavidn rrl->rr_writer = NULL; 25352502Sdavidn cv_broadcast(&rrl->rr_cv); 25420267Sjoerg } 25552502Sdavidn mutex_exit(&rrl->rr_lock); 25652502Sdavidn} 25752502Sdavidn 25852502Sdavidn/* 25956000Sdavidn * If the lock was created with track_all, rrw_held(RW_READER) will return 26052502Sdavidn * B_TRUE iff the current thread has the lock for reader. Otherwise it may 26120253Sjoerg * return B_TRUE if any thread has the lock for reader. 
26220253Sjoerg */ 26344229Sdavidnboolean_t 26430259Scharnierrrw_held(rrwlock_t *rrl, krw_t rw) 26520253Sjoerg{ 26620253Sjoerg boolean_t held; 26720253Sjoerg 26820747Sdavidn mutex_enter(&rrl->rr_lock); 26920747Sdavidn if (rw == RW_WRITER) { 27020747Sdavidn held = (rrl->rr_writer == curthread); 27120267Sjoerg } else { 27220253Sjoerg held = (!refcount_is_zero(&rrl->rr_anon_rcount) || 27320253Sjoerg rrn_find(rrl) != NULL); 27420253Sjoerg } 27520253Sjoerg mutex_exit(&rrl->rr_lock); 27620253Sjoerg 27720253Sjoerg return (held); 27820253Sjoerg} 27920253Sjoerg 28020253Sjoergvoid 28120253Sjoergrrw_tsd_destroy(void *arg) 28220253Sjoerg{ 28320253Sjoerg rrw_node_t *rn = arg; 28420253Sjoerg if (rn != NULL) { 28520253Sjoerg panic("thread %p terminating with rrw lock %p held", 28620253Sjoerg (void *)curthread, (void *)rn->rn_rrl); 28720253Sjoerg } 28844229Sdavidn} 28930259Scharnier 29020253Sjoerg/* 29120253Sjoerg * A reader-mostly lock implementation, tuning above reader-writer locks 29220253Sjoerg * for hightly parallel read acquisitions, while pessimizing writes. 29320253Sjoerg * 29420253Sjoerg * The idea is to split single busy lock into array of locks, so that 29520253Sjoerg * each reader can lock only one of them for read, depending on result 29620253Sjoerg * of simple hash function. That proportionally reduces lock congestion. 29720253Sjoerg * Writer same time has to sequentially aquire write on all the locks. 29820253Sjoerg * That makes write aquisition proportionally slower, but in places where 29920253Sjoerg * it is used (filesystem unmount) performance is not critical. 30020253Sjoerg * 30120253Sjoerg * All the functions below are direct wrappers around functions above. 
30220253Sjoerg */ 30320253Sjoergvoid 30420253Sjoergrrm_init(rrmlock_t *rrl, boolean_t track_all) 30520253Sjoerg{ 30620253Sjoerg int i; 30744229Sdavidn 30844229Sdavidn for (i = 0; i < RRM_NUM_LOCKS; i++) 30956000Sdavidn rrw_init(&rrl->locks[i], track_all); 31056000Sdavidn} 31120253Sjoerg 31244229Sdavidnvoid 31320253Sjoergrrm_destroy(rrmlock_t *rrl) 31420253Sjoerg{ 31520253Sjoerg int i; 31620253Sjoerg 31720253Sjoerg for (i = 0; i < RRM_NUM_LOCKS; i++) 31820253Sjoerg rrw_destroy(&rrl->locks[i]); 31920253Sjoerg} 32020253Sjoerg 32120253Sjoergvoid 32220253Sjoergrrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) 32320253Sjoerg{ 32420253Sjoerg if (rw == RW_READER) 32520253Sjoerg rrm_enter_read(rrl, tag); 32620253Sjoerg else 32720253Sjoerg rrm_enter_write(rrl); 32820253Sjoerg} 32920253Sjoerg 33020253Sjoerg/* 33130259Scharnier * This maps the current thread to a specific lock. Note that the lock 33220253Sjoerg * must be released by the same thread that acquired it. We do this 33320253Sjoerg * mapping by taking the thread pointer mod a prime number. We examine 33420253Sjoerg * only the low 32 bits of the thread pointer, because 32-bit division 33520253Sjoerg * is faster than 64-bit division, and the high 32 bits have little 33620253Sjoerg * entropy anyway. 
33720253Sjoerg */ 33820253Sjoerg#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) 33920253Sjoerg 34020253Sjoergvoid 34120253Sjoergrrm_enter_read(rrmlock_t *rrl, void *tag) 34220747Sdavidn{ 34320747Sdavidn rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); 34420253Sjoerg} 34520747Sdavidn 34620253Sjoergvoid 34720747Sdavidnrrm_enter_write(rrmlock_t *rrl) 34820253Sjoerg{ 34920253Sjoerg int i; 35020253Sjoerg 35122398Sdavidn for (i = 0; i < RRM_NUM_LOCKS; i++) 35220747Sdavidn rrw_enter_write(&rrl->locks[i]); 35320253Sjoerg} 35420747Sdavidn 35520253Sjoergvoid 35620253Sjoergrrm_exit(rrmlock_t *rrl, void *tag) 35720253Sjoerg{ 35820267Sjoerg int i; 35920253Sjoerg 360 if (rrl->locks[0].rr_writer == curthread) { 361 for (i = 0; i < RRM_NUM_LOCKS; i++) 362 rrw_exit(&rrl->locks[i], tag); 363 } else { 364 rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); 365 } 366} 367 368boolean_t 369rrm_held(rrmlock_t *rrl, krw_t rw) 370{ 371 if (rw == RW_WRITER) { 372 return (rrw_held(&rrl->locks[0], rw)); 373 } else { 374 return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); 375 } 376} 377