1185029Spjd/* 2185029Spjd * CDDL HEADER START 3185029Spjd * 4185029Spjd * The contents of this file are subject to the terms of the 5185029Spjd * Common Development and Distribution License (the "License"). 6185029Spjd * You may not use this file except in compliance with the License. 7185029Spjd * 8185029Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9185029Spjd * or http://www.opensolaris.org/os/licensing. 10185029Spjd * See the License for the specific language governing permissions 11185029Spjd * and limitations under the License. 12185029Spjd * 13185029Spjd * When distributing Covered Code, include this CDDL HEADER in each 14185029Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15185029Spjd * If applicable, add the following below this CDDL HEADER, with the 16185029Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17185029Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18185029Spjd * 19185029Spjd * CDDL HEADER END 20185029Spjd */ 21185029Spjd/* 22211932Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23185029Spjd * Use is subject to license terms. 24185029Spjd */ 25248571Smm/* 26248571Smm * Copyright (c) 2012 by Delphix. All rights reserved. 27248571Smm */ 28185029Spjd 29185029Spjd#include <sys/refcount.h> 30185029Spjd#include <sys/rrwlock.h> 31185029Spjd 32185029Spjd/* 33185029Spjd * This file contains the implementation of a re-entrant read 34185029Spjd * reader/writer lock (aka "rrwlock"). 35185029Spjd * 36185029Spjd * This is a normal reader/writer lock with the additional feature 37185029Spjd * of allowing threads who have already obtained a read lock to 38185029Spjd * re-enter another read lock (re-entrant read) - even if there are 39185029Spjd * waiting writers. 40185029Spjd * 41185029Spjd * Callers who have not obtained a read lock give waiting writers priority. 42185029Spjd * 43185029Spjd * The rrwlock_t lock does not allow re-entrant writers, nor does it 44185029Spjd * allow a re-entrant mix of reads and writes (that is, it does not 45185029Spjd * allow a caller who has already obtained a read lock to be able to 46185029Spjd * then grab a write lock without first dropping all read locks, and 47185029Spjd * vice versa). 48185029Spjd * 49185029Spjd * The rrwlock_t uses tsd (thread specific data) to keep a list of 50185029Spjd * nodes (rrw_node_t), where each node keeps track of which specific 51185029Spjd * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering 52185029Spjd * should be rare, a thread that grabs multiple reads on the same rrwlock_t 53185029Spjd * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the 54185029Spjd * tsd list can represent a different rrwlock_t. This allows a thread 55185029Spjd * to enter multiple and unique rrwlock_ts for read locks at the same time. 56185029Spjd * 57185029Spjd * Since using tsd exposes some overhead, the rrwlock_t only needs to 58185029Spjd * keep tsd data when writers are waiting. If no writers are waiting, then 59185029Spjd * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd 60185029Spjd * is needed. Once a writer attempts to grab the lock, readers then 61185029Spjd * keep tsd data and bump the linked readers count (rr_linked_rcount). 62185029Spjd * 63185029Spjd * If there are waiting writers and there are anonymous readers, then a 64185029Spjd * reader doesn't know if it is a re-entrant lock. But since it may be one, 65185029Spjd * we allow the read to proceed (otherwise it could deadlock). Since once 66185029Spjd * waiting writers are active, readers no longer bump the anonymous count, 67185029Spjd * the anonymous readers will eventually flush themselves out. At this point, 68185029Spjd * readers will be able to tell if they are a re-entrant lock (have a 69185029Spjd * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then 70185029Spjd * we must let the proceed. If they are not, then the reader blocks for the 71185029Spjd * waiting writers. Hence, we do not starve writers. 72185029Spjd */ 73185029Spjd 74185029Spjd/* global key for TSD */ 75185029Spjduint_t rrw_tsd_key; 76185029Spjd 77185029Spjdtypedef struct rrw_node { 78248571Smm struct rrw_node *rn_next; 79248571Smm rrwlock_t *rn_rrl; 80248571Smm void *rn_tag; 81185029Spjd} rrw_node_t; 82185029Spjd 83185029Spjdstatic rrw_node_t * 84185029Spjdrrn_find(rrwlock_t *rrl) 85185029Spjd{ 86185029Spjd rrw_node_t *rn; 87185029Spjd 88185029Spjd if (refcount_count(&rrl->rr_linked_rcount) == 0) 89211948Spjd return (NULL); 90185029Spjd 91185029Spjd for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { 92185029Spjd if (rn->rn_rrl == rrl) 93185029Spjd return (rn); 94185029Spjd } 95185029Spjd return (NULL); 96185029Spjd} 97185029Spjd 98185029Spjd/* 99185029Spjd * Add a node to the head of the singly linked list. 100185029Spjd */ 101185029Spjdstatic void 102248571Smmrrn_add(rrwlock_t *rrl, void *tag) 103185029Spjd{ 104185029Spjd rrw_node_t *rn; 105185029Spjd 106185029Spjd rn = kmem_alloc(sizeof (*rn), KM_SLEEP); 107185029Spjd rn->rn_rrl = rrl; 108185029Spjd rn->rn_next = tsd_get(rrw_tsd_key); 109248571Smm rn->rn_tag = tag; 110185029Spjd VERIFY(tsd_set(rrw_tsd_key, rn) == 0); 111185029Spjd} 112185029Spjd 113185029Spjd/* 114185029Spjd * If a node is found for 'rrl', then remove the node from this 115185029Spjd * thread's list and return TRUE; otherwise return FALSE. 116185029Spjd */ 117185029Spjdstatic boolean_t 118248571Smmrrn_find_and_remove(rrwlock_t *rrl, void *tag) 119185029Spjd{ 120185029Spjd rrw_node_t *rn; 121185029Spjd rrw_node_t *prev = NULL; 122185029Spjd 123185029Spjd if (refcount_count(&rrl->rr_linked_rcount) == 0) 124185029Spjd return (B_FALSE); 125185029Spjd 126185029Spjd for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { 127248571Smm if (rn->rn_rrl == rrl && rn->rn_tag == tag) { 128185029Spjd if (prev) 129185029Spjd prev->rn_next = rn->rn_next; 130185029Spjd else 131185029Spjd VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); 132185029Spjd kmem_free(rn, sizeof (*rn)); 133185029Spjd return (B_TRUE); 134185029Spjd } 135185029Spjd prev = rn; 136185029Spjd } 137185029Spjd return (B_FALSE); 138185029Spjd} 139185029Spjd 140185029Spjdvoid 141248571Smmrrw_init(rrwlock_t *rrl, boolean_t track_all) 142185029Spjd{ 143185029Spjd mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); 144185029Spjd cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); 145185029Spjd rrl->rr_writer = NULL; 146185029Spjd refcount_create(&rrl->rr_anon_rcount); 147185029Spjd refcount_create(&rrl->rr_linked_rcount); 148185029Spjd rrl->rr_writer_wanted = B_FALSE; 149248571Smm rrl->rr_track_all = track_all; 150185029Spjd} 151185029Spjd 152185029Spjdvoid 153185029Spjdrrw_destroy(rrwlock_t *rrl) 154185029Spjd{ 155185029Spjd mutex_destroy(&rrl->rr_lock); 156185029Spjd cv_destroy(&rrl->rr_cv); 157185029Spjd ASSERT(rrl->rr_writer == NULL); 158185029Spjd refcount_destroy(&rrl->rr_anon_rcount); 159185029Spjd refcount_destroy(&rrl->rr_linked_rcount); 160185029Spjd} 161185029Spjd 162248571Smmvoid 163185029Spjdrrw_enter_read(rrwlock_t *rrl, void *tag) 164185029Spjd{ 165185029Spjd mutex_enter(&rrl->rr_lock); 166211932Smm#if !defined(DEBUG) && defined(_KERNEL) 167248571Smm if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && 168248571Smm !rrl->rr_track_all) { 169211932Smm rrl->rr_anon_rcount.rc_count++; 170211932Smm mutex_exit(&rrl->rr_lock); 171211932Smm return; 172211932Smm } 173211932Smm DTRACE_PROBE(zfs__rrwfastpath__rdmiss); 174211932Smm#endif 175185029Spjd ASSERT(rrl->rr_writer != curthread); 176185029Spjd ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); 177185029Spjd 178248571Smm while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && 179185029Spjd refcount_is_zero(&rrl->rr_anon_rcount) && 180185029Spjd rrn_find(rrl) == NULL)) 181185029Spjd cv_wait(&rrl->rr_cv, &rrl->rr_lock); 182185029Spjd 183248571Smm if (rrl->rr_writer_wanted || rrl->rr_track_all) { 184185029Spjd /* may or may not be a re-entrant enter */ 185248571Smm rrn_add(rrl, tag); 186185029Spjd (void) refcount_add(&rrl->rr_linked_rcount, tag); 187185029Spjd } else { 188185029Spjd (void) refcount_add(&rrl->rr_anon_rcount, tag); 189185029Spjd } 190185029Spjd ASSERT(rrl->rr_writer == NULL); 191185029Spjd mutex_exit(&rrl->rr_lock); 192185029Spjd} 193185029Spjd 194248571Smmvoid 195185029Spjdrrw_enter_write(rrwlock_t *rrl) 196185029Spjd{ 197185029Spjd mutex_enter(&rrl->rr_lock); 198185029Spjd ASSERT(rrl->rr_writer != curthread); 199185029Spjd 200185029Spjd while (refcount_count(&rrl->rr_anon_rcount) > 0 || 201185029Spjd refcount_count(&rrl->rr_linked_rcount) > 0 || 202185029Spjd rrl->rr_writer != NULL) { 203185029Spjd rrl->rr_writer_wanted = B_TRUE; 204185029Spjd cv_wait(&rrl->rr_cv, &rrl->rr_lock); 205185029Spjd } 206185029Spjd rrl->rr_writer_wanted = B_FALSE; 207185029Spjd rrl->rr_writer = curthread; 208185029Spjd mutex_exit(&rrl->rr_lock); 209185029Spjd} 210185029Spjd 211185029Spjdvoid 212185029Spjdrrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) 213185029Spjd{ 214185029Spjd if (rw == RW_READER) 215185029Spjd rrw_enter_read(rrl, tag); 216185029Spjd else 217185029Spjd rrw_enter_write(rrl); 218185029Spjd} 219185029Spjd 220185029Spjdvoid 221185029Spjdrrw_exit(rrwlock_t *rrl, void *tag) 222185029Spjd{ 223185029Spjd mutex_enter(&rrl->rr_lock); 224211932Smm#if !defined(DEBUG) && defined(_KERNEL) 225211932Smm if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { 226211932Smm rrl->rr_anon_rcount.rc_count--; 227211932Smm if (rrl->rr_anon_rcount.rc_count == 0) 228211932Smm cv_broadcast(&rrl->rr_cv); 229211932Smm mutex_exit(&rrl->rr_lock); 230211932Smm return; 231211932Smm } 232211932Smm DTRACE_PROBE(zfs__rrwfastpath__exitmiss); 233211932Smm#endif 234185029Spjd ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || 235185029Spjd !refcount_is_zero(&rrl->rr_linked_rcount) || 236185029Spjd rrl->rr_writer != NULL); 237185029Spjd 238185029Spjd if (rrl->rr_writer == NULL) { 239211932Smm int64_t count; 240248571Smm if (rrn_find_and_remove(rrl, tag)) { 241211932Smm count = refcount_remove(&rrl->rr_linked_rcount, tag); 242248571Smm } else { 243248571Smm ASSERT(!rrl->rr_track_all); 244211932Smm count = refcount_remove(&rrl->rr_anon_rcount, tag); 245248571Smm } 246211932Smm if (count == 0) 247211932Smm cv_broadcast(&rrl->rr_cv); 248185029Spjd } else { 249185029Spjd ASSERT(rrl->rr_writer == curthread); 250185029Spjd ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && 251185029Spjd refcount_is_zero(&rrl->rr_linked_rcount)); 252185029Spjd rrl->rr_writer = NULL; 253185029Spjd cv_broadcast(&rrl->rr_cv); 254185029Spjd } 255185029Spjd mutex_exit(&rrl->rr_lock); 256185029Spjd} 257185029Spjd 258248571Smm/* 259248571Smm * If the lock was created with track_all, rrw_held(RW_READER) will return 260248571Smm * B_TRUE iff the current thread has the lock for reader. Otherwise it may 261248571Smm * return B_TRUE if any thread has the lock for reader. 262248571Smm */ 263185029Spjdboolean_t 264185029Spjdrrw_held(rrwlock_t *rrl, krw_t rw) 265185029Spjd{ 266185029Spjd boolean_t held; 267185029Spjd 268185029Spjd mutex_enter(&rrl->rr_lock); 269185029Spjd if (rw == RW_WRITER) { 270185029Spjd held = (rrl->rr_writer == curthread); 271185029Spjd } else { 272185029Spjd held = (!refcount_is_zero(&rrl->rr_anon_rcount) || 273248571Smm rrn_find(rrl) != NULL); 274185029Spjd } 275185029Spjd mutex_exit(&rrl->rr_lock); 276185029Spjd 277185029Spjd return (held); 278185029Spjd} 279248571Smm 280248571Smmvoid 281248571Smmrrw_tsd_destroy(void *arg) 282248571Smm{ 283248571Smm rrw_node_t *rn = arg; 284248571Smm if (rn != NULL) { 285248571Smm panic("thread %p terminating with rrw lock %p held", 286248571Smm (void *)curthread, (void *)rn->rn_rrl); 287248571Smm } 288248571Smm} 289