1180025Sdfr/*-
2180025Sdfr * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
3180025Sdfr * Authors: Doug Rabson <dfr@rabson.org>
4180025Sdfr * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
5180025Sdfr *
6180025Sdfr * Redistribution and use in source and binary forms, with or without
7180025Sdfr * modification, are permitted provided that the following conditions
8180025Sdfr * are met:
9180025Sdfr * 1. Redistributions of source code must retain the above copyright
10180025Sdfr *    notice, this list of conditions and the following disclaimer.
11180025Sdfr * 2. Redistributions in binary form must reproduce the above copyright
12180025Sdfr *    notice, this list of conditions and the following disclaimer in the
13180025Sdfr *    documentation and/or other materials provided with the distribution.
14180025Sdfr *
15180025Sdfr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16180025Sdfr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17180025Sdfr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18180025Sdfr * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19180025Sdfr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20180025Sdfr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21180025Sdfr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22180025Sdfr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23180025Sdfr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24180025Sdfr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25180025Sdfr * SUCH DAMAGE.
26180025Sdfr */
27180025Sdfr
28180025Sdfr#include <sys/cdefs.h>
29180025Sdfr__FBSDID("$FreeBSD$");
30180025Sdfr
31180025Sdfr#include <sys/param.h>
32180025Sdfr#include <sys/fcntl.h>
33193066Sjamie#include <sys/jail.h>
34180025Sdfr#include <sys/kernel.h>
35180025Sdfr#include <sys/limits.h>
36180025Sdfr#include <sys/lock.h>
37180025Sdfr#include <sys/lockf.h>
38180025Sdfr#include <sys/malloc.h>
39192578Srwatson#include <sys/mbuf.h>
40180025Sdfr#include <sys/mount.h>
41180025Sdfr#include <sys/mutex.h>
42180025Sdfr#include <sys/proc.h>
43214048Srmacklem#include <sys/socket.h>
44180025Sdfr#include <sys/syslog.h>
45180025Sdfr#include <sys/systm.h>
46180025Sdfr#include <sys/unistd.h>
47180025Sdfr#include <sys/vnode.h>
48180025Sdfr
49180025Sdfr#include <nfs/nfsproto.h>
50180025Sdfr#include <nfsclient/nfs.h>
51180025Sdfr#include <nfsclient/nfsmount.h>
52180025Sdfr
53180025Sdfr#include <nlm/nlm_prot.h>
54180025Sdfr#include <nlm/nlm.h>
55180025Sdfr
56180025Sdfr/*
57180025Sdfr * We need to keep track of the svid values used for F_FLOCK locks.
58180025Sdfr */
59180025Sdfrstruct nlm_file_svid {
60180025Sdfr	int		ns_refs;	/* thread count + 1 if active */
61180025Sdfr	int		ns_svid;	/* on-the-wire SVID for this file */
62180025Sdfr	struct ucred	*ns_ucred;	/* creds to use for lock recovery */
63180025Sdfr	void		*ns_id;		/* local struct file pointer */
64180025Sdfr	bool_t		ns_active;	/* TRUE if we own a lock */
65180025Sdfr	LIST_ENTRY(nlm_file_svid) ns_link;
66180025Sdfr};
67180025SdfrLIST_HEAD(nlm_file_svid_list, nlm_file_svid);
68180025Sdfr
69180025Sdfr#define NLM_SVID_HASH_SIZE	256
70180025Sdfrstruct nlm_file_svid_list nlm_file_svids[NLM_SVID_HASH_SIZE];
71180025Sdfr
72180025Sdfrstruct mtx nlm_svid_lock;
73180025Sdfrstatic struct unrhdr *nlm_svid_allocator;
74180025Sdfrstatic volatile u_int nlm_xid = 1;
75180025Sdfr
76180025Sdfrstatic int nlm_setlock(struct nlm_host *host, struct rpc_callextra *ext,
77180025Sdfr    rpcvers_t vers, struct timeval *timo, int retries,
78180025Sdfr    struct vnode *vp, int op, struct flock *fl, int flags,
79180025Sdfr    int svid, size_t fhlen, void *fh, off_t size, bool_t reclaim);
80180025Sdfrstatic int nlm_clearlock(struct nlm_host *host,  struct rpc_callextra *ext,
81180025Sdfr    rpcvers_t vers, struct timeval *timo, int retries,
82180025Sdfr    struct vnode *vp, int op, struct flock *fl, int flags,
83180025Sdfr    int svid, size_t fhlen, void *fh, off_t size);
84180025Sdfrstatic int nlm_getlock(struct nlm_host *host, struct rpc_callextra *ext,
85180025Sdfr    rpcvers_t vers, struct timeval *timo, int retries,
86180025Sdfr    struct vnode *vp, int op, struct flock *fl, int flags,
87180025Sdfr    int svid, size_t fhlen, void *fh, off_t size);
88180025Sdfrstatic int nlm_map_status(nlm4_stats stat);
89180025Sdfrstatic struct nlm_file_svid *nlm_find_svid(void *id);
90180025Sdfrstatic void nlm_free_svid(struct nlm_file_svid *nf);
91180025Sdfrstatic int nlm_init_lock(struct flock *fl, int flags, int svid,
92180025Sdfr    rpcvers_t vers, size_t fhlen, void *fh, off_t size,
93180025Sdfr    struct nlm4_lock *lock, char oh_space[32]);
94180025Sdfr
95180025Sdfrstatic void
96180025Sdfrnlm_client_init(void *dummy)
97180025Sdfr{
98180025Sdfr	int i;
99180025Sdfr
100180025Sdfr	mtx_init(&nlm_svid_lock, "NLM svid lock", NULL, MTX_DEF);
101239328Skib	/* pid_max cannot be greater than PID_MAX */
102180025Sdfr	nlm_svid_allocator = new_unrhdr(PID_MAX + 2, INT_MAX, &nlm_svid_lock);
103180025Sdfr	for (i = 0; i < NLM_SVID_HASH_SIZE; i++)
104180025Sdfr		LIST_INIT(&nlm_file_svids[i]);
105180025Sdfr}
106180025SdfrSYSINIT(nlm_client_init, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_client_init, NULL);
107180025Sdfr
108180025Sdfrstatic int
109180025Sdfrnlm_msg(struct thread *td, const char *server, const char *msg, int error)
110180025Sdfr{
111180025Sdfr	struct proc *p;
112180025Sdfr
113180025Sdfr	p = td ? td->td_proc : NULL;
114180025Sdfr	if (error) {
115180025Sdfr		tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server,
116180025Sdfr		    msg, error);
117180025Sdfr	} else {
118180025Sdfr		tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
119180025Sdfr	}
120180025Sdfr	return (0);
121180025Sdfr}
122180025Sdfr
123180025Sdfrstruct nlm_feedback_arg {
124180025Sdfr	bool_t	nf_printed;
125180025Sdfr	struct nfsmount *nf_nmp;
126180025Sdfr};
127180025Sdfr
128180025Sdfrstatic void
129180025Sdfrnlm_down(struct nlm_feedback_arg *nf, struct thread *td,
130180025Sdfr    const char *msg, int error)
131180025Sdfr{
132180025Sdfr	struct nfsmount *nmp = nf->nf_nmp;
133180025Sdfr
134180025Sdfr	if (nmp == NULL)
135180025Sdfr		return;
136180025Sdfr	mtx_lock(&nmp->nm_mtx);
137180025Sdfr	if (!(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
138180025Sdfr		nmp->nm_state |= NFSSTA_LOCKTIMEO;
139180025Sdfr		mtx_unlock(&nmp->nm_mtx);
140180025Sdfr		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
141180025Sdfr		    VQ_NOTRESPLOCK, 0);
142180025Sdfr	} else {
143180025Sdfr		mtx_unlock(&nmp->nm_mtx);
144180025Sdfr	}
145180025Sdfr
146180025Sdfr	nf->nf_printed = TRUE;
147180025Sdfr	nlm_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
148180025Sdfr}
149180025Sdfr
150180025Sdfrstatic void
151180025Sdfrnlm_up(struct nlm_feedback_arg *nf, struct thread *td,
152180025Sdfr    const char *msg)
153180025Sdfr{
154180025Sdfr	struct nfsmount *nmp = nf->nf_nmp;
155180025Sdfr
156180025Sdfr	if (!nf->nf_printed)
157180025Sdfr		return;
158180025Sdfr
159180025Sdfr	nlm_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
160180025Sdfr
161180025Sdfr	mtx_lock(&nmp->nm_mtx);
162180025Sdfr	if (nmp->nm_state & NFSSTA_LOCKTIMEO) {
163180025Sdfr		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
164180025Sdfr		mtx_unlock(&nmp->nm_mtx);
165180025Sdfr		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
166180025Sdfr		    VQ_NOTRESPLOCK, 1);
167180025Sdfr	} else {
168180025Sdfr		mtx_unlock(&nmp->nm_mtx);
169180025Sdfr	}
170180025Sdfr}
171180025Sdfr
172180025Sdfrstatic void
173180025Sdfrnlm_feedback(int type, int proc, void *arg)
174180025Sdfr{
175180025Sdfr	struct thread *td = curthread;
176180025Sdfr	struct nlm_feedback_arg *nf = (struct nlm_feedback_arg *) arg;
177180025Sdfr
178180025Sdfr	switch (type) {
179180025Sdfr	case FEEDBACK_REXMIT2:
180180025Sdfr	case FEEDBACK_RECONNECT:
181180025Sdfr		nlm_down(nf, td, "lockd not responding", 0);
182180025Sdfr		break;
183180025Sdfr
184180025Sdfr	case FEEDBACK_OK:
185180025Sdfr		nlm_up(nf, td, "lockd is alive again");
186180025Sdfr		break;
187180025Sdfr	}
188180025Sdfr}
189180025Sdfr
190180025Sdfr/*
191180025Sdfr * nlm_advlock --
192180025Sdfr *      NFS advisory byte-level locks.
193180025Sdfr */
194180025Sdfrstatic int
195180025Sdfrnlm_advlock_internal(struct vnode *vp, void *id, int op, struct flock *fl,
196180025Sdfr    int flags, bool_t reclaim, bool_t unlock_vp)
197180025Sdfr{
198180025Sdfr	struct thread *td = curthread;
199180025Sdfr	struct nfsmount *nmp;
200180025Sdfr	off_t size;
201180025Sdfr	size_t fhlen;
202180025Sdfr	union nfsfh fh;
203180025Sdfr	struct sockaddr *sa;
204180025Sdfr	struct sockaddr_storage ss;
205180025Sdfr	char servername[MNAMELEN];
206180025Sdfr	struct timeval timo;
207180025Sdfr	int retries;
208180025Sdfr	rpcvers_t vers;
209180025Sdfr	struct nlm_host *host;
210180025Sdfr	struct rpc_callextra ext;
211180025Sdfr	struct nlm_feedback_arg nf;
212180025Sdfr	AUTH *auth;
213302192Skib	struct ucred *cred, *cred1;
214180025Sdfr	struct nlm_file_svid *ns;
215180025Sdfr	int svid;
216180025Sdfr	int error;
217214048Srmacklem	int is_v3;
218180025Sdfr
219180025Sdfr	ASSERT_VOP_LOCKED(vp, "nlm_advlock_1");
220180025Sdfr
221216931Srmacklem	nmp = VFSTONFS(vp->v_mount);
222180025Sdfr	/*
223180025Sdfr	 * Push any pending writes to the server and flush our cache
224180025Sdfr	 * so that if we are contending with another machine for a
225180025Sdfr	 * file, we get whatever they wrote and vice-versa.
226180025Sdfr	 */
227180025Sdfr	if (op == F_SETLK || op == F_UNLCK)
228216931Srmacklem		nmp->nm_vinvalbuf(vp, V_SAVE, td, 1);
229180025Sdfr
230214048Srmacklem	strcpy(servername, nmp->nm_hostname);
231216931Srmacklem	nmp->nm_getinfo(vp, fh.fh_bytes, &fhlen, &ss, &is_v3, &size, &timo);
232180025Sdfr	sa = (struct sockaddr *) &ss;
233214048Srmacklem	if (is_v3 != 0)
234180025Sdfr		vers = NLM_VERS4;
235180025Sdfr	else
236180025Sdfr		vers = NLM_VERS;
237180025Sdfr
238180025Sdfr	if (nmp->nm_flag & NFSMNT_SOFT)
239180025Sdfr		retries = nmp->nm_retry;
240180025Sdfr	else
241180025Sdfr		retries = INT_MAX;
242180025Sdfr
243180025Sdfr	/*
244180025Sdfr	 * We need to switch to mount-point creds so that we can send
245302192Skib	 * packets from a privileged port.  Reference mnt_cred and
246302192Skib	 * switch to them before unlocking the vnode, since mount
247302192Skib	 * point could be unmounted right after unlock.
248180025Sdfr	 */
249180025Sdfr	cred = td->td_ucred;
250180025Sdfr	td->td_ucred = vp->v_mount->mnt_cred;
251302192Skib	crhold(td->td_ucred);
252302192Skib	if (unlock_vp)
253302192Skib		VOP_UNLOCK(vp, 0);
254180025Sdfr
255180025Sdfr	host = nlm_find_host_by_name(servername, sa, vers);
256180025Sdfr	auth = authunix_create(cred);
257180025Sdfr	memset(&ext, 0, sizeof(ext));
258180025Sdfr
259180025Sdfr	nf.nf_printed = FALSE;
260180025Sdfr	nf.nf_nmp = nmp;
261180025Sdfr	ext.rc_auth = auth;
262180025Sdfr
263180025Sdfr	ext.rc_feedback = nlm_feedback;
264180025Sdfr	ext.rc_feedback_arg = &nf;
265184588Sdfr	ext.rc_timers = NULL;
266180025Sdfr
267180025Sdfr	ns = NULL;
268180025Sdfr	if (flags & F_FLOCK) {
269180025Sdfr		ns = nlm_find_svid(id);
270180025Sdfr		KASSERT(fl->l_start == 0 && fl->l_len == 0,
271180025Sdfr		    ("F_FLOCK lock requests must be whole-file locks"));
272180025Sdfr		if (!ns->ns_ucred) {
273180025Sdfr			/*
274180025Sdfr			 * Remember the creds used for locking in case
275180025Sdfr			 * we need to recover the lock later.
276180025Sdfr			 */
277180025Sdfr			ns->ns_ucred = crdup(cred);
278180025Sdfr		}
279180025Sdfr		svid = ns->ns_svid;
280180025Sdfr	} else if (flags & F_REMOTE) {
281180025Sdfr		/*
282180025Sdfr		 * If we are recovering after a server restart or
283180025Sdfr		 * trashing locks on a force unmount, use the same
284180025Sdfr		 * svid as last time.
285180025Sdfr		 */
286180025Sdfr		svid = fl->l_pid;
287180025Sdfr	} else {
288180025Sdfr		svid = ((struct proc *) id)->p_pid;
289180025Sdfr	}
290180025Sdfr
291180025Sdfr	switch(op) {
292180025Sdfr	case F_SETLK:
293180025Sdfr		if ((flags & (F_FLOCK|F_WAIT)) == (F_FLOCK|F_WAIT)
294180025Sdfr		    && fl->l_type == F_WRLCK) {
295180025Sdfr			/*
296180025Sdfr			 * The semantics for flock(2) require that any
297180025Sdfr			 * shared lock on the file must be released
298180025Sdfr			 * before an exclusive lock is granted. The
299180025Sdfr			 * local locking code interprets this by
300180025Sdfr			 * unlocking the file before sleeping on a
301180025Sdfr			 * blocked exclusive lock request. We
302180025Sdfr			 * approximate this by first attempting
303180025Sdfr			 * non-blocking and if that fails, we unlock
304180025Sdfr			 * the file and block.
305180025Sdfr			 */
306180025Sdfr			error = nlm_setlock(host, &ext, vers, &timo, retries,
307180025Sdfr			    vp, F_SETLK, fl, flags & ~F_WAIT,
308180025Sdfr			    svid, fhlen, &fh.fh_bytes, size, reclaim);
309180025Sdfr			if (error == EAGAIN) {
310180025Sdfr				fl->l_type = F_UNLCK;
311180025Sdfr				error = nlm_clearlock(host, &ext, vers, &timo,
312180025Sdfr				    retries, vp, F_UNLCK, fl, flags,
313180025Sdfr				    svid, fhlen, &fh.fh_bytes, size);
314180025Sdfr				fl->l_type = F_WRLCK;
315180025Sdfr				if (!error) {
316180025Sdfr					mtx_lock(&nlm_svid_lock);
317180025Sdfr					if (ns->ns_active) {
318180025Sdfr						ns->ns_refs--;
319180025Sdfr						ns->ns_active = FALSE;
320180025Sdfr					}
321180025Sdfr					mtx_unlock(&nlm_svid_lock);
322180025Sdfr					flags |= F_WAIT;
323180025Sdfr					error = nlm_setlock(host, &ext, vers,
324180025Sdfr					    &timo, retries, vp, F_SETLK, fl,
325180025Sdfr					    flags, svid, fhlen, &fh.fh_bytes,
326180025Sdfr					    size, reclaim);
327180025Sdfr				}
328180025Sdfr			}
329180025Sdfr		} else {
330180025Sdfr			error = nlm_setlock(host, &ext, vers, &timo, retries,
331180025Sdfr			    vp, op, fl, flags, svid, fhlen, &fh.fh_bytes,
332180025Sdfr			    size, reclaim);
333180025Sdfr		}
334180025Sdfr		if (!error && ns) {
335180025Sdfr			mtx_lock(&nlm_svid_lock);
336180025Sdfr			if (!ns->ns_active) {
337180025Sdfr				/*
338180025Sdfr				 * Add one to the reference count to
339180025Sdfr				 * hold onto the SVID for the lifetime
340180025Sdfr				 * of the lock. Note that since
341180025Sdfr				 * F_FLOCK only supports whole-file
342180025Sdfr				 * locks, there can only be one active
343180025Sdfr				 * lock for this SVID.
344180025Sdfr				 */
345180025Sdfr				ns->ns_refs++;
346180025Sdfr				ns->ns_active = TRUE;
347180025Sdfr			}
348180025Sdfr			mtx_unlock(&nlm_svid_lock);
349180025Sdfr		}
350180025Sdfr		break;
351180025Sdfr
352180025Sdfr	case F_UNLCK:
353180025Sdfr		error = nlm_clearlock(host, &ext, vers, &timo, retries,
354180025Sdfr		    vp, op, fl, flags, svid, fhlen, &fh.fh_bytes, size);
355180025Sdfr		if (!error && ns) {
356180025Sdfr			mtx_lock(&nlm_svid_lock);
357180025Sdfr			if (ns->ns_active) {
358180025Sdfr				ns->ns_refs--;
359180025Sdfr				ns->ns_active = FALSE;
360180025Sdfr			}
361180025Sdfr			mtx_unlock(&nlm_svid_lock);
362180025Sdfr		}
363180025Sdfr		break;
364180025Sdfr
365180025Sdfr	case F_GETLK:
366180025Sdfr		error = nlm_getlock(host, &ext, vers, &timo, retries,
367180025Sdfr		    vp, op, fl, flags, svid, fhlen, &fh.fh_bytes, size);
368180025Sdfr		break;
369180025Sdfr
370180025Sdfr	default:
371180025Sdfr		error = EINVAL;
372180025Sdfr		break;
373180025Sdfr	}
374180025Sdfr
375180025Sdfr	if (ns)
376180025Sdfr		nlm_free_svid(ns);
377180025Sdfr
378302192Skib	cred1 = td->td_ucred;
379180025Sdfr	td->td_ucred = cred;
380302192Skib	crfree(cred1);
381180025Sdfr	AUTH_DESTROY(auth);
382180025Sdfr
383180025Sdfr	nlm_host_release(host);
384180025Sdfr
385180025Sdfr	return (error);
386180025Sdfr}
387180025Sdfr
388180025Sdfrint
389180025Sdfrnlm_advlock(struct vop_advlock_args *ap)
390180025Sdfr{
391180025Sdfr
392180025Sdfr	return (nlm_advlock_internal(ap->a_vp, ap->a_id, ap->a_op, ap->a_fl,
393180025Sdfr		ap->a_flags, FALSE, TRUE));
394180025Sdfr}
395180025Sdfr
396180025Sdfr/*
397180025Sdfr * Set the creds of td to the creds of the given lock's owner. The new
398180025Sdfr * creds reference count will be incremented via crhold. The caller is
399180025Sdfr * responsible for calling crfree and restoring td's original creds.
400180025Sdfr */
401180025Sdfrstatic void
402180025Sdfrnlm_set_creds_for_lock(struct thread *td, struct flock *fl)
403180025Sdfr{
404180025Sdfr	int i;
405180025Sdfr	struct nlm_file_svid *ns;
406180025Sdfr	struct proc *p;
407180025Sdfr	struct ucred *cred;
408180025Sdfr
409180025Sdfr	cred = NULL;
410180025Sdfr	if (fl->l_pid > PID_MAX) {
411180025Sdfr		/*
412180025Sdfr		 * If this was originally a F_FLOCK-style lock, we
413180025Sdfr		 * recorded the creds used when it was originally
414180025Sdfr		 * locked in the nlm_file_svid structure.
415180025Sdfr		 */
416180025Sdfr		mtx_lock(&nlm_svid_lock);
417180025Sdfr		for (i = 0; i < NLM_SVID_HASH_SIZE; i++) {
418180025Sdfr			for (ns = LIST_FIRST(&nlm_file_svids[i]); ns;
419180025Sdfr			     ns = LIST_NEXT(ns, ns_link)) {
420180025Sdfr				if (ns->ns_svid == fl->l_pid) {
421180025Sdfr					cred = crhold(ns->ns_ucred);
422180025Sdfr					break;
423180025Sdfr				}
424180025Sdfr			}
425180025Sdfr		}
426180025Sdfr		mtx_unlock(&nlm_svid_lock);
427180025Sdfr	} else {
428180025Sdfr		/*
429180025Sdfr		 * This lock is owned by a process. Get a reference to
430180025Sdfr		 * the process creds.
431180025Sdfr		 */
432180025Sdfr		p = pfind(fl->l_pid);
433180025Sdfr		if (p) {
434180025Sdfr			cred = crhold(p->p_ucred);
435180025Sdfr			PROC_UNLOCK(p);
436180025Sdfr		}
437180025Sdfr	}
438180025Sdfr
439180025Sdfr	/*
440180025Sdfr	 * If we can't find a cred, fall back on the recovery
441180025Sdfr	 * thread's cred.
442180025Sdfr	 */
443180025Sdfr	if (!cred) {
444180025Sdfr		cred = crhold(td->td_ucred);
445180025Sdfr	}
446180025Sdfr
447180025Sdfr	td->td_ucred = cred;
448180025Sdfr}
449180025Sdfr
450180025Sdfrstatic int
451180025Sdfrnlm_reclaim_free_lock(struct vnode *vp, struct flock *fl, void *arg)
452180025Sdfr{
453180025Sdfr	struct flock newfl;
454180025Sdfr	struct thread *td = curthread;
455180025Sdfr	struct ucred *oldcred;
456180025Sdfr	int error;
457180025Sdfr
458180025Sdfr	newfl = *fl;
459180025Sdfr	newfl.l_type = F_UNLCK;
460180025Sdfr
461180025Sdfr	oldcred = td->td_ucred;
462180025Sdfr	nlm_set_creds_for_lock(td, &newfl);
463180025Sdfr
464180025Sdfr	error = nlm_advlock_internal(vp, NULL, F_UNLCK, &newfl, F_REMOTE,
465180025Sdfr	    FALSE, FALSE);
466180025Sdfr
467180025Sdfr	crfree(td->td_ucred);
468180025Sdfr	td->td_ucred = oldcred;
469180025Sdfr
470180025Sdfr	return (error);
471180025Sdfr}
472180025Sdfr
473180025Sdfrint
474180025Sdfrnlm_reclaim(struct vop_reclaim_args *ap)
475180025Sdfr{
476180025Sdfr
477180025Sdfr	nlm_cancel_wait(ap->a_vp);
478180025Sdfr	lf_iteratelocks_vnode(ap->a_vp, nlm_reclaim_free_lock, NULL);
479180025Sdfr	return (0);
480180025Sdfr}
481180025Sdfr
/*
 * State shared between nlm_client_recovery() and its per-lock
 * callback.
 */
struct nlm_recovery_context {
	/* Host we are recovering. */
	struct nlm_host	*nr_host;
	/* Remote NSM state sampled when this recovery pass started. */
	int		nr_state;
};
486180025Sdfr
487180025Sdfrstatic int
488180025Sdfrnlm_client_recover_lock(struct vnode *vp, struct flock *fl, void *arg)
489180025Sdfr{
490180025Sdfr	struct nlm_recovery_context *nr = (struct nlm_recovery_context *) arg;
491180025Sdfr	struct thread *td = curthread;
492180025Sdfr	struct ucred *oldcred;
493180025Sdfr	int state, error;
494180025Sdfr
495180025Sdfr	/*
496180025Sdfr	 * If the remote NSM state changes during recovery, the host
497180025Sdfr	 * must have rebooted a second time. In that case, we must
498180025Sdfr	 * restart the recovery.
499180025Sdfr	 */
500180025Sdfr	state = nlm_host_get_state(nr->nr_host);
501180025Sdfr	if (nr->nr_state != state)
502180025Sdfr		return (ERESTART);
503180025Sdfr
504180025Sdfr	error = vn_lock(vp, LK_SHARED);
505180025Sdfr	if (error)
506180025Sdfr		return (error);
507180025Sdfr
508180025Sdfr	oldcred = td->td_ucred;
509180025Sdfr	nlm_set_creds_for_lock(td, fl);
510180025Sdfr
511180025Sdfr	error = nlm_advlock_internal(vp, NULL, F_SETLK, fl, F_REMOTE,
512180025Sdfr	    TRUE, TRUE);
513180025Sdfr
514180025Sdfr	crfree(td->td_ucred);
515180025Sdfr	td->td_ucred = oldcred;
516180025Sdfr
517180025Sdfr	return (error);
518180025Sdfr}
519180025Sdfr
520180025Sdfrvoid
521180025Sdfrnlm_client_recovery(struct nlm_host *host)
522180025Sdfr{
523180025Sdfr	struct nlm_recovery_context nr;
524180025Sdfr	int sysid, error;
525180025Sdfr
526180025Sdfr	sysid = NLM_SYSID_CLIENT | nlm_host_get_sysid(host);
527180025Sdfr	do {
528180025Sdfr		nr.nr_host = host;
529180025Sdfr		nr.nr_state = nlm_host_get_state(host);
530180025Sdfr		error = lf_iteratelocks_sysid(sysid,
531180025Sdfr		    nlm_client_recover_lock, &nr);
532180025Sdfr	} while (error == ERESTART);
533180025Sdfr}
534180025Sdfr
535180025Sdfrstatic void
536180025Sdfrnlm_convert_to_nlm_lock(struct nlm_lock *dst, struct nlm4_lock *src)
537180025Sdfr{
538180025Sdfr
539180025Sdfr	dst->caller_name = src->caller_name;
540180025Sdfr	dst->fh = src->fh;
541180025Sdfr	dst->oh = src->oh;
542180025Sdfr	dst->svid = src->svid;
543180025Sdfr	dst->l_offset = src->l_offset;
544180025Sdfr	dst->l_len = src->l_len;
545180025Sdfr}
546180025Sdfr
547180025Sdfrstatic void
548180025Sdfrnlm_convert_to_nlm4_holder(struct nlm4_holder *dst, struct nlm_holder *src)
549180025Sdfr{
550180025Sdfr
551180025Sdfr	dst->exclusive = src->exclusive;
552180025Sdfr	dst->svid = src->svid;
553180025Sdfr	dst->oh = src->oh;
554180025Sdfr	dst->l_offset = src->l_offset;
555180025Sdfr	dst->l_len = src->l_len;
556180025Sdfr}
557180025Sdfr
558180025Sdfrstatic void
559180025Sdfrnlm_convert_to_nlm4_res(struct nlm4_res *dst, struct nlm_res *src)
560180025Sdfr{
561180025Sdfr	dst->cookie = src->cookie;
562180025Sdfr	dst->stat.stat = (enum nlm4_stats) src->stat.stat;
563180025Sdfr}
564180025Sdfr
565180025Sdfrstatic enum clnt_stat
566180025Sdfrnlm_test_rpc(rpcvers_t vers, nlm4_testargs *args, nlm4_testres *res, CLIENT *client,
567180025Sdfr    struct rpc_callextra *ext, struct timeval timo)
568180025Sdfr{
569180025Sdfr	if (vers == NLM_VERS4) {
570180025Sdfr		return nlm4_test_4(args, res, client, ext, timo);
571180025Sdfr	} else {
572180025Sdfr		nlm_testargs args1;
573180025Sdfr		nlm_testres res1;
574180025Sdfr		enum clnt_stat stat;
575180025Sdfr
576180025Sdfr		args1.cookie = args->cookie;
577180025Sdfr		args1.exclusive = args->exclusive;
578180025Sdfr		nlm_convert_to_nlm_lock(&args1.alock, &args->alock);
579180025Sdfr		memset(&res1, 0, sizeof(res1));
580180025Sdfr
581180025Sdfr		stat = nlm_test_1(&args1, &res1, client, ext, timo);
582180025Sdfr
583180025Sdfr		if (stat == RPC_SUCCESS) {
584180025Sdfr			res->cookie = res1.cookie;
585180025Sdfr			res->stat.stat = (enum nlm4_stats) res1.stat.stat;
586180025Sdfr			if (res1.stat.stat == nlm_denied)
587180025Sdfr				nlm_convert_to_nlm4_holder(
588180025Sdfr					&res->stat.nlm4_testrply_u.holder,
589180025Sdfr					&res1.stat.nlm_testrply_u.holder);
590180025Sdfr		}
591180025Sdfr
592180025Sdfr		return (stat);
593180025Sdfr	}
594180025Sdfr}
595180025Sdfr
596180025Sdfrstatic enum clnt_stat
597180025Sdfrnlm_lock_rpc(rpcvers_t vers, nlm4_lockargs *args, nlm4_res *res, CLIENT *client,
598180025Sdfr    struct rpc_callextra *ext, struct timeval timo)
599180025Sdfr{
600180025Sdfr	if (vers == NLM_VERS4) {
601180025Sdfr		return nlm4_lock_4(args, res, client, ext, timo);
602180025Sdfr	} else {
603180025Sdfr		nlm_lockargs args1;
604180025Sdfr		nlm_res res1;
605180025Sdfr		enum clnt_stat stat;
606180025Sdfr
607180025Sdfr		args1.cookie = args->cookie;
608180025Sdfr		args1.block = args->block;
609180025Sdfr		args1.exclusive = args->exclusive;
610180025Sdfr		nlm_convert_to_nlm_lock(&args1.alock, &args->alock);
611180025Sdfr		args1.reclaim = args->reclaim;
612180025Sdfr		args1.state = args->state;
613180025Sdfr		memset(&res1, 0, sizeof(res1));
614180025Sdfr
615180025Sdfr		stat = nlm_lock_1(&args1, &res1, client, ext, timo);
616180025Sdfr
617180025Sdfr		if (stat == RPC_SUCCESS) {
618180025Sdfr			nlm_convert_to_nlm4_res(res, &res1);
619180025Sdfr		}
620180025Sdfr
621180025Sdfr		return (stat);
622180025Sdfr	}
623180025Sdfr}
624180025Sdfr
625180025Sdfrstatic enum clnt_stat
626180025Sdfrnlm_cancel_rpc(rpcvers_t vers, nlm4_cancargs *args, nlm4_res *res, CLIENT *client,
627180025Sdfr    struct rpc_callextra *ext, struct timeval timo)
628180025Sdfr{
629180025Sdfr	if (vers == NLM_VERS4) {
630180025Sdfr		return nlm4_cancel_4(args, res, client, ext, timo);
631180025Sdfr	} else {
632180025Sdfr		nlm_cancargs args1;
633180025Sdfr		nlm_res res1;
634180025Sdfr		enum clnt_stat stat;
635180025Sdfr
636180025Sdfr		args1.cookie = args->cookie;
637180025Sdfr		args1.block = args->block;
638180025Sdfr		args1.exclusive = args->exclusive;
639180025Sdfr		nlm_convert_to_nlm_lock(&args1.alock, &args->alock);
640180025Sdfr		memset(&res1, 0, sizeof(res1));
641180025Sdfr
642180025Sdfr		stat = nlm_cancel_1(&args1, &res1, client, ext, timo);
643180025Sdfr
644180025Sdfr		if (stat == RPC_SUCCESS) {
645180025Sdfr			nlm_convert_to_nlm4_res(res, &res1);
646180025Sdfr		}
647180025Sdfr
648180025Sdfr		return (stat);
649180025Sdfr	}
650180025Sdfr}
651180025Sdfr
652180025Sdfrstatic enum clnt_stat
653180025Sdfrnlm_unlock_rpc(rpcvers_t vers, nlm4_unlockargs *args, nlm4_res *res, CLIENT *client,
654180025Sdfr    struct rpc_callextra *ext, struct timeval timo)
655180025Sdfr{
656180025Sdfr	if (vers == NLM_VERS4) {
657180025Sdfr		return nlm4_unlock_4(args, res, client, ext, timo);
658180025Sdfr	} else {
659180025Sdfr		nlm_unlockargs args1;
660180025Sdfr		nlm_res res1;
661180025Sdfr		enum clnt_stat stat;
662180025Sdfr
663180025Sdfr		args1.cookie = args->cookie;
664180025Sdfr		nlm_convert_to_nlm_lock(&args1.alock, &args->alock);
665180025Sdfr		memset(&res1, 0, sizeof(res1));
666180025Sdfr
667180025Sdfr		stat = nlm_unlock_1(&args1, &res1, client, ext, timo);
668180025Sdfr
669180025Sdfr		if (stat == RPC_SUCCESS) {
670180025Sdfr			nlm_convert_to_nlm4_res(res, &res1);
671180025Sdfr		}
672180025Sdfr
673180025Sdfr		return (stat);
674180025Sdfr	}
675180025Sdfr}
676180025Sdfr
677180025Sdfr/*
678180025Sdfr * Called after a lock request (set or clear) succeeded. We record the
679180025Sdfr * details in the local lock manager. Note that since the remote
680180025Sdfr * server has granted the lock, we can be sure that it doesn't
681180025Sdfr * conflict with any other locks we have in the local lock manager.
682180025Sdfr *
683180025Sdfr * Since it is possible that host may also make NLM client requests to
684180025Sdfr * our NLM server, we use a different sysid value to record our own
685180025Sdfr * client locks.
686180025Sdfr *
687180025Sdfr * Note that since it is possible for us to receive replies from the
688180025Sdfr * server in a different order than the locks were granted (e.g. if
689180025Sdfr * many local threads are contending for the same lock), we must use a
690180025Sdfr * blocking operation when registering with the local lock manager.
691180025Sdfr * We expect that any actual wait will be rare and short hence we
692180025Sdfr * ignore signals for this.
693180025Sdfr */
694180025Sdfrstatic void
695180025Sdfrnlm_record_lock(struct vnode *vp, int op, struct flock *fl,
696180025Sdfr    int svid, int sysid, off_t size)
697180025Sdfr{
698180025Sdfr	struct vop_advlockasync_args a;
699180025Sdfr	struct flock newfl;
700180025Sdfr	int error;
701180025Sdfr
702180025Sdfr	a.a_vp = vp;
703180025Sdfr	a.a_id = NULL;
704180025Sdfr	a.a_op = op;
705180025Sdfr	a.a_fl = &newfl;
706180025Sdfr	a.a_flags = F_REMOTE|F_WAIT|F_NOINTR;
707180025Sdfr	a.a_task = NULL;
708180025Sdfr	a.a_cookiep = NULL;
709180025Sdfr	newfl.l_start = fl->l_start;
710180025Sdfr	newfl.l_len = fl->l_len;
711180025Sdfr	newfl.l_type = fl->l_type;
712180025Sdfr	newfl.l_whence = fl->l_whence;
713180025Sdfr	newfl.l_pid = svid;
714180025Sdfr	newfl.l_sysid = NLM_SYSID_CLIENT | sysid;
715180025Sdfr
716302209Skib	for (;;) {
717302209Skib		error = lf_advlockasync(&a, &vp->v_lockf, size);
718302209Skib		if (error == EDEADLK) {
719302209Skib			/*
720302209Skib			 * Locks are associated with the processes and
721302209Skib			 * not with threads.  Suppose we have two
722302209Skib			 * threads A1 A2 in one process, A1 locked
723302209Skib			 * file f1, A2 is locking file f2, and A1 is
724302209Skib			 * unlocking f1. Then remote server may
725302209Skib			 * already unlocked f1, while local still not
726302209Skib			 * yet scheduled A1 to make the call to local
727302209Skib			 * advlock manager. The process B owns lock on
728302209Skib			 * f2 and issued the lock on f1.  Remote would
729302209Skib			 * grant B the request on f1, but local would
730302209Skib			 * return EDEADLK.
731302209Skib			*/
732302209Skib			pause("nlmdlk", 1);
733302209Skib			/* XXXKIB allow suspend */
734302209Skib		} else if (error == EINTR) {
735302209Skib			/*
736302209Skib			 * lf_purgelocks() might wake up the lock
737302209Skib			 * waiter and removed our lock graph edges.
738302209Skib			 * There is no sense in re-trying recording
739302209Skib			 * the lock to the local manager after
740302209Skib			 * reclaim.
741302209Skib			 */
742302209Skib			error = 0;
743302209Skib			break;
744302209Skib		} else
745302209Skib			break;
746302209Skib	}
747193434Sed	KASSERT(error == 0 || error == ENOENT,
748193432Sdfr	    ("Failed to register NFS lock locally - error=%d", error));
749180025Sdfr}
750180025Sdfr
751180025Sdfrstatic int
752180025Sdfrnlm_setlock(struct nlm_host *host, struct rpc_callextra *ext,
753180025Sdfr    rpcvers_t vers, struct timeval *timo, int retries,
754180025Sdfr    struct vnode *vp, int op, struct flock *fl, int flags,
755180025Sdfr    int svid, size_t fhlen, void *fh, off_t size, bool_t reclaim)
756180025Sdfr{
757180025Sdfr	struct nlm4_lockargs args;
758180025Sdfr	char oh_space[32];
759180025Sdfr	struct nlm4_res res;
760180025Sdfr	u_int xid;
761180025Sdfr	CLIENT *client;
762180025Sdfr	enum clnt_stat stat;
763180025Sdfr	int retry, block, exclusive;
764180025Sdfr	void *wait_handle = NULL;
765180025Sdfr	int error;
766180025Sdfr
767180025Sdfr	memset(&args, 0, sizeof(args));
768180025Sdfr	memset(&res, 0, sizeof(res));
769180025Sdfr
770180025Sdfr	block = (flags & F_WAIT) ? TRUE : FALSE;
771180025Sdfr	exclusive = (fl->l_type == F_WRLCK);
772180025Sdfr
773180025Sdfr	error = nlm_init_lock(fl, flags, svid, vers, fhlen, fh, size,
774180025Sdfr	    &args.alock, oh_space);
775180025Sdfr	if (error)
776180025Sdfr		return (error);
777180025Sdfr	args.block = block;
778180025Sdfr	args.exclusive = exclusive;
779180025Sdfr	args.reclaim = reclaim;
780180025Sdfr	args.state = nlm_nsm_state;
781180025Sdfr
782180025Sdfr	retry = 5*hz;
783180025Sdfr	for (;;) {
784184588Sdfr		client = nlm_host_get_rpc(host, FALSE);
785180025Sdfr		if (!client)
786180025Sdfr			return (ENOLCK); /* XXX retry? */
787180025Sdfr
788180025Sdfr		if (block)
789180025Sdfr			wait_handle = nlm_register_wait_lock(&args.alock, vp);
790180025Sdfr
791180025Sdfr		xid = atomic_fetchadd_int(&nlm_xid, 1);
792180025Sdfr		args.cookie.n_len = sizeof(xid);
793180025Sdfr		args.cookie.n_bytes = (char*) &xid;
794180025Sdfr
795180025Sdfr		stat = nlm_lock_rpc(vers, &args, &res, client, ext, *timo);
796180025Sdfr
797180025Sdfr		CLNT_RELEASE(client);
798180025Sdfr
799180025Sdfr		if (stat != RPC_SUCCESS) {
800180025Sdfr			if (block)
801180025Sdfr				nlm_deregister_wait_lock(wait_handle);
802180025Sdfr			if (retries) {
803180025Sdfr				retries--;
804180025Sdfr				continue;
805180025Sdfr			}
806180025Sdfr			return (EINVAL);
807180025Sdfr		}
808180025Sdfr
809180025Sdfr		/*
810180025Sdfr		 * Free res.cookie.
811180025Sdfr		 */
812180025Sdfr		xdr_free((xdrproc_t) xdr_nlm4_res, &res);
813180025Sdfr
814180025Sdfr		if (block && res.stat.stat != nlm4_blocked)
815180025Sdfr			nlm_deregister_wait_lock(wait_handle);
816180025Sdfr
817180025Sdfr		if (res.stat.stat == nlm4_denied_grace_period) {
818180025Sdfr			/*
819180025Sdfr			 * The server has recently rebooted and is
820180025Sdfr			 * giving old clients a change to reclaim
821180025Sdfr			 * their locks. Wait for a few seconds and try
822180025Sdfr			 * again.
823180025Sdfr			 */
824180025Sdfr			error = tsleep(&args, PCATCH, "nlmgrace", retry);
825180025Sdfr			if (error && error != EWOULDBLOCK)
826180025Sdfr				return (error);
827180025Sdfr			retry = 2*retry;
828180025Sdfr			if (retry > 30*hz)
829180025Sdfr				retry = 30*hz;
830180025Sdfr			continue;
831180025Sdfr		}
832180025Sdfr
833180025Sdfr		if (block && res.stat.stat == nlm4_blocked) {
834180025Sdfr			/*
835180025Sdfr			 * The server should call us back with a
836180025Sdfr			 * granted message when the lock succeeds. In
837180025Sdfr			 * order to deal with broken servers, lost
838180025Sdfr			 * granted messages and server reboots, we
839180025Sdfr			 * will also re-try every few seconds.
840180025Sdfr			 */
841180025Sdfr			error = nlm_wait_lock(wait_handle, retry);
842180025Sdfr			if (error == EWOULDBLOCK) {
843180025Sdfr				retry = 2*retry;
844180025Sdfr				if (retry > 30*hz)
845180025Sdfr					retry = 30*hz;
846180025Sdfr				continue;
847180025Sdfr			}
848180025Sdfr			if (error) {
849180025Sdfr				/*
850180025Sdfr				 * We need to call the server to
851180025Sdfr				 * cancel our lock request.
852180025Sdfr				 */
853180025Sdfr				nlm4_cancargs cancel;
854180025Sdfr
855180025Sdfr				memset(&cancel, 0, sizeof(cancel));
856180025Sdfr
857180025Sdfr				xid = atomic_fetchadd_int(&nlm_xid, 1);
858180025Sdfr				cancel.cookie.n_len = sizeof(xid);
859180025Sdfr				cancel.cookie.n_bytes = (char*) &xid;
860180025Sdfr				cancel.block = block;
861180025Sdfr				cancel.exclusive = exclusive;
862180025Sdfr				cancel.alock = args.alock;
863180025Sdfr
864180025Sdfr				do {
865184588Sdfr					client = nlm_host_get_rpc(host, FALSE);
866180025Sdfr					if (!client)
867180025Sdfr						/* XXX retry? */
868180025Sdfr						return (ENOLCK);
869180025Sdfr
870180025Sdfr					stat = nlm_cancel_rpc(vers, &cancel,
871180025Sdfr					    &res, client, ext, *timo);
872180025Sdfr
873180025Sdfr					CLNT_RELEASE(client);
874180025Sdfr
875180025Sdfr					if (stat != RPC_SUCCESS) {
876180025Sdfr						/*
877180025Sdfr						 * We need to cope
878180025Sdfr						 * with temporary
879180025Sdfr						 * network partitions
880180025Sdfr						 * as well as server
881180025Sdfr						 * reboots. This means
882180025Sdfr						 * we have to keep
883180025Sdfr						 * trying to cancel
884180025Sdfr						 * until the server
885180025Sdfr						 * wakes up again.
886180025Sdfr						 */
887180025Sdfr						pause("nlmcancel", 10*hz);
888180025Sdfr					}
889180025Sdfr				} while (stat != RPC_SUCCESS);
890180025Sdfr
891180025Sdfr				/*
892180025Sdfr				 * Free res.cookie.
893180025Sdfr				 */
894180025Sdfr				xdr_free((xdrproc_t) xdr_nlm4_res, &res);
895180025Sdfr
896180025Sdfr				switch (res.stat.stat) {
897180025Sdfr				case nlm_denied:
898180025Sdfr					/*
899180025Sdfr					 * There was nothing
900180025Sdfr					 * to cancel. We are
901180025Sdfr					 * going to go ahead
902180025Sdfr					 * and assume we got
903180025Sdfr					 * the lock.
904180025Sdfr					 */
905180025Sdfr					error = 0;
906180025Sdfr					break;
907180025Sdfr
908180025Sdfr				case nlm4_denied_grace_period:
909180025Sdfr					/*
910180025Sdfr					 * The server has
911180025Sdfr					 * recently rebooted -
912180025Sdfr					 * treat this as a
913180025Sdfr					 * successful
914180025Sdfr					 * cancellation.
915180025Sdfr					 */
916180025Sdfr					break;
917180025Sdfr
918180025Sdfr				case nlm4_granted:
919180025Sdfr					/*
920180025Sdfr					 * We managed to
921180025Sdfr					 * cancel.
922180025Sdfr					 */
923180025Sdfr					break;
924180025Sdfr
925180025Sdfr				default:
926180025Sdfr					/*
927180025Sdfr					 * Broken server
928180025Sdfr					 * implementation -
929180025Sdfr					 * can't really do
930180025Sdfr					 * anything here.
931180025Sdfr					 */
932180025Sdfr					break;
933180025Sdfr				}
934180025Sdfr
935180025Sdfr			}
936180025Sdfr		} else {
937180025Sdfr			error = nlm_map_status(res.stat.stat);
938180025Sdfr		}
939180025Sdfr
940180025Sdfr		if (!error && !reclaim) {
941180025Sdfr			nlm_record_lock(vp, op, fl, args.alock.svid,
942180025Sdfr			    nlm_host_get_sysid(host), size);
943180025Sdfr			nlm_host_monitor(host, 0);
944180025Sdfr		}
945180025Sdfr
946180025Sdfr		return (error);
947180025Sdfr	}
948180025Sdfr}
949180025Sdfr
950180025Sdfrstatic int
951180025Sdfrnlm_clearlock(struct nlm_host *host, struct rpc_callextra *ext,
952180025Sdfr    rpcvers_t vers, struct timeval *timo, int retries,
953180025Sdfr    struct vnode *vp, int op, struct flock *fl, int flags,
954180025Sdfr    int svid, size_t fhlen, void *fh, off_t size)
955180025Sdfr{
956180025Sdfr	struct nlm4_unlockargs args;
957180025Sdfr	char oh_space[32];
958180025Sdfr	struct nlm4_res res;
959180025Sdfr	u_int xid;
960180025Sdfr	CLIENT *client;
961180025Sdfr	enum clnt_stat stat;
962180025Sdfr	int error;
963180025Sdfr
964180025Sdfr	memset(&args, 0, sizeof(args));
965180025Sdfr	memset(&res, 0, sizeof(res));
966180025Sdfr
967180025Sdfr	error = nlm_init_lock(fl, flags, svid, vers, fhlen, fh, size,
968180025Sdfr	    &args.alock, oh_space);
969180025Sdfr	if (error)
970180025Sdfr		return (error);
971180025Sdfr
972180025Sdfr	for (;;) {
973184588Sdfr		client = nlm_host_get_rpc(host, FALSE);
974180025Sdfr		if (!client)
975180025Sdfr			return (ENOLCK); /* XXX retry? */
976180025Sdfr
977180025Sdfr		xid = atomic_fetchadd_int(&nlm_xid, 1);
978180025Sdfr		args.cookie.n_len = sizeof(xid);
979180025Sdfr		args.cookie.n_bytes = (char*) &xid;
980180025Sdfr
981180025Sdfr		stat = nlm_unlock_rpc(vers, &args, &res, client, ext, *timo);
982180025Sdfr
983180025Sdfr		CLNT_RELEASE(client);
984180025Sdfr
985180025Sdfr		if (stat != RPC_SUCCESS) {
986180025Sdfr			if (retries) {
987180025Sdfr				retries--;
988180025Sdfr				continue;
989180025Sdfr			}
990180025Sdfr			return (EINVAL);
991180025Sdfr		}
992180025Sdfr
993180025Sdfr		/*
994180025Sdfr		 * Free res.cookie.
995180025Sdfr		 */
996180025Sdfr		xdr_free((xdrproc_t) xdr_nlm4_res, &res);
997180025Sdfr
998180025Sdfr		if (res.stat.stat == nlm4_denied_grace_period) {
999180025Sdfr			/*
1000180025Sdfr			 * The server has recently rebooted and is
1001180025Sdfr			 * giving old clients a change to reclaim
1002180025Sdfr			 * their locks. Wait for a few seconds and try
1003180025Sdfr			 * again.
1004180025Sdfr			 */
1005180025Sdfr			error = tsleep(&args, PCATCH, "nlmgrace", 5*hz);
1006180025Sdfr			if (error && error != EWOULDBLOCK)
1007180025Sdfr				return (error);
1008180025Sdfr			continue;
1009180025Sdfr		}
1010180025Sdfr
1011180025Sdfr		/*
1012180025Sdfr		 * If we are being called via nlm_reclaim (which will
1013180025Sdfr		 * use the F_REMOTE flag), don't record the lock
1014180025Sdfr		 * operation in the local lock manager since the vnode
1015180025Sdfr		 * is going away.
1016180025Sdfr		 */
1017180025Sdfr		if (!(flags & F_REMOTE))
1018180025Sdfr			nlm_record_lock(vp, op, fl, args.alock.svid,
1019180025Sdfr			    nlm_host_get_sysid(host), size);
1020180025Sdfr
1021180025Sdfr		return (0);
1022180025Sdfr	}
1023180025Sdfr}
1024180025Sdfr
1025180025Sdfrstatic int
1026180025Sdfrnlm_getlock(struct nlm_host *host, struct rpc_callextra *ext,
1027180025Sdfr    rpcvers_t vers, struct timeval *timo, int retries,
1028180025Sdfr    struct vnode *vp, int op, struct flock *fl, int flags,
1029180025Sdfr    int svid, size_t fhlen, void *fh, off_t size)
1030180025Sdfr{
1031180025Sdfr	struct nlm4_testargs args;
1032180025Sdfr	char oh_space[32];
1033180025Sdfr	struct nlm4_testres res;
1034180025Sdfr	u_int xid;
1035180025Sdfr	CLIENT *client;
1036180025Sdfr	enum clnt_stat stat;
1037180025Sdfr	int exclusive;
1038180025Sdfr	int error;
1039180025Sdfr
1040180025Sdfr	KASSERT(!(flags & F_FLOCK), ("unexpected F_FLOCK for F_GETLK"));
1041180025Sdfr
1042180025Sdfr	memset(&args, 0, sizeof(args));
1043180025Sdfr	memset(&res, 0, sizeof(res));
1044180025Sdfr
1045180025Sdfr	exclusive = (fl->l_type == F_WRLCK);
1046180025Sdfr
1047180025Sdfr	error = nlm_init_lock(fl, flags, svid, vers, fhlen, fh, size,
1048180025Sdfr	    &args.alock, oh_space);
1049180025Sdfr	if (error)
1050180025Sdfr		return (error);
1051180025Sdfr	args.exclusive = exclusive;
1052180025Sdfr
1053180025Sdfr	for (;;) {
1054184588Sdfr		client = nlm_host_get_rpc(host, FALSE);
1055180025Sdfr		if (!client)
1056180025Sdfr			return (ENOLCK); /* XXX retry? */
1057180025Sdfr
1058180025Sdfr		xid = atomic_fetchadd_int(&nlm_xid, 1);
1059180025Sdfr		args.cookie.n_len = sizeof(xid);
1060180025Sdfr		args.cookie.n_bytes = (char*) &xid;
1061180025Sdfr
1062180025Sdfr		stat = nlm_test_rpc(vers, &args, &res, client, ext, *timo);
1063180025Sdfr
1064180025Sdfr		CLNT_RELEASE(client);
1065180025Sdfr
1066180025Sdfr		if (stat != RPC_SUCCESS) {
1067180025Sdfr			if (retries) {
1068180025Sdfr				retries--;
1069180025Sdfr				continue;
1070180025Sdfr			}
1071180025Sdfr			return (EINVAL);
1072180025Sdfr		}
1073180025Sdfr
1074180025Sdfr		if (res.stat.stat == nlm4_denied_grace_period) {
1075180025Sdfr			/*
1076180025Sdfr			 * The server has recently rebooted and is
1077180025Sdfr			 * giving old clients a change to reclaim
1078180025Sdfr			 * their locks. Wait for a few seconds and try
1079180025Sdfr			 * again.
1080180025Sdfr			 */
1081180025Sdfr			xdr_free((xdrproc_t) xdr_nlm4_testres, &res);
1082180025Sdfr			error = tsleep(&args, PCATCH, "nlmgrace", 5*hz);
1083180025Sdfr			if (error && error != EWOULDBLOCK)
1084180025Sdfr				return (error);
1085180025Sdfr			continue;
1086180025Sdfr		}
1087180025Sdfr
1088180025Sdfr		if (res.stat.stat == nlm4_denied) {
1089180025Sdfr			struct nlm4_holder *h =
1090180025Sdfr				&res.stat.nlm4_testrply_u.holder;
1091180025Sdfr			fl->l_start = h->l_offset;
1092180025Sdfr			fl->l_len = h->l_len;
1093180025Sdfr			fl->l_pid = h->svid;
1094180025Sdfr			if (h->exclusive)
1095180025Sdfr				fl->l_type = F_WRLCK;
1096180025Sdfr			else
1097180025Sdfr				fl->l_type = F_RDLCK;
1098180025Sdfr			fl->l_whence = SEEK_SET;
1099180025Sdfr			fl->l_sysid = 0;
1100180025Sdfr		} else {
1101180025Sdfr			fl->l_type = F_UNLCK;
1102180025Sdfr		}
1103180025Sdfr
1104180025Sdfr		xdr_free((xdrproc_t) xdr_nlm4_testres, &res);
1105180025Sdfr
1106180025Sdfr		return (0);
1107180025Sdfr	}
1108180025Sdfr}
1109180025Sdfr
1110180025Sdfrstatic int
1111180025Sdfrnlm_map_status(nlm4_stats stat)
1112180025Sdfr{
1113180025Sdfr	switch (stat) {
1114180025Sdfr	case nlm4_granted:
1115180025Sdfr		return (0);
1116180025Sdfr
1117180025Sdfr	case nlm4_denied:
1118180025Sdfr		return (EAGAIN);
1119180025Sdfr
1120180025Sdfr	case nlm4_denied_nolocks:
1121180025Sdfr		return (ENOLCK);
1122180025Sdfr
1123180025Sdfr	case nlm4_deadlck:
1124180025Sdfr		return (EDEADLK);
1125180025Sdfr
1126180025Sdfr	case nlm4_rofs:
1127180025Sdfr		return (EROFS);
1128180025Sdfr
1129180025Sdfr	case nlm4_stale_fh:
1130180025Sdfr		return (ESTALE);
1131180025Sdfr
1132180025Sdfr	case nlm4_fbig:
1133180025Sdfr		return (EFBIG);
1134180025Sdfr
1135180025Sdfr	case nlm4_failed:
1136180025Sdfr		return (EACCES);
1137180025Sdfr
1138180025Sdfr	default:
1139180025Sdfr		return (EINVAL);
1140180025Sdfr	}
1141180025Sdfr}
1142180025Sdfr
1143180025Sdfrstatic struct nlm_file_svid *
1144180025Sdfrnlm_find_svid(void *id)
1145180025Sdfr{
1146180025Sdfr	struct nlm_file_svid *ns, *newns;
1147180025Sdfr	int h;
1148180025Sdfr
1149180025Sdfr	h = (((uintptr_t) id) >> 7) % NLM_SVID_HASH_SIZE;
1150180025Sdfr
1151180025Sdfr	mtx_lock(&nlm_svid_lock);
1152180025Sdfr	LIST_FOREACH(ns, &nlm_file_svids[h], ns_link) {
1153180025Sdfr		if (ns->ns_id == id) {
1154180025Sdfr			ns->ns_refs++;
1155180025Sdfr			break;
1156180025Sdfr		}
1157180025Sdfr	}
1158180025Sdfr	mtx_unlock(&nlm_svid_lock);
1159180025Sdfr	if (!ns) {
1160180025Sdfr		int svid = alloc_unr(nlm_svid_allocator);
1161180025Sdfr		newns = malloc(sizeof(struct nlm_file_svid), M_NLM,
1162180025Sdfr		    M_WAITOK);
1163180025Sdfr		newns->ns_refs = 1;
1164180025Sdfr		newns->ns_id = id;
1165180025Sdfr		newns->ns_svid = svid;
1166180025Sdfr		newns->ns_ucred = NULL;
1167180025Sdfr		newns->ns_active = FALSE;
1168180025Sdfr
1169180025Sdfr		/*
1170180025Sdfr		 * We need to check for a race with some other
1171180025Sdfr		 * thread allocating a svid for this file.
1172180025Sdfr		 */
1173180025Sdfr		mtx_lock(&nlm_svid_lock);
1174180025Sdfr		LIST_FOREACH(ns, &nlm_file_svids[h], ns_link) {
1175180025Sdfr			if (ns->ns_id == id) {
1176180025Sdfr				ns->ns_refs++;
1177180025Sdfr				break;
1178180025Sdfr			}
1179180025Sdfr		}
1180180025Sdfr		if (ns) {
1181180025Sdfr			mtx_unlock(&nlm_svid_lock);
1182180025Sdfr			free_unr(nlm_svid_allocator, newns->ns_svid);
1183180025Sdfr			free(newns, M_NLM);
1184180025Sdfr		} else {
1185180025Sdfr			LIST_INSERT_HEAD(&nlm_file_svids[h], newns,
1186180025Sdfr			    ns_link);
1187180025Sdfr			ns = newns;
1188180025Sdfr			mtx_unlock(&nlm_svid_lock);
1189180025Sdfr		}
1190180025Sdfr	}
1191180025Sdfr
1192180025Sdfr	return (ns);
1193180025Sdfr}
1194180025Sdfr
1195180025Sdfrstatic void
1196180025Sdfrnlm_free_svid(struct nlm_file_svid *ns)
1197180025Sdfr{
1198180025Sdfr
1199180025Sdfr	mtx_lock(&nlm_svid_lock);
1200180025Sdfr	ns->ns_refs--;
1201180025Sdfr	if (!ns->ns_refs) {
1202180025Sdfr		KASSERT(!ns->ns_active, ("Freeing active SVID"));
1203180025Sdfr		LIST_REMOVE(ns, ns_link);
1204180025Sdfr		mtx_unlock(&nlm_svid_lock);
1205180025Sdfr		free_unr(nlm_svid_allocator, ns->ns_svid);
1206180025Sdfr		if (ns->ns_ucred)
1207180025Sdfr			crfree(ns->ns_ucred);
1208180025Sdfr		free(ns, M_NLM);
1209180025Sdfr	} else {
1210180025Sdfr		mtx_unlock(&nlm_svid_lock);
1211180025Sdfr	}
1212180025Sdfr}
1213180025Sdfr
1214180025Sdfrstatic int
1215180025Sdfrnlm_init_lock(struct flock *fl, int flags, int svid,
1216180025Sdfr    rpcvers_t vers, size_t fhlen, void *fh, off_t size,
1217180025Sdfr    struct nlm4_lock *lock, char oh_space[32])
1218180025Sdfr{
1219180025Sdfr	size_t oh_len;
1220180025Sdfr	off_t start, len;
1221180025Sdfr
1222180025Sdfr	if (fl->l_whence == SEEK_END) {
1223180025Sdfr		if (size > OFF_MAX
1224180025Sdfr		    || (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
1225180025Sdfr			return (EOVERFLOW);
1226180025Sdfr		start = size + fl->l_start;
1227180025Sdfr	} else if (fl->l_whence == SEEK_SET || fl->l_whence == SEEK_CUR) {
1228180025Sdfr		start = fl->l_start;
1229180025Sdfr	} else {
1230180025Sdfr		return (EINVAL);
1231180025Sdfr	}
1232180025Sdfr	if (start < 0)
1233180025Sdfr		return (EINVAL);
1234180025Sdfr	if (fl->l_len < 0) {
1235180025Sdfr		len = -fl->l_len;
1236180025Sdfr		start -= len;
1237180025Sdfr		if (start < 0)
1238180025Sdfr			return (EINVAL);
1239180025Sdfr	} else {
1240180025Sdfr		len = fl->l_len;
1241180025Sdfr	}
1242180025Sdfr
1243180025Sdfr	if (vers == NLM_VERS) {
1244180025Sdfr		/*
1245180025Sdfr		 * Enforce range limits on V1 locks
1246180025Sdfr		 */
1247180025Sdfr		if (start > 0xffffffffLL || len > 0xffffffffLL)
1248180025Sdfr			return (EOVERFLOW);
1249180025Sdfr	}
1250180025Sdfr
1251193066Sjamie	snprintf(oh_space, 32, "%d@", svid);
1252180025Sdfr	oh_len = strlen(oh_space);
1253193066Sjamie	getcredhostname(NULL, oh_space + oh_len, 32 - oh_len);
1254193066Sjamie	oh_len = strlen(oh_space);
1255180025Sdfr
1256180025Sdfr	memset(lock, 0, sizeof(*lock));
1257194118Sjamie	lock->caller_name = prison0.pr_hostname;
1258180025Sdfr	lock->fh.n_len = fhlen;
1259180025Sdfr	lock->fh.n_bytes = fh;
1260180025Sdfr	lock->oh.n_len = oh_len;
1261180025Sdfr	lock->oh.n_bytes = oh_space;
1262180025Sdfr	lock->svid = svid;
1263180025Sdfr	lock->l_offset = start;
1264180025Sdfr	lock->l_len = len;
1265180025Sdfr
1266180025Sdfr	return (0);
1267180025Sdfr}
1268