1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1991, 1993, 1995
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Rick Macklem at The University of Guelph.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35
36#include <sys/cdefs.h>
37/*
38 * Socket operations for use by nfs
39 */
40
41#include "opt_kgssapi.h"
42#include "opt_nfs.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kernel.h>
47#include <sys/limits.h>
48#include <sys/lock.h>
49#include <sys/malloc.h>
50#include <sys/mbuf.h>
51#include <sys/mount.h>
52#include <sys/mutex.h>
53#include <sys/proc.h>
54#include <sys/signalvar.h>
55#include <sys/syscallsubr.h>
56#include <sys/sysctl.h>
57#include <sys/syslog.h>
58#include <sys/vnode.h>
59
60#include <rpc/rpc.h>
61#include <rpc/krpc.h>
62
63#include <kgssapi/krb5/kcrypto.h>
64
65#include <fs/nfs/nfsport.h>
66
67#ifdef KDTRACE_HOOKS
68#include <sys/dtrace_bsd.h>
69
70dtrace_nfsclient_nfs23_start_probe_func_t
71		dtrace_nfscl_nfs234_start_probe;
72
73dtrace_nfsclient_nfs23_done_probe_func_t
74		dtrace_nfscl_nfs234_done_probe;
75
76/*
77 * Registered probes by RPC type.
78 */
79uint32_t	nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
80uint32_t	nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];
81
82uint32_t	nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
83uint32_t	nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];
84
85uint32_t	nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
86uint32_t	nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
87#endif
88
89NFSSTATESPINLOCK;
90NFSREQSPINLOCK;
91NFSDLOCKMUTEX;
92NFSCLSTATEMUTEX;
93extern struct nfsstatsv1 nfsstatsv1;
94extern struct nfsreqhead nfsd_reqq;
95extern int nfscl_ticks;
96extern void (*ncl_call_invalcaches)(struct vnode *);
97extern int nfs_numnfscbd;
98extern int nfscl_debuglevel;
99extern int nfsrv_lease;
100
101SVCPOOL		*nfscbd_pool;
102int		nfs_bufpackets = 4;
103static int	nfsrv_gsscallbackson = 0;
104static int	nfs_reconnects;
105static int	nfs3_jukebox_delay = 10;
106static int	nfs_skip_wcc_data_onerr = 1;
107static int	nfs_dsretries = 2;
108static struct timespec	nfs_trylater_max = {
109	.tv_sec		= NFS_TRYLATERDEL,
110	.tv_nsec	= 0,
111};
112
113SYSCTL_DECL(_vfs_nfs);
114
115SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
116    "Buffer reservation size 2 < x < 64");
117SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
118    "Number of times the nfs client has had to reconnect");
119SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
120    "Number of seconds to delay a retry after receiving EJUKEBOX");
121SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
122    "Disable weak cache consistency checking when server returns an error");
123SYSCTL_INT(_vfs_nfs, OID_AUTO, dsretries, CTLFLAG_RW, &nfs_dsretries, 0,
124    "Number of retries for a DS RPC before failure");
125
/*
 * Forward declarations of the up/down notification helpers.
 * NOTE(review): the parameter names are descriptive only; those of
 * nfs_down()/nfs_up() are inferred from the calls in nfs_feedback(),
 * those of nfs_msg() are a guess -- confirm against the definitions.
 */
static void	nfs_down(struct nfsmount *nmp, struct thread *td,
		    const char *msg, int error, int flags);
static void	nfs_up(struct nfsmount *nmp, struct thread *td,
		    const char *msg, int flags, int tprintfmsg);
static int	nfs_msg(struct thread *td, const char *server,
		    const char *msg, int error);
131
132struct nfs_cached_auth {
133	int		ca_refs; /* refcount, including 1 from the cache */
134	uid_t		ca_uid;	 /* uid that corresponds to this auth */
135	AUTH		*ca_auth; /* RPC auth handle */
136};
137
138static int nfsv2_procid[NFS_V3NPROCS] = {
139	NFSV2PROC_NULL,
140	NFSV2PROC_GETATTR,
141	NFSV2PROC_SETATTR,
142	NFSV2PROC_LOOKUP,
143	NFSV2PROC_NOOP,
144	NFSV2PROC_READLINK,
145	NFSV2PROC_READ,
146	NFSV2PROC_WRITE,
147	NFSV2PROC_CREATE,
148	NFSV2PROC_MKDIR,
149	NFSV2PROC_SYMLINK,
150	NFSV2PROC_CREATE,
151	NFSV2PROC_REMOVE,
152	NFSV2PROC_RMDIR,
153	NFSV2PROC_RENAME,
154	NFSV2PROC_LINK,
155	NFSV2PROC_READDIR,
156	NFSV2PROC_NOOP,
157	NFSV2PROC_STATFS,
158	NFSV2PROC_NOOP,
159	NFSV2PROC_NOOP,
160	NFSV2PROC_NOOP,
161};
162
163/*
164 * This static array indicates that a NFSv4 RPC should use
165 * RPCSEC_GSS, if the mount indicates that via sec=krb5[ip].
166 * System RPCs that do not use file handles will be false
167 * in this array so that they will use AUTH_SYS when the
168 * "syskrb5" mount option is specified, along with
169 * "sec=krb5[ip]".
170 */
171static bool nfscl_use_gss[NFSV42_NPROCS] = {
172	true,
173	true,
174	true,
175	true,
176	true,
177	true,
178	true,
179	true,
180	true,
181	true,
182	true,
183	true,
184	true,
185	true,
186	true,
187	true,
188	true,
189	true,
190	true,
191	true,
192	true,
193	true,
194	true,
195	false,		/* SetClientID */
196	false,		/* SetClientIDConfirm */
197	true,
198	true,
199	true,
200	true,
201	true,
202	true,
203	true,
204	false,		/* Renew */
205	true,
206	false,		/* ReleaseLockOwn */
207	true,
208	true,
209	true,
210	true,
211	true,
212	true,
213	false,		/* ExchangeID */
214	false,		/* CreateSession */
215	false,		/* DestroySession */
216	false,		/* DestroyClientID */
217	false,		/* FreeStateID */
218	true,
219	true,
220	true,
221	true,
222	false,		/* ReclaimComplete */
223	true,
224	true,
225	true,
226	true,
227	true,
228	true,
229	true,
230	true,
231	true,
232	true,
233	true,
234	true,
235	true,
236	true,
237	false,		/* BindConnectionToSession */
238	true,
239	true,
240	true,
241	true,
242};
243
244/*
245 * Initialize sockets and congestion for a new NFS connection.
246 * We do not free the sockaddr if error.
247 * Which arguments are set to NULL indicate what kind of call it is.
248 * cred == NULL --> a call to connect to a pNFS DS
249 * nmp == NULL --> indicates an upcall to userland or a NFSv4.0 callback
250 */
251int
252newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
253    struct ucred *cred, NFSPROC_T *p, int callback_retry_mult, bool dotls,
254    struct __rpc_client **clipp)
255{
256	int rcvreserve, sndreserve;
257	int pktscale, pktscalesav;
258	struct sockaddr *saddr;
259	struct ucred *origcred;
260	CLIENT *client;
261	struct netconfig *nconf;
262	struct socket *so;
263	int one = 1, retries, error = 0;
264	struct thread *td = curthread;
265	SVCXPRT *xprt;
266	struct timeval timo;
267	uint64_t tval;
268
269	/*
270	 * We need to establish the socket using the credentials of
271	 * the mountpoint.  Some parts of this process (such as
272	 * sobind() and soconnect()) will use the curent thread's
273	 * credential instead of the socket credential.  To work
274	 * around this, temporarily change the current thread's
275	 * credential to that of the mountpoint.
276	 *
277	 * XXX: It would be better to explicitly pass the correct
278	 * credential to sobind() and soconnect().
279	 */
280	origcred = td->td_ucred;
281
282	/*
283	 * Use the credential in nr_cred, if not NULL.
284	 */
285	if (nrp->nr_cred != NULL)
286		td->td_ucred = nrp->nr_cred;
287	else
288		td->td_ucred = cred;
289	saddr = nrp->nr_nam;
290
291	if (saddr->sa_family == AF_INET)
292		if (nrp->nr_sotype == SOCK_DGRAM)
293			nconf = getnetconfigent("udp");
294		else
295			nconf = getnetconfigent("tcp");
296	else
297		if (nrp->nr_sotype == SOCK_DGRAM)
298			nconf = getnetconfigent("udp6");
299		else
300			nconf = getnetconfigent("tcp6");
301
302	pktscale = nfs_bufpackets;
303	if (pktscale < 2)
304		pktscale = 2;
305	if (pktscale > 64)
306		pktscale = 64;
307	pktscalesav = pktscale;
308	/*
309	 * soreserve() can fail if sb_max is too small, so shrink pktscale
310	 * and try again if there is an error.
311	 * Print a log message suggesting increasing sb_max.
312	 * Creating a socket and doing this is necessary since, if the
313	 * reservation sizes are too large and will make soreserve() fail,
314	 * the connection will work until a large send is attempted and
315	 * then it will loop in the krpc code.
316	 */
317	so = NULL;
318	saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
319	error = socreate(saddr->sa_family, &so, nrp->nr_sotype,
320	    nrp->nr_soproto, td->td_ucred, td);
321	if (error != 0)
322		goto out;
323	do {
324	    if (error != 0 && pktscale > 2) {
325		if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
326		    pktscale == pktscalesav) {
327		    /*
328		     * Suggest vfs.nfs.bufpackets * maximum RPC message,
329		     * adjusted for the sb_max->sb_max_adj conversion of
330		     * MCLBYTES / (MSIZE + MCLBYTES) as the minimum setting
331		     * for kern.ipc.maxsockbuf.
332		     */
333		    tval = (NFS_MAXBSIZE + NFS_MAXXDR) * nfs_bufpackets;
334		    tval *= MSIZE + MCLBYTES;
335		    tval += MCLBYTES - 1; /* Round up divide by MCLBYTES. */
336		    tval /= MCLBYTES;
337		    printf("Consider increasing kern.ipc.maxsockbuf to a "
338			"minimum of %ju to support %ubyte NFS I/O\n",
339			(uintmax_t)tval, NFS_MAXBSIZE);
340		}
341		pktscale--;
342	    }
343	    if (nrp->nr_sotype == SOCK_DGRAM) {
344		if (nmp != NULL) {
345			sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
346			    pktscale;
347			rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
348			    pktscale;
349		} else {
350			sndreserve = rcvreserve = 1024 * pktscale;
351		}
352	    } else {
353		if (nrp->nr_sotype != SOCK_STREAM)
354			panic("nfscon sotype");
355		if (nmp != NULL) {
356			sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
357			    pktscale;
358			rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
359			    pktscale;
360		} else {
361			sndreserve = rcvreserve = 1024 * pktscale;
362		}
363	    }
364	    error = soreserve(so, sndreserve, rcvreserve);
365	    if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
366		pktscale <= 2)
367		printf("Must increase kern.ipc.maxsockbuf or reduce"
368		    " rsize, wsize\n");
369	} while (error != 0 && pktscale > 2);
370	soclose(so);
371	if (error != 0)
372		goto out;
373
374	client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
375	    nrp->nr_vers, sndreserve, rcvreserve);
376	CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
377	if (nmp != NULL) {
378		if ((nmp->nm_flag & NFSMNT_INT))
379			CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
380		if ((nmp->nm_flag & NFSMNT_RESVPORT))
381			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
382		if (NFSHASTLS(nmp)) {
383			CLNT_CONTROL(client, CLSET_TLS, &one);
384			if (nmp->nm_tlscertname != NULL)
385				CLNT_CONTROL(client, CLSET_TLSCERTNAME,
386				    nmp->nm_tlscertname);
387		}
388		if (NFSHASSOFT(nmp)) {
389			if (nmp->nm_sotype == SOCK_DGRAM)
390				/*
391				 * For UDP, the large timeout for a reconnect
392				 * will be set to "nm_retry * nm_timeo / 2", so
393				 * we only want to do 2 reconnect timeout
394				 * retries.
395				 */
396				retries = 2;
397			else
398				retries = nmp->nm_retry;
399		} else
400			retries = INT_MAX;
401		if (NFSHASNFSV4N(nmp)) {
402			if (cred != NULL) {
403				if (NFSHASSOFT(nmp)) {
404					/*
405					 * This should be a DS mount.
406					 * Use CLSET_TIMEOUT to set the timeout
407					 * for connections to DSs instead of
408					 * specifying a timeout on each RPC.
409					 * This is done so that SO_SNDTIMEO
410					 * is set on the TCP socket as well
411					 * as specifying a time limit when
412					 * waiting for an RPC reply.  Useful
413					 * if the send queue for the TCP
414					 * connection has become constipated,
415					 * due to a failed DS.
416					 * The choice of lease_duration / 4 is
417					 * fairly arbitrary, but seems to work
418					 * ok, with a lower bound of 10sec.
419					 */
420					timo.tv_sec = nfsrv_lease / 4;
421					if (timo.tv_sec < 10)
422						timo.tv_sec = 10;
423					timo.tv_usec = 0;
424					CLNT_CONTROL(client, CLSET_TIMEOUT,
425					    &timo);
426				}
427				/*
428				 * Make sure the nfscbd_pool doesn't get
429				 * destroyed while doing this.
430				 */
431				NFSD_LOCK();
432				if (nfs_numnfscbd > 0) {
433					nfs_numnfscbd++;
434					NFSD_UNLOCK();
435					xprt = svc_vc_create_backchannel(
436					    nfscbd_pool);
437					CLNT_CONTROL(client, CLSET_BACKCHANNEL,
438					    xprt);
439					NFSD_LOCK();
440					nfs_numnfscbd--;
441					if (nfs_numnfscbd == 0)
442						wakeup(&nfs_numnfscbd);
443				}
444				NFSD_UNLOCK();
445			} else {
446				/*
447				 * cred == NULL for a DS connect.
448				 * For connects to a DS, set a retry limit
449				 * so that failed DSs will be detected.
450				 * This is ok for NFSv4.1, since a DS does
451				 * not maintain open/lock state and is the
452				 * only case where using a "soft" mount is
453				 * recommended for NFSv4.
454				 * For mounts from the MDS to DS, this is done
455				 * via mount options, but that is not the case
456				 * here.  The retry limit here can be adjusted
457				 * via the sysctl vfs.nfs.dsretries.
458				 * See the comment above w.r.t. timeout.
459				 */
460				timo.tv_sec = nfsrv_lease / 4;
461				if (timo.tv_sec < 10)
462					timo.tv_sec = 10;
463				timo.tv_usec = 0;
464				CLNT_CONTROL(client, CLSET_TIMEOUT, &timo);
465				retries = nfs_dsretries;
466			}
467		}
468	} else {
469		/*
470		 * Three cases:
471		 * - Null RPC callback to client
472		 * - Non-Null RPC callback to client, wait a little longer
473		 * - upcalls to nfsuserd and gssd (clp == NULL)
474		 */
475		if (callback_retry_mult == 0) {
476			retries = NFSV4_UPCALLRETRY;
477			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
478		} else {
479			retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
480		}
481		if (dotls)
482			CLNT_CONTROL(client, CLSET_TLS, &one);
483	}
484	CLNT_CONTROL(client, CLSET_RETRIES, &retries);
485
486	if (nmp != NULL) {
487		/*
488		 * For UDP, there are 2 timeouts:
489		 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
490		 *   that does a retransmit of an RPC request using the same
491		 *   socket and xid. This is what you normally want to do,
492		 *   since NFS servers depend on "same xid" for their
493		 *   Duplicate Request Cache.
494		 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
495		 *   retransmits on the same socket should fail and a fresh
496		 *   socket created. Each of these timeouts counts as one
497		 *   CLSET_RETRIES as set above.
498		 * Set the initial retransmit timeout for UDP. This timeout
499		 * doesn't exist for TCP and the following call just fails,
500		 * which is ok.
501		 */
502		timo.tv_sec = nmp->nm_timeo / NFS_HZ;
503		timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
504		CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
505	}
506
507	/*
508	 * *clipp is &nrp->nr_client or &nm_aconn[nmp->nm_nextaconn].
509	 * The latter case is for additional connections specified by the
510	 * "nconnect" mount option.  nr_mtx etc is used for these additional
511	 * connections, as well as nr_client in the nfssockreq
512	 * structure for the mount.
513	 */
514	mtx_lock(&nrp->nr_mtx);
515	if (*clipp != NULL) {
516		mtx_unlock(&nrp->nr_mtx);
517		/*
518		 * Someone else already connected.
519		 */
520		CLNT_RELEASE(client);
521	} else {
522		*clipp = client;
523		/*
524		 * Protocols that do not require connections may be optionally
525		 * left unconnected for servers that reply from a port other
526		 * than NFS_PORT.
527		 */
528		if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
529			mtx_unlock(&nrp->nr_mtx);
530			CLNT_CONTROL(client, CLSET_CONNECT, &one);
531		} else
532			mtx_unlock(&nrp->nr_mtx);
533	}
534
535out:
536	/* Restore current thread's credentials. */
537	td->td_ucred = origcred;
538
539	NFSEXITCODE(error);
540	return (error);
541}
542
543/*
544 * NFS disconnect. Clean up and unlink.
545 */
546void
547newnfs_disconnect(struct nfsmount *nmp, struct nfssockreq *nrp)
548{
549	CLIENT *client, *aconn[NFS_MAXNCONN - 1];
550	int i;
551
552	mtx_lock(&nrp->nr_mtx);
553	if (nrp->nr_client != NULL) {
554		client = nrp->nr_client;
555		nrp->nr_client = NULL;
556		if (nmp != NULL && nmp->nm_aconnect > 0) {
557			for (i = 0; i < nmp->nm_aconnect; i++) {
558				aconn[i] = nmp->nm_aconn[i];
559				nmp->nm_aconn[i] = NULL;
560			}
561		}
562		mtx_unlock(&nrp->nr_mtx);
563		rpc_gss_secpurge_call(client);
564		CLNT_CLOSE(client);
565		CLNT_RELEASE(client);
566		if (nmp != NULL && nmp->nm_aconnect > 0) {
567			for (i = 0; i < nmp->nm_aconnect; i++) {
568				if (aconn[i] != NULL) {
569					rpc_gss_secpurge_call(aconn[i]);
570					CLNT_CLOSE(aconn[i]);
571					CLNT_RELEASE(aconn[i]);
572				}
573			}
574		}
575	} else {
576		mtx_unlock(&nrp->nr_mtx);
577	}
578}
579
580static AUTH *
581nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
582    char *srv_principal, gss_OID mech_oid, struct ucred *cred)
583{
584	rpc_gss_service_t svc;
585	AUTH *auth;
586
587	switch (secflavour) {
588	case RPCSEC_GSS_KRB5:
589	case RPCSEC_GSS_KRB5I:
590	case RPCSEC_GSS_KRB5P:
591		if (!mech_oid) {
592			if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
593				return (NULL);
594		}
595		if (secflavour == RPCSEC_GSS_KRB5)
596			svc = rpc_gss_svc_none;
597		else if (secflavour == RPCSEC_GSS_KRB5I)
598			svc = rpc_gss_svc_integrity;
599		else
600			svc = rpc_gss_svc_privacy;
601
602		if (clnt_principal == NULL) {
603			NFSCL_DEBUG(1, "nfs_getauth: clnt princ=NULL, "
604			    "srv princ=%s\n", srv_principal);
605			auth = rpc_gss_secfind_call(nrp->nr_client, cred,
606			    srv_principal, mech_oid, svc);
607		} else {
608			NFSCL_DEBUG(1, "nfs_getauth: clnt princ=%s "
609			    "srv princ=%s\n", clnt_principal, srv_principal);
610			auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
611			    clnt_principal, srv_principal, "kerberosv5",
612			    svc, NULL, NULL, NULL);
613			return (auth);
614		}
615		if (auth != NULL)
616			return (auth);
617		/* fallthrough */
618	case AUTH_SYS:
619	default:
620		return (authunix_create(cred));
621	}
622}
623
624/*
625 * Callback from the RPC code to generate up/down notifications.
626 */
627
/* State that newnfs_request() passes to nfs_feedback() for one RPC. */
struct nfs_feedback_arg {
	/* Mount the RPC is being done for. */
	struct nfsmount *nf_mount;
	/* NFSD_MONOSEC value recorded at the last "not responding" message. */
	int		nf_lastmsg;
	/* Set to TRUE once a down message was issued; passed to nfs_up(). */
	int		nf_tprintfmsg;
	/* Thread handed to nfs_down() and nfs_up(). */
	struct thread	*nf_td;
};
634
635static void
636nfs_feedback(int type, int proc, void *arg)
637{
638	struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
639	struct nfsmount *nmp = nf->nf_mount;
640	time_t now;
641
642	switch (type) {
643	case FEEDBACK_REXMIT2:
644	case FEEDBACK_RECONNECT:
645		now = NFSD_MONOSEC;
646		if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
647			nfs_down(nmp, nf->nf_td,
648			    "not responding", 0, NFSSTA_TIMEO);
649			nf->nf_tprintfmsg = TRUE;
650			nf->nf_lastmsg = now;
651		}
652		break;
653
654	case FEEDBACK_OK:
655		nfs_up(nf->nf_mount, nf->nf_td,
656		    "is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
657		break;
658	}
659}
660
661/*
662 * newnfs_request - goes something like this
663 *	- does the rpc by calling the krpc layer
664 *	- break down rpc header and return with nfs reply
665 * nb: always frees up nd_mreq mbuf list
666 */
667int
668newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
669    struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
670    struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
671    u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
672{
673	uint32_t retseq, retval, slotseq, *tl;
674	int i = 0, j = 0, opcnt, set_sigset = 0, slot;
675	int error = 0, usegssname = 0, secflavour = AUTH_SYS;
676	int freeslot, maxslot, reterr, slotpos, timeo;
677	u_int16_t procnum;
678	u_int nextconn;
679	struct nfs_feedback_arg nf;
680	struct timeval timo;
681	AUTH *auth;
682	struct rpc_callextra ext;
683	enum clnt_stat stat;
684	struct nfsreq *rep = NULL;
685	char *srv_principal = NULL, *clnt_principal = NULL;
686	sigset_t oldset;
687	struct ucred *authcred;
688	struct nfsclsession *sep;
689	uint8_t sessionid[NFSX_V4SESSIONID];
690	bool nextconn_set;
691	struct timespec trylater_delay, ts, waituntil;
692
693	/* Initially 1msec. */
694	trylater_delay.tv_sec = 0;
695	trylater_delay.tv_nsec = 1000000;
696	sep = dssep;
697	if (xidp != NULL)
698		*xidp = 0;
699	/* Reject requests while attempting a forced unmount. */
700	if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
701		m_freem(nd->nd_mreq);
702		return (ESTALE);
703	}
704
705	/*
706	 * Set authcred, which is used to acquire RPC credentials to
707	 * the cred argument, by default. The crhold() should not be
708	 * necessary, but will ensure that some future code change
709	 * doesn't result in the credential being free'd prematurely.
710	 */
711	authcred = crhold(cred);
712
713	/* For client side interruptible mounts, mask off the signals. */
714	if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
715		newnfs_set_sigmask(td, &oldset);
716		set_sigset = 1;
717	}
718
719	/*
720	 * If not already connected call newnfs_connect now.
721	 */
722	if (nrp->nr_client == NULL)
723		newnfs_connect(nmp, nrp, cred, td, 0, false, &nrp->nr_client);
724
725	/*
726	 * If the "nconnect" mount option was specified and this RPC is
727	 * one that can have a large RPC message and is being done through
728	 * the NFS/MDS server, use an additional connection. (When the RPC is
729	 * being done through the server/MDS, nrp == &nmp->nm_sockreq.)
730	 * The "nconnect" mount option normally has minimal effect when the
731	 * "pnfs" mount option is specified, since only Readdir RPCs are
732	 * normally done through the NFS/MDS server.
733	 */
734	nextconn_set = false;
735	if (nmp != NULL && nmp->nm_aconnect > 0 && nrp == &nmp->nm_sockreq &&
736	    (nd->nd_procnum == NFSPROC_READ ||
737	     nd->nd_procnum == NFSPROC_READDIR ||
738	     nd->nd_procnum == NFSPROC_READDIRPLUS ||
739	     nd->nd_procnum == NFSPROC_WRITE)) {
740		nextconn = atomic_fetchadd_int(&nmp->nm_nextaconn, 1);
741		nextconn %= nmp->nm_aconnect;
742		nextconn_set = true;
743		if (nmp->nm_aconn[nextconn] == NULL)
744			newnfs_connect(nmp, nrp, cred, td, 0, false,
745			    &nmp->nm_aconn[nextconn]);
746	}
747
748	/*
749	 * For a client side mount, nmp is != NULL and clp == NULL. For
750	 * server calls (callbacks or upcalls), nmp == NULL.
751	 */
752	if (clp != NULL) {
753		NFSLOCKSTATE();
754		if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
755			secflavour = RPCSEC_GSS_KRB5;
756			if (nd->nd_procnum != NFSPROC_NULL) {
757				if (clp->lc_flags & LCL_GSSINTEGRITY)
758					secflavour = RPCSEC_GSS_KRB5I;
759				else if (clp->lc_flags & LCL_GSSPRIVACY)
760					secflavour = RPCSEC_GSS_KRB5P;
761			}
762		}
763		NFSUNLOCKSTATE();
764	} else if (nmp != NULL && NFSHASKERB(nmp) &&
765	     nd->nd_procnum != NFSPROC_NULL && (!NFSHASSYSKRB5(nmp) ||
766	     nfscl_use_gss[nd->nd_procnum])) {
767		if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
768			nd->nd_flag |= ND_USEGSSNAME;
769		if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
770			/*
771			 * If there is a client side host based credential,
772			 * use that, otherwise use the system uid, if set.
773			 * The system uid is in the nmp->nm_sockreq.nr_cred
774			 * credentials.
775			 */
776			if (nmp->nm_krbnamelen > 0) {
777				usegssname = 1;
778				clnt_principal = nmp->nm_krbname;
779			} else if (nmp->nm_uid != (uid_t)-1) {
780				KASSERT(nmp->nm_sockreq.nr_cred != NULL,
781				    ("newnfs_request: NULL nr_cred"));
782				crfree(authcred);
783				authcred = crhold(nmp->nm_sockreq.nr_cred);
784			}
785		} else if (nmp->nm_krbnamelen == 0 &&
786		    nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
787			/*
788			 * If there is no host based principal name and
789			 * the system uid is set and this is root, use the
790			 * system uid, since root won't have user
791			 * credentials in a credentials cache file.
792			 * The system uid is in the nmp->nm_sockreq.nr_cred
793			 * credentials.
794			 */
795			KASSERT(nmp->nm_sockreq.nr_cred != NULL,
796			    ("newnfs_request: NULL nr_cred"));
797			crfree(authcred);
798			authcred = crhold(nmp->nm_sockreq.nr_cred);
799		}
800		if (NFSHASINTEGRITY(nmp))
801			secflavour = RPCSEC_GSS_KRB5I;
802		else if (NFSHASPRIVACY(nmp))
803			secflavour = RPCSEC_GSS_KRB5P;
804		else
805			secflavour = RPCSEC_GSS_KRB5;
806		if (nrp->nr_srvprinc[0] == '\0')
807			srv_principal = NFSMNT_SRVKRBNAME(nmp);
808		else
809			srv_principal = nrp->nr_srvprinc;
810	} else if (nmp != NULL && (!NFSHASKERB(nmp) || NFSHASSYSKRB5(nmp)) &&
811	    nd->nd_procnum != NFSPROC_NULL &&
812	    (nd->nd_flag & ND_USEGSSNAME) != 0) {
813		/*
814		 * Use the uid that did the mount when the RPC is doing
815		 * NFSv4 system operations, as indicated by the
816		 * ND_USEGSSNAME flag, for the AUTH_SYS case.
817		 * The credentials in nm_sockreq.nr_cred were used for the
818		 * mount.
819		 */
820		KASSERT(nmp->nm_sockreq.nr_cred != NULL,
821		    ("newnfs_request: NULL nr_cred"));
822		crfree(authcred);
823		authcred = crhold(nmp->nm_sockreq.nr_cred);
824	}
825
826	if (nmp != NULL) {
827		bzero(&nf, sizeof(struct nfs_feedback_arg));
828		nf.nf_mount = nmp;
829		nf.nf_td = td;
830		nf.nf_lastmsg = NFSD_MONOSEC -
831		    ((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
832	}
833
834	if (nd->nd_procnum == NFSPROC_NULL)
835		auth = authnone_create();
836	else if (usegssname) {
837		/*
838		 * For this case, the authenticator is held in the
839		 * nfssockreq structure, so don't release the reference count
840		 * held on it. --> Don't AUTH_DESTROY() it in this function.
841		 */
842		if (nrp->nr_auth == NULL)
843			nrp->nr_auth = nfs_getauth(nrp, secflavour,
844			    clnt_principal, srv_principal, NULL, authcred);
845		else
846			rpc_gss_refresh_auth_call(nrp->nr_auth);
847		auth = nrp->nr_auth;
848	} else
849		auth = nfs_getauth(nrp, secflavour, NULL,
850		    srv_principal, NULL, authcred);
851	crfree(authcred);
852	if (auth == NULL) {
853		m_freem(nd->nd_mreq);
854		if (set_sigset)
855			newnfs_restore_sigmask(td, &oldset);
856		return (EACCES);
857	}
858	bzero(&ext, sizeof(ext));
859	ext.rc_auth = auth;
860	if (nmp != NULL) {
861		ext.rc_feedback = nfs_feedback;
862		ext.rc_feedback_arg = &nf;
863	}
864
865	procnum = nd->nd_procnum;
866	if ((nd->nd_flag & ND_NFSV4) &&
867	    nd->nd_procnum != NFSPROC_NULL &&
868	    nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
869		procnum = NFSV4PROC_COMPOUND;
870
871	if (nmp != NULL) {
872		NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
873
874		/* Map the procnum to the old NFSv2 one, as required. */
875		if ((nd->nd_flag & ND_NFSV2) != 0) {
876			if (nd->nd_procnum < NFS_V3NPROCS)
877				procnum = nfsv2_procid[nd->nd_procnum];
878			else
879				procnum = NFSV2PROC_NOOP;
880		}
881
882		/*
883		 * Now only used for the R_DONTRECOVER case, but until that is
884		 * supported within the krpc code, I need to keep a queue of
885		 * outstanding RPCs for nfsv4 client requests.
886		 */
887		if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
888			rep = malloc(sizeof(struct nfsreq),
889			    M_NFSDREQ, M_WAITOK);
890#ifdef KDTRACE_HOOKS
891		if (dtrace_nfscl_nfs234_start_probe != NULL) {
892			uint32_t probe_id;
893			int probe_procnum;
894
895			if (nd->nd_flag & ND_NFSV4) {
896				probe_id =
897				    nfscl_nfs4_start_probes[nd->nd_procnum];
898				probe_procnum = nd->nd_procnum;
899			} else if (nd->nd_flag & ND_NFSV3) {
900				probe_id = nfscl_nfs3_start_probes[procnum];
901				probe_procnum = procnum;
902			} else {
903				probe_id =
904				    nfscl_nfs2_start_probes[nd->nd_procnum];
905				probe_procnum = procnum;
906			}
907			if (probe_id != 0)
908				(dtrace_nfscl_nfs234_start_probe)
909				    (probe_id, vp, nd->nd_mreq, cred,
910				     probe_procnum);
911		}
912#endif
913	}
914	freeslot = -1;		/* Set to slot that needs to be free'd */
915tryagain:
916	slot = -1;		/* Slot that needs a sequence# increment. */
917	/*
918	 * This timeout specifies when a new socket should be created,
919	 * along with new xid values. For UDP, this should be done
920	 * infrequently, since retransmits of RPC requests should normally
921	 * use the same xid.
922	 */
923	if (nmp == NULL) {
924		if (clp == NULL) {
925			timo.tv_sec = NFSV4_UPCALLTIMEO;
926			timo.tv_usec = 0;
927		} else {
928			timo.tv_sec = NFSV4_CALLBACKTIMEO / 1000;
929			timo.tv_usec = NFSV4_CALLBACKTIMEO * 1000;
930		}
931	} else {
932		if (nrp->nr_sotype != SOCK_DGRAM) {
933			timo.tv_usec = 0;
934			if ((nmp->nm_flag & NFSMNT_NFSV4))
935				timo.tv_sec = INT_MAX;
936			else
937				timo.tv_sec = NFS_TCPTIMEO;
938		} else {
939			if (NFSHASSOFT(nmp)) {
940				/*
941				 * CLSET_RETRIES is set to 2, so this should be
942				 * half of the total timeout required.
943				 */
944				timeo = nmp->nm_retry * nmp->nm_timeo / 2;
945				if (timeo < 1)
946					timeo = 1;
947				timo.tv_sec = timeo / NFS_HZ;
948				timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
949				    NFS_HZ;
950			} else {
951				/* For UDP hard mounts, use a large value. */
952				timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
953				timo.tv_usec = 0;
954			}
955		}
956
957		if (rep != NULL) {
958			rep->r_flags = 0;
959			rep->r_nmp = nmp;
960			/*
961			 * Chain request into list of outstanding requests.
962			 */
963			NFSLOCKREQ();
964			TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
965			NFSUNLOCKREQ();
966		}
967	}
968
969	nd->nd_mrep = NULL;
970	if (clp != NULL && sep != NULL)
971		stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
972		    nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
973	else if (nextconn_set)
974		/*
975		 * When there are multiple TCP connections, send the
976		 * RPCs with large messages on the alternate TCP
977		 * connection(s) in a round robin fashion.
978		 * The small RPC messages are sent on the default
979		 * TCP connection because they do not require much
980		 * network bandwidth and separating them from the
981		 * large RPC messages avoids them getting "log jammed"
982		 * behind several large RPC messages.
983		 */
984		stat = CLNT_CALL_MBUF(nmp->nm_aconn[nextconn],
985		    &ext, procnum, nd->nd_mreq, &nd->nd_mrep, timo);
986	else
987		stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
988		    nd->nd_mreq, &nd->nd_mrep, timo);
989	NFSCL_DEBUG(2, "clnt call=%d\n", stat);
990
991	if (rep != NULL) {
992		/*
993		 * RPC done, unlink the request.
994		 */
995		NFSLOCKREQ();
996		TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
997		NFSUNLOCKREQ();
998	}
999
1000	/*
1001	 * If there was a successful reply and a tprintf msg.
1002	 * tprintf a response.
1003	 */
1004	if (stat == RPC_SUCCESS) {
1005		error = 0;
1006	} else if (stat == RPC_TIMEDOUT) {
1007		NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
1008		error = ETIMEDOUT;
1009	} else if (stat == RPC_VERSMISMATCH) {
1010		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1011		error = EOPNOTSUPP;
1012	} else if (stat == RPC_PROGVERSMISMATCH) {
1013		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1014		error = EPROTONOSUPPORT;
1015	} else if (stat == RPC_CANTSEND || stat == RPC_CANTRECV ||
1016	     stat == RPC_SYSTEMERROR || stat == RPC_INTR) {
1017		/* Check for a session slot that needs to be free'd. */
1018		if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1019		    (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1020		    nd->nd_procnum != NFSPROC_NULL) {
1021			/*
1022			 * This should only occur when either the MDS or
1023			 * a client has an RPC against a DS fail.
1024			 * This happens because these cases use "soft"
1025			 * connections that can time out and fail.
1026			 * The slot used for this RPC is now in a
1027			 * non-deterministic state, but if the slot isn't
1028			 * free'd, threads can get stuck waiting for a slot.
1029			 */
1030			if (sep == NULL)
1031				sep = nfsmnt_mdssession(nmp);
1032			/*
1033			 * Bump the sequence# out of range, so that reuse of
1034			 * this slot will result in an NFSERR_SEQMISORDERED
1035			 * error and not a bogus cached RPC reply.
1036			 */
1037			mtx_lock(&sep->nfsess_mtx);
1038			sep->nfsess_slotseq[nd->nd_slotid] += 10;
1039			sep->nfsess_badslots |= (0x1ULL << nd->nd_slotid);
1040			mtx_unlock(&sep->nfsess_mtx);
1041			/* And free the slot. */
1042			nfsv4_freeslot(sep, nd->nd_slotid, false);
1043		}
1044		if (stat == RPC_INTR)
1045			error = EINTR;
1046		else {
1047			NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1048			error = ENXIO;
1049		}
1050	} else if (stat == RPC_AUTHERROR) {
1051		/* Check for a session slot that needs to be free'd. */
1052		if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1053		    (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1054		    nd->nd_procnum != NFSPROC_NULL) {
1055			/*
1056			 * This can occur when a Kerberos/RPCSEC_GSS session
1057			 * expires, due to TGT expiration.
1058			 * Free the slot, resetting the slot's sequence#.
1059			 */
1060			if (sep == NULL)
1061				sep = nfsmnt_mdssession(nmp);
1062			nfsv4_freeslot(sep, nd->nd_slotid, true);
1063		}
1064		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1065		error = EACCES;
1066	} else {
1067		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1068		error = EACCES;
1069	}
1070	if (error) {
1071		m_freem(nd->nd_mreq);
1072		if (usegssname == 0)
1073			AUTH_DESTROY(auth);
1074		if (rep != NULL)
1075			free(rep, M_NFSDREQ);
1076		if (set_sigset)
1077			newnfs_restore_sigmask(td, &oldset);
1078		return (error);
1079	}
1080
1081	KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
1082
1083	/*
1084	 * Search for any mbufs that are not a multiple of 4 bytes long
1085	 * or with m_data not longword aligned.
1086	 * These could cause pointer alignment problems, so copy them to
1087	 * well aligned mbufs.
1088	 */
1089	newnfs_realign(&nd->nd_mrep, M_WAITOK);
1090	nd->nd_md = nd->nd_mrep;
1091	nd->nd_dpos = mtod(nd->nd_md, caddr_t);
1092	nd->nd_repstat = 0;
1093	if (nd->nd_procnum != NFSPROC_NULL &&
1094	    nd->nd_procnum != NFSV4PROC_CBNULL) {
1095		/* If sep == NULL, set it to the default in nmp. */
1096		if (sep == NULL && nmp != NULL)
1097			sep = nfsmnt_mdssession(nmp);
1098		/*
1099		 * and now the actual NFS xdr.
1100		 */
1101		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1102		nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
1103		if (nd->nd_repstat >= 10000)
1104			NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
1105			    (int)nd->nd_repstat);
1106
1107		/*
1108		 * Get rid of the tag, return count and SEQUENCE result for
1109		 * NFSv4.
1110		 */
1111		if ((nd->nd_flag & ND_NFSV4) != 0 && nd->nd_repstat !=
1112		    NFSERR_MINORVERMISMATCH) {
1113			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1114			i = fxdr_unsigned(int, *tl);
1115			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
1116			if (error)
1117				goto nfsmout;
1118			NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1119			opcnt = fxdr_unsigned(int, *tl++);
1120			i = fxdr_unsigned(int, *tl++);
1121			j = fxdr_unsigned(int, *tl);
1122			if (j >= 10000)
1123				NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
1124			/*
1125			 * If the first op is Sequence, free up the slot.
1126			 */
1127			if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
1128			   (clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0)) {
1129				NFSCL_DEBUG(1, "failed seq=%d\n", j);
1130				if (sep != NULL && i == NFSV4OP_SEQUENCE &&
1131				    j == NFSERR_SEQMISORDERED) {
1132					mtx_lock(&sep->nfsess_mtx);
1133					sep->nfsess_badslots |=
1134					    (0x1ULL << nd->nd_slotid);
1135					mtx_unlock(&sep->nfsess_mtx);
1136				}
1137			}
1138			if (((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
1139			    (clp != NULL && i == NFSV4OP_CBSEQUENCE &&
1140			    j == 0)) && sep != NULL) {
1141				if (i == NFSV4OP_SEQUENCE)
1142					NFSM_DISSECT(tl, uint32_t *,
1143					    NFSX_V4SESSIONID +
1144					    5 * NFSX_UNSIGNED);
1145				else
1146					NFSM_DISSECT(tl, uint32_t *,
1147					    NFSX_V4SESSIONID +
1148					    4 * NFSX_UNSIGNED);
1149				mtx_lock(&sep->nfsess_mtx);
1150				if (bcmp(tl, sep->nfsess_sessionid,
1151				    NFSX_V4SESSIONID) == 0) {
1152					tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
1153					retseq = fxdr_unsigned(uint32_t, *tl++);
1154					slot = fxdr_unsigned(int, *tl++);
1155					if ((nd->nd_flag & ND_HASSLOTID) != 0) {
1156						if (slot >= NFSV4_SLOTS ||
1157						    (i == NFSV4OP_CBSEQUENCE &&
1158						     slot >= NFSV4_CBSLOTS)) {
1159							printf("newnfs_request:"
1160							    " Bogus slot\n");
1161							slot = nd->nd_slotid;
1162						} else if (slot !=
1163						    nd->nd_slotid) {
1164						    printf("newnfs_request:"
1165							" Wrong session "
1166							"srvslot=%d "
1167							"slot=%d\n", slot,
1168							nd->nd_slotid);
1169						    if (i == NFSV4OP_SEQUENCE) {
1170							/*
1171							 * Mark both slots as
1172							 * bad, because we do
1173							 * not know if the
1174							 * server has advanced
1175							 * the sequence# for
1176							 * either of them.
1177							 */
1178							sep->nfsess_badslots |=
1179							    (0x1ULL << slot);
1180							sep->nfsess_badslots |=
1181							    (0x1ULL <<
1182							     nd->nd_slotid);
1183						    }
1184						    slot = nd->nd_slotid;
1185						}
1186						freeslot = slot;
1187					} else if (slot != 0) {
1188						printf("newnfs_request: Bad "
1189						    "session slot=%d\n", slot);
1190						slot = 0;
1191					}
1192					if (retseq != sep->nfsess_slotseq[slot])
1193						printf("retseq diff 0x%x\n",
1194						    retseq);
1195					retval = fxdr_unsigned(uint32_t, *++tl);
1196					if ((retval + 1) < sep->nfsess_foreslots
1197					    )
1198						sep->nfsess_foreslots = (retval
1199						    + 1);
1200					else if ((retval + 1) >
1201					    sep->nfsess_foreslots)
1202						sep->nfsess_foreslots = (retval
1203						    < 64) ? (retval + 1) : 64;
1204				}
1205				mtx_unlock(&sep->nfsess_mtx);
1206
1207				/* Grab the op and status for the next one. */
1208				if (opcnt > 1) {
1209					NFSM_DISSECT(tl, uint32_t *,
1210					    2 * NFSX_UNSIGNED);
1211					i = fxdr_unsigned(int, *tl++);
1212					j = fxdr_unsigned(int, *tl);
1213				}
1214			}
1215		}
1216		if (nd->nd_repstat != 0) {
1217			if (nd->nd_repstat == NFSERR_BADSESSION &&
1218			    nmp != NULL && dssep == NULL &&
1219			    (nd->nd_flag & ND_NFSV41) != 0) {
1220				/*
1221				 * If this is a client side MDS RPC, mark
1222				 * the MDS session defunct and initiate
1223				 * recovery, as required.
1224				 * The nfsess_defunct field is protected by
1225				 * the NFSLOCKMNT()/nm_mtx lock and not the
1226				 * nfsess_mtx lock to simplify its handling,
1227				 * for the MDS session. This lock is also
1228				 * sufficient for nfsess_sessionid, since it
1229				 * never changes in the structure.
1230				 */
1231				NFSCL_DEBUG(1, "Got badsession\n");
1232				NFSLOCKCLSTATE();
1233				NFSLOCKMNT(nmp);
1234				if (TAILQ_EMPTY(&nmp->nm_sess)) {
1235					NFSUNLOCKMNT(nmp);
1236					NFSUNLOCKCLSTATE();
1237					printf("If server has not rebooted, "
1238					    "check NFS clients for unique "
1239					    "/etc/hostid's\n");
1240					goto out;
1241				}
1242				sep = NFSMNT_MDSSESSION(nmp);
1243				if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
1244				    NFSX_V4SESSIONID) == 0) {
1245					printf("Initiate recovery. If server "
1246					    "has not rebooted, "
1247					    "check NFS clients for unique "
1248					    "/etc/hostid's\n");
1249					/* Initiate recovery. */
1250					sep->nfsess_defunct = 1;
1251					NFSCL_DEBUG(1, "Marked defunct\n");
1252					if (nmp->nm_clp != NULL) {
1253						nmp->nm_clp->nfsc_flags |=
1254						    NFSCLFLAGS_RECOVER;
1255						wakeup(nmp->nm_clp);
1256					}
1257				}
1258				NFSUNLOCKCLSTATE();
1259				/*
1260				 * Sleep for up to 1sec waiting for a new
1261				 * session.
1262				 */
1263				mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
1264				    "nfsbadsess", hz);
1265				/*
1266				 * Get the session again, in case a new one
1267				 * has been created during the sleep.
1268				 */
1269				sep = NFSMNT_MDSSESSION(nmp);
1270				NFSUNLOCKMNT(nmp);
1271				if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
1272					reterr = nfsv4_sequencelookup(nmp, sep,
1273					    &slotpos, &maxslot, &slotseq,
1274					    sessionid, true);
1275					if (reterr == 0) {
1276						/* Fill in new session info. */
1277						NFSCL_DEBUG(1,
1278						  "Filling in new sequence\n");
1279						tl = nd->nd_sequence;
1280						bcopy(sessionid, tl,
1281						    NFSX_V4SESSIONID);
1282						tl += NFSX_V4SESSIONID /
1283						    NFSX_UNSIGNED;
1284						*tl++ = txdr_unsigned(slotseq);
1285						*tl++ = txdr_unsigned(slotpos);
1286						*tl = txdr_unsigned(maxslot);
1287						nd->nd_slotid = slotpos;
1288						nd->nd_flag |= ND_HASSLOTID;
1289					}
1290					if (reterr == NFSERR_BADSESSION ||
1291					    reterr == 0) {
1292						NFSCL_DEBUG(1,
1293						    "Badsession looping\n");
1294						m_freem(nd->nd_mrep);
1295						nd->nd_mrep = NULL;
1296						goto tryagain;
1297					}
1298					nd->nd_repstat = reterr;
1299					NFSCL_DEBUG(1, "Got err=%d\n", reterr);
1300				}
1301			}
1302			/*
1303			 * When clp != NULL, it is a callback and all
1304			 * callback operations can be retried for NFSERR_DELAY.
1305			 */
1306			if (((nd->nd_repstat == NFSERR_DELAY ||
1307			      nd->nd_repstat == NFSERR_GRACE) &&
1308			     (nd->nd_flag & ND_NFSV4) && (clp != NULL ||
1309			     (nd->nd_procnum != NFSPROC_DELEGRETURN &&
1310			     nd->nd_procnum != NFSPROC_SETATTR &&
1311			     nd->nd_procnum != NFSPROC_READ &&
1312			     nd->nd_procnum != NFSPROC_READDS &&
1313			     nd->nd_procnum != NFSPROC_WRITE &&
1314			     nd->nd_procnum != NFSPROC_WRITEDS &&
1315			     nd->nd_procnum != NFSPROC_OPEN &&
1316			     nd->nd_procnum != NFSPROC_OPENLAYGET &&
1317			     nd->nd_procnum != NFSPROC_CREATE &&
1318			     nd->nd_procnum != NFSPROC_CREATELAYGET &&
1319			     nd->nd_procnum != NFSPROC_OPENCONFIRM &&
1320			     nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
1321			     nd->nd_procnum != NFSPROC_CLOSE &&
1322			     nd->nd_procnum != NFSPROC_LOCK &&
1323			     nd->nd_procnum != NFSPROC_LOCKU))) ||
1324			    (nd->nd_repstat == NFSERR_DELAY &&
1325			     (nd->nd_flag & ND_NFSV4) == 0) ||
1326			    nd->nd_repstat == NFSERR_RESOURCE ||
1327			    nd->nd_repstat == NFSERR_RETRYUNCACHEDREP) {
1328				/* Clip at NFS_TRYLATERDEL. */
1329				if (timespeccmp(&trylater_delay,
1330				    &nfs_trylater_max, >))
1331					trylater_delay = nfs_trylater_max;
1332				getnanouptime(&waituntil);
1333				timespecadd(&waituntil, &trylater_delay,
1334				    &waituntil);
1335				do {
1336					nfs_catnap(PZERO, 0, "nfstry");
1337					getnanouptime(&ts);
1338				} while (timespeccmp(&ts, &waituntil, <));
1339				timespecadd(&trylater_delay, &trylater_delay,
1340				    &trylater_delay);	/* Double each time. */
1341				if (slot != -1) {
1342					mtx_lock(&sep->nfsess_mtx);
1343					sep->nfsess_slotseq[slot]++;
1344					*nd->nd_slotseq = txdr_unsigned(
1345					    sep->nfsess_slotseq[slot]);
1346					mtx_unlock(&sep->nfsess_mtx);
1347				}
1348				m_freem(nd->nd_mrep);
1349				nd->nd_mrep = NULL;
1350				goto tryagain;
1351			}
1352
1353			/*
1354			 * If the File Handle was stale, invalidate the
1355			 * lookup cache, just in case.
1356			 * (vp != NULL implies a client side call)
1357			 */
1358			if (nd->nd_repstat == ESTALE && vp != NULL) {
1359				cache_purge(vp);
1360				if (ncl_call_invalcaches != NULL)
1361					(*ncl_call_invalcaches)(vp);
1362			}
1363		}
1364		if ((nd->nd_flag & ND_NFSV4) != 0) {
1365			/* Free the slot, as required. */
1366			if (freeslot != -1)
1367				nfsv4_freeslot(sep, freeslot, false);
1368			/*
1369			 * If this op is Putfh, throw its results away.
1370			 */
1371			if (j >= 10000)
1372				NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
1373			if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
1374				NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
1375				i = fxdr_unsigned(int, *tl++);
1376				j = fxdr_unsigned(int, *tl);
1377				if (j >= 10000)
1378					NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
1379					    j);
1380				/*
1381				 * All Compounds that do an Op that must
1382				 * be in sequence consist of NFSV4OP_PUTFH
1383				 * followed by one of these. As such, we
1384				 * can determine if the seqid# should be
1385				 * incremented, here.
1386				 */
1387				if ((i == NFSV4OP_OPEN ||
1388				     i == NFSV4OP_OPENCONFIRM ||
1389				     i == NFSV4OP_OPENDOWNGRADE ||
1390				     i == NFSV4OP_CLOSE ||
1391				     i == NFSV4OP_LOCK ||
1392				     i == NFSV4OP_LOCKU) &&
1393				    (j == 0 ||
1394				     (j != NFSERR_STALECLIENTID &&
1395				      j != NFSERR_STALESTATEID &&
1396				      j != NFSERR_BADSTATEID &&
1397				      j != NFSERR_BADSEQID &&
1398				      j != NFSERR_BADXDR &&
1399				      j != NFSERR_RESOURCE &&
1400				      j != NFSERR_NOFILEHANDLE)))
1401					nd->nd_flag |= ND_INCRSEQID;
1402			}
1403			/*
1404			 * If this op's status is non-zero, mark
1405			 * that there is no more data to process.
1406			 * The exception is Setattr, which always has xdr
1407			 * when it has failed.
1408			 */
1409			if (j != 0 && i != NFSV4OP_SETATTR)
1410				nd->nd_flag |= ND_NOMOREDATA;
1411
1412			/*
1413			 * If R_DONTRECOVER is set, replace the stale error
1414			 * reply, so that recovery isn't initiated.
1415			 */
1416			if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
1417			     nd->nd_repstat == NFSERR_BADSESSION ||
1418			     nd->nd_repstat == NFSERR_STALESTATEID) &&
1419			    rep != NULL && (rep->r_flags & R_DONTRECOVER))
1420				nd->nd_repstat = NFSERR_STALEDONTRECOVER;
1421		}
1422	}
1423out:
1424
1425#ifdef KDTRACE_HOOKS
1426	if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
1427		uint32_t probe_id;
1428		int probe_procnum;
1429
1430		if (nd->nd_flag & ND_NFSV4) {
1431			probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
1432			probe_procnum = nd->nd_procnum;
1433		} else if (nd->nd_flag & ND_NFSV3) {
1434			probe_id = nfscl_nfs3_done_probes[procnum];
1435			probe_procnum = procnum;
1436		} else {
1437			probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
1438			probe_procnum = procnum;
1439		}
1440		if (probe_id != 0)
1441			(dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
1442			    nd->nd_mreq, cred, probe_procnum, 0);
1443	}
1444#endif
1445
1446	m_freem(nd->nd_mreq);
1447	if (usegssname == 0)
1448		AUTH_DESTROY(auth);
1449	if (rep != NULL)
1450		free(rep, M_NFSDREQ);
1451	if (set_sigset)
1452		newnfs_restore_sigmask(td, &oldset);
1453	return (0);
1454nfsmout:
1455	m_freem(nd->nd_mrep);
1456	m_freem(nd->nd_mreq);
1457	if (usegssname == 0)
1458		AUTH_DESTROY(auth);
1459	if (rep != NULL)
1460		free(rep, M_NFSDREQ);
1461	if (set_sigset)
1462		newnfs_restore_sigmask(td, &oldset);
1463	return (error);
1464}
1465
1466/*
1467 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1468 * wait for all requests to complete. This is used by forced unmounts
1469 * to terminate any outstanding RPCs.
1470 */
1471int
1472newnfs_nmcancelreqs(struct nfsmount *nmp)
1473{
1474	struct nfsclds *dsp;
1475	struct __rpc_client *cl;
1476	int i;
1477
1478	if (nmp->nm_sockreq.nr_client != NULL)
1479		CLNT_CLOSE(nmp->nm_sockreq.nr_client);
1480	for (i = 0; i < nmp->nm_aconnect; i++)
1481		if (nmp->nm_aconn[i] != NULL)
1482			CLNT_CLOSE(nmp->nm_aconn[i]);
1483lookformore:
1484	NFSLOCKMNT(nmp);
1485	TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
1486		NFSLOCKDS(dsp);
1487		if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
1488		    (dsp->nfsclds_flags & NFSCLDS_CLOSED) == 0 &&
1489		    dsp->nfsclds_sockp != NULL &&
1490		    dsp->nfsclds_sockp->nr_client != NULL) {
1491			dsp->nfsclds_flags |= NFSCLDS_CLOSED;
1492			cl = dsp->nfsclds_sockp->nr_client;
1493			NFSUNLOCKDS(dsp);
1494			NFSUNLOCKMNT(nmp);
1495			CLNT_CLOSE(cl);
1496			goto lookformore;
1497		}
1498		NFSUNLOCKDS(dsp);
1499	}
1500	NFSUNLOCKMNT(nmp);
1501	return (0);
1502}
1503
/*
 * The signals that may interrupt an NFS operation on an intr mount.
 * Add any further signal that should be able to do so to this table.
 * SIGSTOP and SIGKILL cannot be masked.
 *
 * nfs_sig_pending() tests a signal set against these entries and
 * newnfs_set_sigmask() removes them from the temporary mask it builds,
 * unless the signal is already masked or ignored.
 */
int newnfs_sig_set[] = {
	SIGINT, SIGTERM, SIGHUP, SIGKILL, SIGQUIT,
};
1515
1516/*
1517 * Check to see if one of the signals in our subset is pending on
1518 * the process (in an intr mount).
1519 */
1520static int
1521nfs_sig_pending(sigset_t set)
1522{
1523	int i;
1524
1525	for (i = 0 ; i < nitems(newnfs_sig_set); i++)
1526		if (SIGISMEMBER(set, newnfs_sig_set[i]))
1527			return (1);
1528	return (0);
1529}
1530
1531/*
1532 * The set/restore sigmask functions are used to (temporarily) overwrite
1533 * the thread td_sigmask during an RPC call (for example). These are also
1534 * used in other places in the NFS client that might tsleep().
1535 */
1536void
1537newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
1538{
1539	sigset_t newset;
1540	int i;
1541	struct proc *p;
1542
1543	SIGFILLSET(newset);
1544	if (td == NULL)
1545		td = curthread; /* XXX */
1546	p = td->td_proc;
1547	/* Remove the NFS set of signals from newset */
1548	PROC_LOCK(p);
1549	mtx_lock(&p->p_sigacts->ps_mtx);
1550	for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
1551		/*
1552		 * But make sure we leave the ones already masked
1553		 * by the process, ie. remove the signal from the
1554		 * temporary signalmask only if it wasn't already
1555		 * in p_sigmask.
1556		 */
1557		if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
1558		    !SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
1559			SIGDELSET(newset, newnfs_sig_set[i]);
1560	}
1561	mtx_unlock(&p->p_sigacts->ps_mtx);
1562	kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
1563	    SIGPROCMASK_PROC_LOCKED);
1564	PROC_UNLOCK(p);
1565}
1566
1567void
1568newnfs_restore_sigmask(struct thread *td, sigset_t *set)
1569{
1570	if (td == NULL)
1571		td = curthread; /* XXX */
1572	kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
1573}
1574
1575/*
1576 * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
1577 * old one after msleep() returns.
1578 */
1579int
1580newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
1581{
1582	sigset_t oldset;
1583	int error;
1584
1585	if ((priority & PCATCH) == 0)
1586		return msleep(ident, mtx, priority, wmesg, timo);
1587	if (td == NULL)
1588		td = curthread; /* XXX */
1589	newnfs_set_sigmask(td, &oldset);
1590	error = msleep(ident, mtx, priority, wmesg, timo);
1591	newnfs_restore_sigmask(td, &oldset);
1592	return (error);
1593}
1594
1595/*
1596 * Test for a termination condition pending on the process.
1597 * This is used for NFSMNT_INT mounts.
1598 */
1599int
1600newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
1601{
1602	struct proc *p;
1603	sigset_t tmpset;
1604
1605	/* Terminate all requests while attempting a forced unmount. */
1606	if (NFSCL_FORCEDISM(nmp->nm_mountp))
1607		return (EIO);
1608	if (!(nmp->nm_flag & NFSMNT_INT))
1609		return (0);
1610	if (td == NULL)
1611		return (0);
1612	p = td->td_proc;
1613	PROC_LOCK(p);
1614	tmpset = p->p_siglist;
1615	SIGSETOR(tmpset, td->td_siglist);
1616	SIGSETNAND(tmpset, td->td_sigmask);
1617	mtx_lock(&p->p_sigacts->ps_mtx);
1618	SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
1619	mtx_unlock(&p->p_sigacts->ps_mtx);
1620	if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
1621	    && nfs_sig_pending(tmpset)) {
1622		PROC_UNLOCK(p);
1623		return (EINTR);
1624	}
1625	PROC_UNLOCK(p);
1626	return (0);
1627}
1628
1629static int
1630nfs_msg(struct thread *td, const char *server, const char *msg, int error)
1631{
1632	struct proc *p;
1633
1634	p = td ? td->td_proc : NULL;
1635	if (error) {
1636		tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
1637		    server, msg, error);
1638	} else {
1639		tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
1640	}
1641	return (0);
1642}
1643
1644static void
1645nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
1646    int error, int flags)
1647{
1648	if (nmp == NULL)
1649		return;
1650	mtx_lock(&nmp->nm_mtx);
1651	if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
1652		nmp->nm_state |= NFSSTA_TIMEO;
1653		mtx_unlock(&nmp->nm_mtx);
1654		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1655		    VQ_NOTRESP, 0);
1656	} else
1657		mtx_unlock(&nmp->nm_mtx);
1658	mtx_lock(&nmp->nm_mtx);
1659	if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1660		nmp->nm_state |= NFSSTA_LOCKTIMEO;
1661		mtx_unlock(&nmp->nm_mtx);
1662		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1663		    VQ_NOTRESPLOCK, 0);
1664	} else
1665		mtx_unlock(&nmp->nm_mtx);
1666	nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
1667}
1668
1669static void
1670nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
1671    int flags, int tprintfmsg)
1672{
1673	if (nmp == NULL)
1674		return;
1675	if (tprintfmsg) {
1676		nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
1677	}
1678
1679	mtx_lock(&nmp->nm_mtx);
1680	if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
1681		nmp->nm_state &= ~NFSSTA_TIMEO;
1682		mtx_unlock(&nmp->nm_mtx);
1683		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1684		    VQ_NOTRESP, 1);
1685	} else
1686		mtx_unlock(&nmp->nm_mtx);
1687
1688	mtx_lock(&nmp->nm_mtx);
1689	if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1690		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
1691		mtx_unlock(&nmp->nm_mtx);
1692		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1693		    VQ_NOTRESPLOCK, 1);
1694	} else
1695		mtx_unlock(&nmp->nm_mtx);
1696}
1697