/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/fs/nfsserver/nfs_nfsdcache.c 361236 2020-05-19 01:43:00Z freqlabs $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
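
#if 0
/*
 * Illustrative sketch only (never compiled): the per-entry TCP match test
 * described above, restated as a predicate.  nfsrc_tcpmatch() is a
 * hypothetical name; the real test is open-coded in nfsrc_gettcp() below
 * and additionally handles RC_INPROG/RC_SAMETCPCONN entries.
 */
static int
nfsrc_tcpmatch(struct nfsrvcache *newrp, struct nfsrvcache *rp)
{

	if (newrp->rc_xid != rp->rc_xid ||			/* key: xid ... */
	    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) == 0 ||	/* ... and NFS version */
	    newrp->rc_proc != rp->rc_proc ||			/* RPC# must match */
	    newrp->rc_reqlen != rp->rc_reqlen ||		/* so must request size */
	    newrp->rc_cksum != rp->rc_cksum)			/* and the XDR checksum */
		return (0);
	if (newrp->rc_flag & RC_NFSV4) {
		/*
		 * An NFSv4 client never retries on the same connection, and
		 * a retry can only arrive on a connection created after the
		 * entry was cached.
		 */
		if (newrp->rc_sockref == rp->rc_sockref ||
		    newrp->rc_cachetime < rp->rc_cachetime)
			return (0);
	}
	return (1);
}
#endif
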
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};
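
/*
 * Example of how the two tables above combine (see nfsrvd_updatecache()):
 * an NFSv2 REMOVE reply is nothing but a status word, so
 * nfsv2_repstat[NFSV2PROC_REMOVE] is TRUE and only rc_status needs to be
 * cached (RC_REPSTATUS), while an NFSv2 MKDIR reply also carries a file
 * handle and attributes, so its entry is FALSE and the whole reply mbuf
 * chain must be saved (RC_REPMBUF).
 */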

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
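
#if 0
/*
 * Illustrative sketch only (never compiled): the locking pattern used
 * throughout this file.  Hash/LRU list manipulation is done with the
 * bucket (or UDP) mutex held, while RC_LOCKED serializes longer use of an
 * entry, since the mutex is dropped around operations that may sleep
 * (m_copym() with M_WAITOK, for example).  nfsrc_example_locking() is a
 * hypothetical name.
 */
static void
nfsrc_example_locking(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);			/* sets RC_LOCKED, may sleep */
	/* ... list manipulation under the mutex ... */
	mtx_unlock(mutex);
	/* ... work that may sleep, with only RC_LOCKED held ... */
	nfsrc_unlock(rp);		/* re-takes the mutex internally */
}
#endif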

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
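
#if 0
/*
 * Illustrative sketch only (never compiled): how a caller in the nfsd
 * request path might consume nfsrvd_getcache()'s return value.  The
 * function name nfsrv_example_dispatch() and the "send reply" step are
 * hypothetical; the real caller lives in the nfsd socket/krpc code.
 */
static void
nfsrv_example_dispatch(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;

	switch (nfsrvd_getcache(nd)) {
	case RC_DOIT:
		/* New entry, marked RC_INPROG: execute the RPC, then ... */
		rp = nfsrvd_updatecache(nd);
		/*
		 * ... send nd->nd_mreq; if rp != NULL the entry was kept
		 * locked and nfsrvd_sentcache(rp, ...) must follow.
		 */
		break;
	case RC_REPLY:
		/* nd->nd_mreq already holds a copy of the cached reply. */
		break;
	case RC_DROPIT:
		/* Duplicate of an in-progress request: send no reply. */
		break;
	}
}
#endif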

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				nfsstatsv1.srvcache_tcppeak)
				nfsstatsv1.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
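
#if 0
/*
 * Illustrative restatement only (never compiled) of the "should the reply
 * be saved?" test at the top of the save branch above; nfsrc_shouldsave()
 * is a hypothetical name.
 */
static int
nfsrc_shouldsave(struct nfsrv_descript *nd, struct nfsrvcache *rp)
{

	if (nd->nd_repstat == NFSERR_DONTREPLY)
		return (0);
	if (rp->rc_refcnt > 0)			/* referenced by a seqid# Op */
		return (1);
	if ((nd->nd_flag & ND_SAVEREPLY) == 0)
		return (0);
	if (rp->rc_flag & RC_UDP)		/* UDP: always save */
		return (1);
	/* TCP: only if enabled and while below the flood level. */
	return (nfsrc_tcpnonidempotent &&
	    nfsrc_tcpsavedreplies <= nfsrc_floodlevel);
}
#endif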

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply has been sent, to update
 * the entry's TCP sequence number and unlock it. The rp argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
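
/*
 * Usage note (the callers live outside this file): after a TCP reply has
 * been handed to the socket, the transport code passes the sequence number
 * of the end of the reply to nfsrvd_sentcache() and later calls
 * nfsrc_trimcache(sockref, snd_una, 0) as acknowledgements arrive.  Once
 * SEQ_GEQ(snd_una, rc_tcpseq), the entry is marked RC_ACK and becomes
 * eligible for early removal in nfsrc_trimcache(); a call with final != 0
 * marks still-unacked entries RC_NACK instead.
 */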

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) == 0 ||
		     (newrp->rc_sockref != rp->rc_sockref &&
		      newrp->rc_cachetime >= rp->rc_cachetime))
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
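
/*
 * Worked example of the forced trim above (illustrative numbers only):
 * with nfsrc_tcptimeout = 1600 seconds and HISTSIZE = 16, each histogram
 * slot spans 1600/16 = 100 seconds of remaining lifetime.  If
 * nfsrc_tcphighwater = 4000 (so force = 1000) and the first three slots
 * hold 400 + 350 + 300 entries, the loop stops at i = 2 (k = 1050 > force),
 * giving k = 1600 * 3 / 16 = 300 and thisstamp = tcp_lasttrim + 300.  The
 * second pass then frees every unlocked, unreferenced entry that would
 * have expired within the next 300 seconds, along with acked and
 * seqid-released (RC_REFCNT) entries.
 */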

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}