nfs_nfsdcache.c revision 269398
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/fs/nfsserver/nfs_nfsdcache.c 269398 2014-08-01 21:10:41Z rmacklem $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
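/*
 * Roughly speaking, the TCP algorithm above is implemented by
 * nfsrc_gettcp(), nfsrvd_updatecache(), nfsrvd_sentcache() and
 * nfsrc_trimcache(), while the UDP case is handled by nfsrc_getudp()
 * plus the same update and trim functions.
 */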
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
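/*
 * Sysctl handler for vfs.nfsd.tcphighwater. Besides updating the high
 * water mark, it bumps nfsrc_floodlevel to 20% above any new value that
 * reaches it, so the hard limit on saved TCP replies always stays above
 * the point where nfsrc_trimcache() starts forcing entries out.
 */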
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

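/*
 * Hash on the low bits of the xid, folding in the top byte so that xids
 * differing only in their upper bits still spread across the buckets.
 * NFSRCUDPHASH, NFSRCHASH and NFSRCAHASH pick the UDP hash list, the TCP
 * hash bucket and the TCP "pending ack" bucket for a given xid or socket
 * reference, respectively.
 */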
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				newnfsstats.srvcache_tcppeak)
				newnfsstats.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
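			/*
			 * Drop the mutex while copying the reply, since
			 * m_copym() with M_WAITOK may sleep.
			 */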
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
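	/*
	 * When the TCP transmit sequence number for the reply is known,
	 * record it and hang the entry off the "pending ack" hash so that
	 * nfsrc_trimcache() can mark the entry acknowledged once the
	 * connection's snd_una passes that sequence number.
	 */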
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

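	/*
	 * When called with a TCP socket reference, first walk that
	 * connection's "pending ack" list: entries whose recorded sequence
	 * number has been covered by snd_una are marked RC_ACK (and can be
	 * freed by the scans below), while any remaining ones are marked
	 * RC_NACK if this is the final call for the connection.
	 */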
	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

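	/*
	 * Only one thread at a time does the trimming below; any other
	 * caller just returns.
	 */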
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
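		/*
		 * If the saved TCP replies are within 25% of the high water
		 * mark, scan every hash bucket and build a histogram of
		 * expiry times, so the second pass below can free enough of
		 * the soonest-to-expire entries. Otherwise do a normal scan:
		 * all buckets once a second has passed since the last trim,
		 * or just a single bucket per call within the same second.
		 */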
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
