/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/fs/nfsserver/nfs_nfsdcache.c 261051 2014-01-22 23:49:37Z mav $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
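
/*
 * A rough sketch of how the calling code (the RPC dispatch path in the
 * server, which lives outside this file) is expected to drive this cache.
 * The control flow below is illustrative only; the caller-side structure
 * is an assumption, but the function names and RC_* return values are the
 * ones defined for this file:
 *
 *	if (nd->nd_procnum != NFSPROC_NULL) {
 *		switch (nfsrvd_getcache(nd, so)) {
 *		case RC_DROPIT:		// retry of an in-progress request
 *			return;		// send no reply at all
 *		case RC_REPLY:		// duplicate; nd_mreq rebuilt from cache
 *			goto sendreply;
 *		case RC_DOIT:		// new request; nd->nd_rp now set
 *			break;
 *		}
 *	}
 *	... perform the RPC, building the reply in nd->nd_mreq ...
 *	rp = nfsrvd_updatecache(nd, so);	// may return a locked entry
 * sendreply:
 *	... transmit nd->nd_mreq ...
 *	if (rp != NULL)
 *		nfsrvd_sentcache(rp, so, error);	// record TCP seq#, unlock
 */
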
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
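
/*
 * Bucket selection above mixes the high byte of the xid back into the low
 * bits before taking the modulus, so xids that differ only in the upper
 * byte still spread across buckets.  A small illustration (the concrete
 * value of NFSRVCACHE_HASHSIZE comes from the NFS headers and is not
 * assumed here):
 *
 *	xid = 0x12000005
 *	nfsrc_hash(xid) = (0x12000005 + 0x12) % NFSRVCACHE_HASHSIZE
 *			= 0x12000017 % NFSRVCACHE_HASHSIZE
 */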

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};
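
/*
 * Note that the table above is indexed by the NFSv2 procedure number
 * (the user is nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]), so,
 * assuming the standard V2 procedure numbering, the TRUE slots (10, 11,
 * 12, 13 and 15) are the V2 REMOVE, RENAME, LINK, SYMLINK and RMDIR
 * procedures, whose replies carry nothing but the status word and can
 * therefore be cached as just rc_status.
 */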

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
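
/*
 * The value handed back by nfsrvd_getcache() is whatever nfsrc_getudp()
 * or nfsrc_gettcp() decided: RC_DOIT (new request, nd->nd_rp points at
 * the in-progress entry), RC_REPLY (duplicate, nd->nd_mreq holds a copy
 * of the cached reply) or RC_DROPIT (retry of a request that is still
 * being processed, so no reply should be sent).
 */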

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				newnfsstats.srvcache_tcppeak)
				newnfsstats.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		     so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			mtx_lock(mutex);
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			mtx_unlock(mutex);
		}
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0;

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		for (i = 0; i < HISTSIZE; i++)
			time_histo[i] = 0;
		tto = nfsrc_tcptimeout;
		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			if (i == 0)
				tcp_lasttrim = NFSD_MONOSEC;
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    nfsrc_activesocket(rp, sockref, so)) {
						nfsrc_freecache(rp);
						continue;
					}

					if (nfsrc_tcphighwater == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		j = nfsrc_tcphighwater / 5;	/* 20% of it */
		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > j)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 nfsrc_activesocket(rp, sockref,
						    so)))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
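
/*
 * A worked example of the second-pass trim above, with made-up numbers
 * (none of these are defaults from this file): suppose nfsrc_tcphighwater
 * is 10000, nfsrc_tcpsavedreplies is 9500 and nfsrc_tcptimeout (tto) is
 * 300 seconds.  Then j = 10000 / 5 = 2000, and since 9500 + 2000 > 10000
 * the extra pass runs.  Each histogram bucket covers tto / HISTSIZE =
 * 300 / 16 ~= 18 seconds of remaining lifetime; the loop sums buckets
 * until roughly 2000 entries are covered, say at i = 2, and then every
 * unreferenced entry due to expire within k = 300 * 3 / 16 = 56 seconds
 * of now is freed, pushing the cache back toward 80% of the high water
 * mark.
 */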

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
	int ret = 0;

	if (!(rp->rc_flag & RC_TCPSEQ))
		return (ret);
	/*
	 * If the sockref is the same, it is the same TCP connection.
	 */
	if (cur_sockref == rp->rc_sockref)
		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
	return (ret);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}