ip_state.c revision 64580
1/* 2 * Copyright (C) 1995-2000 by Darren Reed. 3 * 4 * Redistribution and use in source and binary forms are permitted 5 * provided that this notice is preserved and due credit is given 6 * to the original author and the contributors. 7 */ 8#if !defined(lint) 9static const char sccsid[] = "@(#)ip_state.c 1.8 6/5/96 (C) 1993-1995 Darren Reed"; 10static const char rcsid[] = "@(#)$FreeBSD: head/sys/contrib/ipfilter/netinet/ip_state.c 64580 2000-08-13 04:31:06Z darrenr $"; 11#endif 12 13#include <sys/errno.h> 14#include <sys/types.h> 15#include <sys/param.h> 16#include <sys/file.h> 17#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ 18 defined(_KERNEL) 19# include "opt_ipfilter_log.h" 20#endif 21#if defined(_KERNEL) && defined(__FreeBSD_version) && \ 22 (__FreeBSD_version >= 400000) && !defined(KLD_MODULE) 23#include "opt_inet6.h" 24#endif 25#if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__) 26# include <stdio.h> 27# include <stdlib.h> 28# include <string.h> 29#else 30# ifdef linux 31# include <linux/kernel.h> 32# include <linux/module.h> 33# endif 34#endif 35#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) 36# include <sys/filio.h> 37# include <sys/fcntl.h> 38# if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM) 39# include "opt_ipfilter.h" 40# endif 41#else 42# include <sys/ioctl.h> 43#endif 44#include <sys/time.h> 45#include <sys/uio.h> 46#ifndef linux 47# include <sys/protosw.h> 48#endif 49#include <sys/socket.h> 50#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) 51# include <sys/systm.h> 52#endif 53#if !defined(__SVR4) && !defined(__svr4__) 54# ifndef linux 55# include <sys/mbuf.h> 56# endif 57#else 58# include <sys/filio.h> 59# include <sys/byteorder.h> 60# ifdef _KERNEL 61# include <sys/dditypes.h> 62# endif 63# include <sys/stream.h> 64# include <sys/kmem.h> 65#endif 66 67#include <net/if.h> 68#ifdef sun 69# include <net/af.h> 70#endif 71#include <net/route.h> 72#include <netinet/in.h> 73#include <netinet/in_systm.h> 74#include <netinet/ip.h> 75#include <netinet/tcp.h> 76#ifndef linux 77# include <netinet/ip_var.h> 78# include <netinet/tcp_fsm.h> 79#endif 80#include <netinet/udp.h> 81#include <netinet/ip_icmp.h> 82#include "netinet/ip_compat.h" 83#include <netinet/tcpip.h> 84#include "netinet/ip_fil.h" 85#include "netinet/ip_nat.h" 86#include "netinet/ip_frag.h" 87#include "netinet/ip_proxy.h" 88#include "netinet/ip_state.h" 89#ifdef USE_INET6 90#include <netinet/icmp6.h> 91#endif 92#if (__FreeBSD_version >= 300000) 93# include <sys/malloc.h> 94# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) 95# include <sys/libkern.h> 96# include <sys/systm.h> 97# endif 98#endif 99 100#ifndef MIN 101# define MIN(a,b) (((a)<(b))?(a):(b)) 102#endif 103 104#define TCP_CLOSE (TH_FIN|TH_RST) 105 106static ipstate_t **ips_table = NULL; 107static ipstate_t *ips_list = NULL; 108static int ips_num = 0; 109static ips_stat_t ips_stats; 110#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) 111extern KRWLOCK_T ipf_state, ipf_mutex; 112extern kmutex_t ipf_rw; 113#endif 114 115#ifdef USE_INET6 116static frentry_t *fr_checkicmp6matchingstate __P((ip6_t *, fr_info_t *)); 117#endif 118static int fr_matchsrcdst __P((ipstate_t *, union i6addr, union i6addr, 119 fr_info_t *, tcphdr_t *)); 120static frentry_t *fr_checkicmpmatchingstate __P((ip_t *, fr_info_t *)); 121static int fr_matchicmpqueryreply __P((int, ipstate_t *, icmphdr_t *)); 122static int fr_state_flush __P((int)); 123static ips_stat_t *fr_statetstats __P((void)); 124static void fr_delstate __P((ipstate_t *)); 125static int fr_state_remove __P((caddr_t)); 126int fr_stputent __P((caddr_t)); 127int fr_stgetent __P((caddr_t)); 128void fr_stinsert __P((ipstate_t *)); 129 130 131#define FIVE_DAYS (2 * 5 * 86400) /* 5 days: half closed session */ 132 133#define TCP_MSL 240 /* 2 minutes */ 134u_long fr_tcpidletimeout = FIVE_DAYS, 135 fr_tcpclosewait = 2 * TCP_MSL, 136 fr_tcplastack = 2 * TCP_MSL, 137 fr_tcptimeout = 2 * TCP_MSL, 138 fr_tcpclosed = 1, 139 fr_udptimeout = 240, 140 fr_icmptimeout = 120; 141int fr_statemax = IPSTATE_MAX, 142 fr_statesize = IPSTATE_SIZE; 143int fr_state_doflush = 0, 144 fr_state_lock = 0; 145 146static int icmpreplytype4[ICMP_MAXTYPE + 1]; 147 148int fr_stateinit() 149{ 150 int i; 151 152 KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *)); 153 if (ips_table != NULL) 154 bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *)); 155 else 156 return -1; 157 158 /* fill icmp reply type table */ 159 for (i = 0; i <= ICMP_MAXTYPE; i++) 160 icmpreplytype4[i] = -1; 161 icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY; 162 icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY; 163 icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY; 164 icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY; 165 166 return 0; 167} 168 169 170static ips_stat_t *fr_statetstats() 171{ 172 ips_stats.iss_active = ips_num; 173 ips_stats.iss_table = ips_table; 174 ips_stats.iss_list = ips_list; 175 return &ips_stats; 176} 177 178 179/* 180 * flush state tables. two actions currently defined: 181 * which == 0 : flush all state table entries 182 * which == 1 : flush TCP connections which have started to close but are 183 * stuck for some reason. 184 */ 185static int fr_state_flush(which) 186int which; 187{ 188 register ipstate_t *is, **isp; 189#if defined(_KERNEL) && !SOLARIS 190 int s; 191#endif 192 int delete, removed = 0; 193 194 SPL_NET(s); 195 for (isp = &ips_list; (is = *isp); ) { 196 delete = 0; 197 198 switch (which) 199 { 200 case 0 : 201 delete = 1; 202 break; 203 case 1 : 204 if (is->is_p != IPPROTO_TCP) 205 break; 206 if ((is->is_state[0] != TCPS_ESTABLISHED) || 207 (is->is_state[1] != TCPS_ESTABLISHED)) 208 delete = 1; 209 break; 210 } 211 212 if (delete) { 213 if (is->is_p == IPPROTO_TCP) 214 ips_stats.iss_fin++; 215 else 216 ips_stats.iss_expire++; 217#ifdef IPFILTER_LOG 218 ipstate_log(is, ISL_FLUSH); 219#endif 220 fr_delstate(is); 221 removed++; 222 } else 223 isp = &is->is_next; 224 } 225 SPL_X(s); 226 return removed; 227} 228 229 230static int fr_state_remove(data) 231caddr_t data; 232{ 233 ipstate_t *sp, st; 234 int error; 235 236 sp = &st; 237 error = IRCOPYPTR(data, (caddr_t)&st, sizeof(st)); 238 if (error) 239 return EFAULT; 240 241 for (sp = ips_list; sp; sp = sp->is_next) 242 if ((sp->is_p == st.is_p) && (sp->is_v == st.is_v) && 243 !bcmp(&sp->is_src, &st.is_src, sizeof(st.is_src)) && 244 !bcmp(&sp->is_dst, &st.is_src, sizeof(st.is_dst)) && 245 !bcmp(&sp->is_ps, &st.is_ps, sizeof(st.is_ps))) { 246 WRITE_ENTER(&ipf_state); 247#ifdef IPFILTER_LOG 248 ipstate_log(sp, ISL_REMOVE); 249#endif 250 fr_delstate(sp); 251 RWLOCK_EXIT(&ipf_state); 252 return 0; 253 } 254 return ESRCH; 255} 256 257 258int fr_state_ioctl(data, cmd, mode) 259caddr_t data; 260#if defined(__NetBSD__) || defined(__OpenBSD__) 261u_long cmd; 262#else 263int cmd; 264#endif 265int mode; 266{ 267 int arg, ret, error = 0; 268 269 switch (cmd) 270 { 271 case SIOCDELST : 272 error = fr_state_remove(data); 273 break; 274 case SIOCIPFFL : 275 error = IRCOPY(data, (caddr_t)&arg, sizeof(arg)); 276 if (error) 277 break; 278 if (arg == 0 || arg == 1) { 279 WRITE_ENTER(&ipf_state); 280 ret = fr_state_flush(arg); 281 RWLOCK_EXIT(&ipf_state); 282 error = IWCOPY((caddr_t)&ret, data, sizeof(ret)); 283 } else 284 error = EINVAL; 285 break; 286#ifdef IPFILTER_LOG 287 case SIOCIPFFB : 288 if (!(mode & FWRITE)) 289 error = EPERM; 290 else { 291 int tmp; 292 293 tmp = ipflog_clear(IPL_LOGSTATE); 294 IWCOPY((char *)&tmp, data, sizeof(tmp)); 295 } 296 break; 297#endif 298 case SIOCGETFS : 299 error = IWCOPYPTR((caddr_t)fr_statetstats(), data, 300 sizeof(ips_stat_t)); 301 break; 302 case FIONREAD : 303#ifdef IPFILTER_LOG 304 error = IWCOPY((caddr_t)&iplused[IPL_LOGSTATE], (caddr_t)data, 305 sizeof(iplused[IPL_LOGSTATE])); 306#endif 307 break; 308 case SIOCSTLCK : 309 error = fr_lock(data, &fr_state_lock); 310 break; 311 case SIOCSTPUT : 312 if (!fr_state_lock) { 313 error = EACCES; 314 break; 315 } 316 error = fr_stputent(data); 317 break; 318 case SIOCSTGET : 319 if (!fr_state_lock) { 320 error = EACCES; 321 break; 322 } 323 error = fr_stgetent(data); 324 break; 325 default : 326 error = EINVAL; 327 break; 328 } 329 return error; 330} 331 332 333int fr_stgetent(data) 334caddr_t data; 335{ 336 register ipstate_t *is, *isn; 337 ipstate_save_t ips, *ipsp; 338 int error; 339 340 error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp)); 341 if (error) 342 return EFAULT; 343 error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips)); 344 if (error) 345 return EFAULT; 346 347 isn = ips.ips_next; 348 if (!isn) { 349 isn = ips_list; 350 if (isn == NULL) { 351 if (ips.ips_next == NULL) 352 return ENOENT; 353 return 0; 354 } 355 } else { 356 /* 357 * Make sure the pointer we're copying from exists in the 358 * current list of entries. Security precaution to prevent 359 * copying of random kernel data. 360 */ 361 for (is = ips_list; is; is = is->is_next) 362 if (is == isn) 363 break; 364 if (!is) 365 return ESRCH; 366 } 367 ips.ips_next = isn->is_next; 368 bcopy((char *)isn, (char *)&ips.ips_is, sizeof(ips.ips_is)); 369 if (isn->is_rule) 370 bcopy((char *)isn->is_rule, (char *)&ips.ips_fr, 371 sizeof(ips.ips_fr)); 372 error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips)); 373 if (error) 374 error = EFAULT; 375 return error; 376} 377 378 379int fr_stputent(data) 380caddr_t data; 381{ 382 register ipstate_t *is, *isn; 383 ipstate_save_t ips, *ipsp; 384 int error, out; 385 frentry_t *fr; 386 387 error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp)); 388 if (error) 389 return EFAULT; 390 error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips)); 391 if (error) 392 return EFAULT; 393 394 KMALLOC(isn, ipstate_t *); 395 if (isn == NULL) 396 return ENOMEM; 397 398 bcopy((char *)&ips.ips_is, (char *)isn, sizeof(*isn)); 399 fr = isn->is_rule; 400 if (fr != NULL) { 401 if (isn->is_flags & FI_NEWFR) { 402 KMALLOC(fr, frentry_t *); 403 if (fr == NULL) { 404 KFREE(isn); 405 return ENOMEM; 406 } 407 bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr)); 408 out = fr->fr_flags & FR_OUTQUE ? 1 : 0; 409 isn->is_rule = fr; 410 ips.ips_is.is_rule = fr; 411 if (*fr->fr_ifname) { 412 fr->fr_ifa = GETUNIT(fr->fr_ifname, fr->fr_v); 413 if (fr->fr_ifa == NULL) 414 fr->fr_ifa = (void *)-1; 415#ifdef _KERNEL 416 else { 417 strncpy(isn->is_ifname[out], 418 IFNAME(fr->fr_ifa), IFNAMSIZ); 419 isn->is_ifp[out] = fr->fr_ifa; 420 } 421#endif 422 } else 423 fr->fr_ifa = NULL; 424 /* 425 * send a copy back to userland of what we ended up 426 * to allow for verification. 427 */ 428 error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips)); 429 if (error) { 430 KFREE(isn); 431 KFREE(fr); 432 return EFAULT; 433 } 434 } else { 435 for (is = ips_list; is; is = is->is_next) 436 if (is->is_rule == fr) 437 break; 438 if (!is) { 439 KFREE(isn); 440 return ESRCH; 441 } 442 } 443 } 444 fr_stinsert(isn); 445 return 0; 446} 447 448 449void fr_stinsert(is) 450register ipstate_t *is; 451{ 452 register u_int hv = is->is_hv; 453 454 MUTEX_INIT(&is->is_lock, "ipf state entry", NULL); 455 456 is->is_ifname[0][sizeof(is->is_ifname[0]) - 1] = '\0'; 457 if (is->is_ifname[0][0] != '\0') { 458 is->is_ifp[0] = GETUNIT(is->is_ifname[0], is->is_v); 459 } 460 is->is_ifname[1][sizeof(is->is_ifname[0]) - 1] = '\0'; 461 if (is->is_ifname[1][0] != '\0') { 462 is->is_ifp[1] = GETUNIT(is->is_ifname[1], is->is_v); 463 } 464 465 /* 466 * add into list table. 467 */ 468 if (ips_list) 469 ips_list->is_pnext = &is->is_next; 470 is->is_pnext = &ips_list; 471 is->is_next = ips_list; 472 ips_list = is; 473 if (ips_table[hv]) 474 ips_table[hv]->is_phnext = &is->is_hnext; 475 else 476 ips_stats.iss_inuse++; 477 is->is_phnext = ips_table + hv; 478 is->is_hnext = ips_table[hv]; 479 ips_table[hv] = is; 480 ips_num++; 481} 482 483 484/* 485 * Create a new ipstate structure and hang it off the hash table. 486 */ 487ipstate_t *fr_addstate(ip, fin, flags) 488ip_t *ip; 489fr_info_t *fin; 490u_int flags; 491{ 492 register tcphdr_t *tcp = NULL; 493 register ipstate_t *is; 494 register u_int hv; 495 ipstate_t ips; 496 u_int pass; 497 int out; 498 499 if (fr_state_lock || (fin->fin_off & IP_OFFMASK) || 500 (fin->fin_fi.fi_fl & FI_SHORT)) 501 return NULL; 502 if (ips_num == fr_statemax) { 503 ips_stats.iss_max++; 504 fr_state_doflush = 1; 505 return NULL; 506 } 507 out = fin->fin_out; 508 is = &ips; 509 bzero((char *)is, sizeof(*is)); 510 ips.is_age = 1; 511 ips.is_state[0] = 0; 512 ips.is_state[1] = 0; 513 /* 514 * Copy and calculate... 515 */ 516 hv = (is->is_p = fin->fin_fi.fi_p); 517 is->is_src = fin->fin_fi.fi_src; 518 hv += is->is_saddr; 519 is->is_dst = fin->fin_fi.fi_dst; 520 hv += is->is_daddr; 521#ifdef USE_INET6 522 if (fin->fin_v == 6) { 523 if (is->is_p == IPPROTO_ICMPV6) { 524 if (IN6_IS_ADDR_MULTICAST(&is->is_dst.in6)) 525 flags |= FI_W_DADDR; 526 if (out) 527 hv -= is->is_daddr; 528 else 529 hv -= is->is_saddr; 530 } 531 } 532#endif 533 534 switch (is->is_p) 535 { 536#ifdef USE_INET6 537 case IPPROTO_ICMPV6 : 538#endif 539 case IPPROTO_ICMP : 540 { 541 struct icmp *ic = (struct icmp *)fin->fin_dp; 542 543#ifdef USE_INET6 544 if ((is->is_p == IPPROTO_ICMPV6) && 545 ((ic->icmp_type & ICMP6_INFOMSG_MASK) == 0)) 546 return NULL; 547#endif 548 switch (ic->icmp_type) 549 { 550#ifdef USE_INET6 551 case ICMP6_ECHO_REQUEST : 552 is->is_icmp.ics_type = ICMP6_ECHO_REPLY; 553 hv += (is->is_icmp.ics_id = ic->icmp_id); 554 hv += (is->is_icmp.ics_seq = ic->icmp_seq); 555 break; 556 case ICMP6_MEMBERSHIP_QUERY : 557 case ND_ROUTER_SOLICIT : 558 case ND_NEIGHBOR_SOLICIT : 559 is->is_icmp.ics_type = ic->icmp_type + 1; 560 break; 561#endif 562 case ICMP_ECHO : 563 case ICMP_TSTAMP : 564 case ICMP_IREQ : 565 case ICMP_MASKREQ : 566 is->is_icmp.ics_type = ic->icmp_type; 567 hv += (is->is_icmp.ics_id = ic->icmp_id); 568 hv += (is->is_icmp.ics_seq = ic->icmp_seq); 569 break; 570 default : 571 return NULL; 572 } 573 ATOMIC_INCL(ips_stats.iss_icmp); 574 is->is_age = fr_icmptimeout; 575 break; 576 } 577 case IPPROTO_TCP : 578 { 579 tcp = (tcphdr_t *)fin->fin_dp; 580 581 if (tcp->th_flags & TH_RST) 582 return NULL; 583 /* 584 * The endian of the ports doesn't matter, but the ack and 585 * sequence numbers do as we do mathematics on them later. 586 */ 587 is->is_dport = tcp->th_dport; 588 is->is_sport = tcp->th_sport; 589 if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { 590 hv += tcp->th_dport; 591 hv += tcp->th_sport; 592 } 593 is->is_send = ntohl(tcp->th_seq) + ip->ip_len - 594 fin->fin_hlen - (tcp->th_off << 2) + 595 ((tcp->th_flags & TH_SYN) ? 1 : 0) + 596 ((tcp->th_flags & TH_FIN) ? 1 : 0); 597 is->is_maxsend = is->is_send; 598 is->is_dend = 0; 599 is->is_maxdwin = 1; 600 is->is_maxswin = ntohs(tcp->th_win); 601 if (is->is_maxswin == 0) 602 is->is_maxswin = 1; 603 /* 604 * If we're creating state for a starting connection, start the 605 * timer on it as we'll never see an error if it fails to 606 * connect. 607 */ 608 ATOMIC_INCL(ips_stats.iss_tcp); 609 break; 610 } 611 case IPPROTO_UDP : 612 { 613 tcp = (tcphdr_t *)fin->fin_dp; 614 615 is->is_dport = tcp->th_dport; 616 is->is_sport = tcp->th_sport; 617 if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { 618 hv += tcp->th_dport; 619 hv += tcp->th_sport; 620 } 621 ATOMIC_INCL(ips_stats.iss_udp); 622 is->is_age = fr_udptimeout; 623 break; 624 } 625 default : 626 return NULL; 627 } 628 629 KMALLOC(is, ipstate_t *); 630 if (is == NULL) { 631 ATOMIC_INCL(ips_stats.iss_nomem); 632 return NULL; 633 } 634 bcopy((char *)&ips, (char *)is, sizeof(*is)); 635 hv %= fr_statesize; 636 is->is_hv = hv; 637 is->is_rule = fin->fin_fr; 638 if (is->is_rule != NULL) { 639 ATOMIC_INC32(is->is_rule->fr_ref); 640 pass = is->is_rule->fr_flags; 641 } else 642 pass = fr_flags; 643 WRITE_ENTER(&ipf_state); 644 645 is->is_pass = pass; 646 is->is_pkts = 1; 647 is->is_bytes = fin->fin_dlen + fin->fin_hlen; 648 /* 649 * We want to check everything that is a property of this packet, 650 * but we don't (automatically) care about it's fragment status as 651 * this may change. 652 */ 653 is->is_v = fin->fin_fi.fi_v; 654 is->is_opt = fin->fin_fi.fi_optmsk; 655 is->is_optmsk = 0xffffffff; 656 is->is_sec = fin->fin_fi.fi_secmsk; 657 is->is_secmsk = 0xffff; 658 is->is_auth = fin->fin_fi.fi_auth; 659 is->is_authmsk = 0xffff; 660 is->is_flags = fin->fin_fi.fi_fl & FI_CMP; 661 is->is_flags |= FI_CMP << 4; 662 is->is_flags |= flags & (FI_WILDP|FI_WILDA); 663 is->is_ifp[1 - out] = NULL; 664 is->is_ifp[out] = fin->fin_ifp; 665#ifdef _KERNEL 666 strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), IFNAMSIZ); 667#endif 668 is->is_ifname[1 - out][0] = '\0'; 669 if (pass & FR_LOGFIRST) 670 is->is_pass &= ~(FR_LOGFIRST|FR_LOG); 671 fr_stinsert(is); 672 if (is->is_p == IPPROTO_TCP) { 673 MUTEX_ENTER(&is->is_lock); 674 fr_tcp_age(&is->is_age, is->is_state, fin, 675 0); /* 0 = packet from the source */ 676 MUTEX_EXIT(&is->is_lock); 677 } 678#ifdef IPFILTER_LOG 679 ipstate_log(is, ISL_NEW); 680#endif 681 RWLOCK_EXIT(&ipf_state); 682 fin->fin_rev = IP6NEQ(is->is_dst, fin->fin_fi.fi_dst); 683 if (fin->fin_fi.fi_fl & FI_FRAG) 684 ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); 685 return is; 686} 687 688 689 690/* 691 * check to see if a packet with TCP headers fits within the TCP window. 692 * change timeout depending on whether new packet is a SYN-ACK returning for a 693 * SYN or a RST or FIN which indicate time to close up shop. 694 */ 695int fr_tcpstate(is, fin, ip, tcp) 696register ipstate_t *is; 697fr_info_t *fin; 698ip_t *ip; 699tcphdr_t *tcp; 700{ 701 register tcp_seq seq, ack, end; 702 register int ackskew; 703 tcpdata_t *fdata, *tdata; 704 u_short win, maxwin; 705 int ret = 0; 706 int source; 707 708 /* 709 * Find difference between last checked packet and this packet. 710 */ 711 source = IP6EQ(fin->fin_fi.fi_src, is->is_src); 712 fdata = &is->is_tcp.ts_data[!source]; 713 tdata = &is->is_tcp.ts_data[source]; 714 seq = ntohl(tcp->th_seq); 715 ack = ntohl(tcp->th_ack); 716 win = ntohs(tcp->th_win); 717 end = seq + fin->fin_dlen - (tcp->th_off << 2) + 718 ((tcp->th_flags & TH_SYN) ? 1 : 0) + 719 ((tcp->th_flags & TH_FIN) ? 1 : 0); 720 721 if (fdata->td_end == 0) { 722 /* 723 * Must be a (outgoing) SYN-ACK in reply to a SYN. 724 */ 725 fdata->td_end = end; 726 fdata->td_maxwin = 1; 727 fdata->td_maxend = end + 1; 728 } 729 730 if (!(tcp->th_flags & TH_ACK)) { /* Pretend an ack was sent */ 731 ack = tdata->td_end; 732 } else if (((tcp->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) && 733 (ack == 0)) { 734 /* gross hack to get around certain broken tcp stacks */ 735 ack = tdata->td_end; 736 } 737 738 if (seq == end) 739 seq = end = fdata->td_end; 740 741 maxwin = tdata->td_maxwin; 742 ackskew = tdata->td_end - ack; 743 744#define SEQ_GE(a,b) ((int)((a) - (b)) >= 0) 745#define SEQ_GT(a,b) ((int)((a) - (b)) > 0) 746 if ((SEQ_GE(fdata->td_maxend, end)) && 747 (SEQ_GE(seq, fdata->td_end - maxwin)) && 748/* XXX what about big packets */ 749#define MAXACKWINDOW 66000 750 (ackskew >= -MAXACKWINDOW) && 751 (ackskew <= MAXACKWINDOW)) { 752 /* if ackskew < 0 then this should be due to fragented 753 * packets. There is no way to know the length of the 754 * total packet in advance. 755 * We do know the total length from the fragment cache though. 756 * Note however that there might be more sessions with 757 * exactly the same source and destination paramters in the 758 * state cache (and source and destination is the only stuff 759 * that is saved in the fragment cache). Note further that 760 * some TCP connections in the state cache are hashed with 761 * sport and dport as well which makes it not worthwhile to 762 * look for them. 763 * Thus, when ackskew is negative but still seems to belong 764 * to this session, we bump up the destinations end value. 765 */ 766 if (ackskew < 0) 767 tdata->td_end = ack; 768 769 /* update max window seen */ 770 if (fdata->td_maxwin < win) 771 fdata->td_maxwin = win; 772 if (SEQ_GT(end, fdata->td_end)) 773 fdata->td_end = end; 774 if (SEQ_GE(ack + win, tdata->td_maxend)) { 775 tdata->td_maxend = ack + win; 776 if (win == 0) 777 tdata->td_maxend++; 778 } 779 780 ATOMIC_INCL(ips_stats.iss_hits); 781 is->is_pkts++; 782 is->is_bytes += fin->fin_dlen + fin->fin_hlen; 783 /* 784 * Nearing end of connection, start timeout. 785 */ 786 MUTEX_ENTER(&is->is_lock); 787 /* source ? 0 : 1 -> !source */ 788 fr_tcp_age(&is->is_age, is->is_state, fin, !source); 789 MUTEX_EXIT(&is->is_lock); 790 ret = 1; 791 } 792 return ret; 793} 794 795 796static int fr_matchsrcdst(is, src, dst, fin, tcp) 797ipstate_t *is; 798union i6addr src, dst; 799fr_info_t *fin; 800tcphdr_t *tcp; 801{ 802 int ret = 0, rev, out, flags; 803 u_short sp, dp; 804 void *ifp; 805 806 rev = fin->fin_rev = IP6NEQ(is->is_dst, dst); 807 ifp = fin->fin_ifp; 808 out = fin->fin_out; 809 810 if (tcp != NULL) { 811 flags = is->is_flags; 812 sp = tcp->th_sport; 813 dp = tcp->th_dport; 814 } else { 815 flags = is->is_flags & FI_WILDA; 816 sp = 0; 817 dp = 0; 818 } 819 820 if (rev == 0) { 821 if (!out) { 822 if (is->is_ifpin == NULL || is->is_ifpin == ifp) 823 ret = 1; 824 } else { 825 if (is->is_ifpout == NULL || is->is_ifpout == ifp) 826 ret = 1; 827 } 828 } else { 829 if (out) { 830 if (is->is_ifpin == NULL || is->is_ifpin == ifp) 831 ret = 1; 832 } else { 833 if (is->is_ifpout == NULL || is->is_ifpout == ifp) 834 ret = 1; 835 } 836 } 837 if (ret == 0) 838 return 0; 839 ret = 0; 840 841 if (rev == 0) { 842 if ( 843 (IP6EQ(is->is_dst, dst) || (flags & FI_W_DADDR)) && 844 (IP6EQ(is->is_src, src) || (flags & FI_W_SADDR)) && 845 (!tcp || ((sp == is->is_sport || flags & FI_W_SPORT) && 846 (dp == is->is_dport || flags & FI_W_DPORT)))) { 847 ret = 1; 848 } 849 } else { 850 if ( 851 (IP6EQ(is->is_dst, src) || (flags & FI_W_DADDR)) && 852 (IP6EQ(is->is_src, dst) || (flags & FI_W_SADDR)) && 853 (!tcp || ((sp == is->is_dport || flags & FI_W_DPORT) && 854 (dp == is->is_sport || flags & FI_W_SPORT)))) { 855 ret = 1; 856 } 857 } 858 if (ret == 0) 859 return 0; 860 861 /* 862 * Whether or not this should be here, is questionable, but the aim 863 * is to get this out of the main line. 864 */ 865 if (tcp == NULL) 866 flags = is->is_flags & (FI_CMP|(FI_CMP<<4)); 867 868 if (((fin->fin_fi.fi_fl & (flags >> 4)) != (flags & FI_CMP)) || 869 ((fin->fin_fi.fi_optmsk & is->is_optmsk) != is->is_opt) || 870 ((fin->fin_fi.fi_secmsk & is->is_secmsk) != is->is_sec) || 871 ((fin->fin_fi.fi_auth & is->is_authmsk) != is->is_auth)) 872 return 0; 873 874 if ((flags & (FI_W_SPORT|FI_W_DPORT))) { 875 if ((flags & FI_W_SPORT) != 0) { 876 if (rev == 0) { 877 is->is_sport = sp; 878 is->is_send = htonl(tcp->th_seq); 879 } else { 880 is->is_sport = dp; 881 is->is_send = htonl(tcp->th_ack); 882 } 883 is->is_maxsend = is->is_send + 1; 884 } else if ((flags & FI_W_DPORT) != 0) { 885 if (rev == 0) { 886 is->is_dport = dp; 887 is->is_dend = htonl(tcp->th_ack); 888 } else { 889 is->is_dport = sp; 890 is->is_dend = htonl(tcp->th_seq); 891 } 892 is->is_maxdend = is->is_dend + 1; 893 } 894 is->is_flags &= ~(FI_W_SPORT|FI_W_DPORT); 895 } 896 897 ret = -1; 898 899 if (!rev) { 900 if (out) { 901 if (!is->is_ifpout) 902 ret = 1; 903 } else { 904 if (!is->is_ifpin) 905 ret = 0; 906 } 907 } else { 908 if (out) { 909 if (!is->is_ifpin) 910 ret = 0; 911 } else { 912 if (!is->is_ifpout) 913 ret = 1; 914 } 915 } 916 917 if (ret >= 0) { 918 is->is_ifp[ret] = ifp; 919#ifdef _KERNEL 920 strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), 921 sizeof(is->is_ifname[1])); 922#endif 923 } 924#ifdef _KERNEL 925 if (ret >= 0) { 926 strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), 927 sizeof(is->is_ifname[1])); 928 } 929#endif 930 return 1; 931} 932 933static int fr_matchicmpqueryreply(v, is, icmp) 934int v; 935ipstate_t *is; 936icmphdr_t *icmp; 937{ 938 if (v == 4) { 939 /* 940 * If we matched its type on the way in, then when going out 941 * it will still be the same type. 942 */ 943 if (((icmp->icmp_type == is->is_type) || 944 (icmpreplytype4[is->is_type] == icmp->icmp_type)) && 945 (icmp->icmp_id == is->is_icmp.ics_id) && 946 (icmp->icmp_seq == is->is_icmp.ics_seq)) { 947 return 1; 948 }; 949 } 950#ifdef USE_INET6 951 else if (is->is_v == 6) { 952 if ((is->is_type == ICMP6_ECHO_REPLY) && 953 (icmp->icmp_type == ICMP6_ECHO_REQUEST) && 954 (icmp->icmp_id == is->is_icmp.ics_id) && 955 (icmp->icmp_seq == is->is_icmp.ics_seq)) { 956 return 1; 957 }; 958 } 959#endif 960 return 0; 961} 962 963static frentry_t *fr_checkicmpmatchingstate(ip, fin) 964ip_t *ip; 965fr_info_t *fin; 966{ 967 register ipstate_t *is, **isp; 968 register u_short sport, dport; 969 register u_char pr; 970 union i6addr dst, src; 971 struct icmp *ic; 972 u_short savelen; 973 icmphdr_t *icmp; 974 fr_info_t ofin; 975 int type, len; 976 tcphdr_t *tcp; 977 frentry_t *fr; 978 ip_t *oip; 979 u_int hv; 980 981 /* 982 * Does it at least have the return (basic) IP header ? 983 * Only a basic IP header (no options) should be with 984 * an ICMP error header. 985 */ 986 if (((ip->ip_v != 4) && (ip->ip_hl != 5)) || 987 (fin->fin_plen < ICMPERR_MINPKTLEN)) 988 return NULL; 989 ic = (struct icmp *)fin->fin_dp; 990 type = ic->icmp_type; 991 /* 992 * If it's not an error type, then return 993 */ 994 if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) && 995 (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) && 996 (type != ICMP_PARAMPROB)) 997 return NULL; 998 999 oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN); 1000 if (fin->fin_plen < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2)) 1001 return NULL; 1002 1003 /* 1004 * Sanity checks. 1005 */ 1006 len = fin->fin_dlen - ICMPERR_ICMPHLEN; 1007 if ((len <= 0) || ((oip->ip_hl << 2) > len)) 1008 return NULL; 1009 1010 /* 1011 * Is the buffer big enough for all of it ? It's the size of the IP 1012 * header claimed in the encapsulated part which is of concern. It 1013 * may be too big to be in this buffer but not so big that it's 1014 * outside the ICMP packet, leading to TCP deref's causing problems. 1015 * This is possible because we don't know how big oip_hl is when we 1016 * do the pullup early in fr_check() and thus can't gaurantee it is 1017 * all here now. 1018 */ 1019#ifdef _KERNEL 1020 { 1021 mb_t *m; 1022 1023# if SOLARIS 1024 m = fin->fin_qfm; 1025 if ((char *)oip + len > (char *)m->b_wptr) 1026 return NULL; 1027# else 1028 m = *(mb_t **)fin->fin_mp; 1029 if ((char *)oip + len > (char *)ip + m->m_len) 1030 return NULL; 1031# endif 1032 } 1033#endif 1034 1035 /* 1036 * in the IPv4 case we must zero the i6addr union otherwise 1037 * the IP6EQ and IP6NEQ macros produce the wrong results because 1038 * of the 'junk' in the unused part of the union 1039 */ 1040 bzero(&src, sizeof(src)); 1041 bzero(&dst, sizeof(dst)); 1042 1043 if (oip->ip_p == IPPROTO_ICMP) { 1044 icmp = (icmphdr_t *)((char *)oip + (oip->ip_hl << 2)); 1045 1046 /* 1047 * a ICMP error can only be generated as a result of an 1048 * ICMP query, not as the response on an ICMP error 1049 * 1050 * XXX theoretically ICMP_ECHOREP and the other reply's are 1051 * ICMP query's as well, but adding them here seems strange XXX 1052 */ 1053 if ((icmp->icmp_type != ICMP_ECHO) && 1054 (icmp->icmp_type != ICMP_TSTAMP) && 1055 (icmp->icmp_type != ICMP_IREQ) && 1056 (icmp->icmp_type != ICMP_MASKREQ)) 1057 return NULL; 1058 1059 /* 1060 * perform a lookup of the ICMP packet in the state table 1061 */ 1062 hv = (pr = oip->ip_p); 1063 src.in4 = oip->ip_src; 1064 hv += src.in4.s_addr; 1065 dst.in4 = oip->ip_dst; 1066 hv += dst.in4.s_addr; 1067 hv += icmp->icmp_id; 1068 hv += icmp->icmp_seq; 1069 hv %= fr_statesize; 1070 1071 savelen = oip->ip_len; 1072 oip->ip_len = len; 1073 ofin.fin_v = 4; 1074 fr_makefrip(oip->ip_hl << 2, oip, &ofin); 1075 oip->ip_len = savelen; 1076 ofin.fin_ifp = fin->fin_ifp; 1077 ofin.fin_out = !fin->fin_out; 1078 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ 1079 1080 READ_ENTER(&ipf_state); 1081 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) 1082 if ((is->is_p == pr) && (is->is_v == 4) && 1083 fr_matchsrcdst(is, src, dst, &ofin, NULL) && 1084 fr_matchicmpqueryreply(is->is_v, is, icmp)) { 1085 ips_stats.iss_hits++; 1086 is->is_pkts++; 1087 is->is_bytes += ip->ip_len; 1088 fr = is->is_rule; 1089 RWLOCK_EXIT(&ipf_state); 1090 return fr; 1091 } 1092 RWLOCK_EXIT(&ipf_state); 1093 return NULL; 1094 }; 1095 1096 if ((oip->ip_p != IPPROTO_TCP) && (oip->ip_p != IPPROTO_UDP)) 1097 return NULL; 1098 1099 tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2)); 1100 dport = tcp->th_dport; 1101 sport = tcp->th_sport; 1102 1103 hv = (pr = oip->ip_p); 1104 src.in4 = oip->ip_src; 1105 hv += src.in4.s_addr; 1106 dst.in4 = oip->ip_dst; 1107 hv += dst.in4.s_addr; 1108 hv += dport; 1109 hv += sport; 1110 hv %= fr_statesize; 1111 /* 1112 * we make an fin entry to be able to feed it to 1113 * matchsrcdst note that not all fields are encessary 1114 * but this is the cleanest way. Note further we fill 1115 * in fin_mp such that if someone uses it we'll get 1116 * a kernel panic. fr_matchsrcdst does not use this. 1117 * 1118 * watch out here, as ip is in host order and oip in network 1119 * order. Any change we make must be undone afterwards. 1120 */ 1121 savelen = oip->ip_len; 1122 oip->ip_len = len; 1123 ofin.fin_v = 4; 1124 fr_makefrip(oip->ip_hl << 2, oip, &ofin); 1125 oip->ip_len = savelen; 1126 ofin.fin_ifp = fin->fin_ifp; 1127 ofin.fin_out = !fin->fin_out; 1128 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ 1129 READ_ENTER(&ipf_state); 1130 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) { 1131 /* 1132 * Only allow this icmp though if the 1133 * encapsulated packet was allowed through the 1134 * other way around. Note that the minimal amount 1135 * of info present does not allow for checking against 1136 * tcp internals such as seq and ack numbers. 1137 */ 1138 if ((is->is_p == pr) && (is->is_v == 4) && 1139 fr_matchsrcdst(is, src, dst, &ofin, tcp)) { 1140 fr = is->is_rule; 1141 ips_stats.iss_hits++; 1142 /* 1143 * we must swap src and dst here because the icmp 1144 * comes the other way around 1145 */ 1146 is->is_pkts++; 1147 is->is_bytes += fin->fin_plen; 1148 /* 1149 * we deliberately do not touch the timeouts 1150 * for the accompanying state table entry. 1151 * It remains to be seen if that is correct. XXX 1152 */ 1153 RWLOCK_EXIT(&ipf_state); 1154 return fr; 1155 } 1156 } 1157 RWLOCK_EXIT(&ipf_state); 1158 return NULL; 1159} 1160 1161/* 1162 * Check if a packet has a registered state. 1163 */ 1164frentry_t *fr_checkstate(ip, fin) 1165ip_t *ip; 1166fr_info_t *fin; 1167{ 1168 union i6addr dst, src; 1169 register ipstate_t *is, **isp; 1170 register u_char pr; 1171 u_int hv, hvm, hlen, tryagain, pass, v; 1172 struct icmp *ic; 1173 frentry_t *fr; 1174 tcphdr_t *tcp; 1175 1176 if (fr_state_lock || (fin->fin_off & IP_OFFMASK) || 1177 (fin->fin_fi.fi_fl & FI_SHORT)) 1178 return NULL; 1179 1180 is = NULL; 1181 hlen = fin->fin_hlen; 1182 tcp = (tcphdr_t *)((char *)ip + hlen); 1183 ic = (struct icmp *)tcp; 1184 hv = (pr = fin->fin_fi.fi_p); 1185 src = fin->fin_fi.fi_src; 1186 dst = fin->fin_fi.fi_dst; 1187 hv += src.in4.s_addr; 1188 hv += dst.in4.s_addr; 1189 1190 /* 1191 * Search the hash table for matching packet header info. 1192 */ 1193 v = fin->fin_fi.fi_v; 1194 switch (fin->fin_fi.fi_p) 1195 { 1196#ifdef USE_INET6 1197 case IPPROTO_ICMPV6 : 1198 if (v == 6) { 1199 if (fin->fin_out) 1200 hv -= dst.in4.s_addr; 1201 else 1202 hv -= src.in4.s_addr; 1203 if ((ic->icmp_type == ICMP6_ECHO_REQUEST) || 1204 (ic->icmp_type == ICMP6_ECHO_REPLY)) { 1205 hv += ic->icmp_id; 1206 hv += ic->icmp_seq; 1207 } 1208 } 1209#endif 1210 case IPPROTO_ICMP : 1211 if (v == 4) { 1212 hv += ic->icmp_id; 1213 hv += ic->icmp_seq; 1214 } 1215 hv %= fr_statesize; 1216 READ_ENTER(&ipf_state); 1217 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) { 1218 if ((is->is_p == pr) && (is->is_v == v) && 1219 fr_matchsrcdst(is, src, dst, fin, NULL) && 1220 fr_matchicmpqueryreply(v, is, ic)) { 1221 is->is_age = fr_icmptimeout; 1222 break; 1223 } 1224 } 1225 if (is != NULL) 1226 break; 1227 RWLOCK_EXIT(&ipf_state); 1228 /* 1229 * No matching icmp state entry. Perhaps this is a 1230 * response to another state entry. 1231 */ 1232#ifdef USE_INET6 1233 if (v == 6) 1234 fr = fr_checkicmp6matchingstate((ip6_t *)ip, fin); 1235 else 1236#endif 1237 fr = fr_checkicmpmatchingstate(ip, fin); 1238 if (fr) 1239 return fr; 1240 break; 1241 case IPPROTO_TCP : 1242 { 1243 register u_short dport = tcp->th_dport, sport = tcp->th_sport; 1244 register int i; 1245 1246 i = tcp->th_flags; 1247 /* 1248 * Just plain ignore RST flag set with either FIN or SYN. 1249 */ 1250 if ((i & TH_RST) && 1251 ((i & (TH_FIN|TH_SYN|TH_RST)) != TH_RST)) 1252 break; 1253 tryagain = 0; 1254retry_tcp: 1255 hvm = hv % fr_statesize; 1256 WRITE_ENTER(&ipf_state); 1257 for (isp = &ips_table[hvm]; (is = *isp); 1258 isp = &is->is_hnext) 1259 1260 1261 if ((is->is_p == pr) && (is->is_v == v) && 1262 fr_matchsrcdst(is, src, dst, fin, tcp)) { 1263 if (fr_tcpstate(is, fin, ip, tcp)) 1264 break; 1265 is = NULL; 1266 break; 1267 } 1268 if (is != NULL) 1269 break; 1270 RWLOCK_EXIT(&ipf_state); 1271 hv += dport; 1272 hv += sport; 1273 if (tryagain == 0) { 1274 tryagain = 1; 1275 goto retry_tcp; 1276 } 1277 break; 1278 } 1279 case IPPROTO_UDP : 1280 { 1281 register u_short dport = tcp->th_dport, sport = tcp->th_sport; 1282 1283 tryagain = 0; 1284retry_udp: 1285 hvm = hv % fr_statesize; 1286 /* 1287 * Nothing else to match on but ports. and IP#'s 1288 */ 1289 READ_ENTER(&ipf_state); 1290 for (is = ips_table[hvm]; is; is = is->is_hnext) 1291 if ((is->is_p == pr) && (is->is_v == v) && 1292 fr_matchsrcdst(is, src, dst, fin, tcp)) { 1293 is->is_age = fr_udptimeout; 1294 break; 1295 } 1296 if (is != NULL) 1297 break; 1298 RWLOCK_EXIT(&ipf_state); 1299 hv += dport; 1300 hv += sport; 1301 if (tryagain == 0) { 1302 tryagain = 1; 1303 goto retry_udp; 1304 } 1305 break; 1306 } 1307 default : 1308 break; 1309 } 1310 if (is == NULL) { 1311 ATOMIC_INCL(ips_stats.iss_miss); 1312 return NULL; 1313 } 1314 MUTEX_ENTER(&is->is_lock); 1315 is->is_bytes += fin->fin_plen; 1316 ips_stats.iss_hits++; 1317 is->is_pkts++; 1318 MUTEX_EXIT(&is->is_lock); 1319 fr = is->is_rule; 1320 fin->fin_fr = fr; 1321 pass = is->is_pass; 1322#ifndef _KERNEL 1323 if (tcp->th_flags & TCP_CLOSE) 1324 fr_delstate(is); 1325#endif 1326 RWLOCK_EXIT(&ipf_state); 1327 if (fin->fin_fi.fi_fl & FI_FRAG) 1328 ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); 1329 return fr; 1330} 1331 1332 1333void ip_statesync(ifp) 1334void *ifp; 1335{ 1336 register ipstate_t *is; 1337 1338 WRITE_ENTER(&ipf_state); 1339 for (is = ips_list; is; is = is->is_next) { 1340 if (is->is_ifpin == ifp) { 1341 is->is_ifpin = GETUNIT(is->is_ifname[0], is->is_v); 1342 if (!is->is_ifpin) 1343 is->is_ifpin = (void *)-1; 1344 } 1345 if (is->is_ifpout == ifp) { 1346 is->is_ifpout = GETUNIT(is->is_ifname[1], is->is_v); 1347 if (!is->is_ifpout) 1348 is->is_ifpout = (void *)-1; 1349 } 1350 } 1351 RWLOCK_EXIT(&ipf_state); 1352} 1353 1354 1355static void fr_delstate(is) 1356ipstate_t *is; 1357{ 1358 frentry_t *fr; 1359 1360 if (is->is_next) 1361 is->is_next->is_pnext = is->is_pnext; 1362 *is->is_pnext = is->is_next; 1363 if (is->is_hnext) 1364 is->is_hnext->is_phnext = is->is_phnext; 1365 *is->is_phnext = is->is_hnext; 1366 if (ips_table[is->is_hv] == NULL) 1367 ips_stats.iss_inuse--; 1368 1369 fr = is->is_rule; 1370 if (fr != NULL) { 1371 ATOMIC_DEC32(fr->fr_ref); 1372 if (fr->fr_ref == 0) 1373 KFREE(fr); 1374 } 1375#ifdef _KERNEL 1376 MUTEX_DESTROY(&is->is_lock); 1377#endif 1378 KFREE(is); 1379 ips_num--; 1380} 1381 1382 1383/* 1384 * Free memory in use by all state info. kept. 1385 */ 1386void fr_stateunload() 1387{ 1388 register ipstate_t *is; 1389 1390 WRITE_ENTER(&ipf_state); 1391 while ((is = ips_list)) 1392 fr_delstate(is); 1393 ips_stats.iss_inuse = 0; 1394 ips_num = 0; 1395 RWLOCK_EXIT(&ipf_state); 1396 KFREES(ips_table, fr_statesize * sizeof(ipstate_t *)); 1397 ips_table = NULL; 1398} 1399 1400 1401/* 1402 * Slowly expire held state for thingslike UDP and ICMP. Timeouts are set 1403 * in expectation of this being called twice per second. 1404 */ 1405void fr_timeoutstate() 1406{ 1407 register ipstate_t *is, **isp; 1408#if defined(_KERNEL) && !SOLARIS 1409 int s; 1410#endif 1411 1412 SPL_NET(s); 1413 WRITE_ENTER(&ipf_state); 1414 for (isp = &ips_list; (is = *isp); ) 1415 if (is->is_age && !--is->is_age) { 1416 if (is->is_p == IPPROTO_TCP) 1417 ips_stats.iss_fin++; 1418 else 1419 ips_stats.iss_expire++; 1420#ifdef IPFILTER_LOG 1421 ipstate_log(is, ISL_EXPIRE); 1422#endif 1423 fr_delstate(is); 1424 } else 1425 isp = &is->is_next; 1426 RWLOCK_EXIT(&ipf_state); 1427 SPL_X(s); 1428 if (fr_state_doflush) { 1429 (void) fr_state_flush(1); 1430 fr_state_doflush = 0; 1431 } 1432} 1433 1434 1435/* 1436 * Original idea freom Pradeep Krishnan for use primarily with NAT code. 1437 * (pkrishna@netcom.com) 1438 * 1439 * Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29: 1440 * 1441 * - (try to) base state transitions on real evidence only, 1442 * i.e. packets that are sent and have been received by ipfilter; 1443 * diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used. 1444 * 1445 * - deal with half-closed connections correctly; 1446 * 1447 * - store the state of the source in state[0] such that ipfstat 1448 * displays the state as source/dest instead of dest/source; the calls 1449 * to fr_tcp_age have been changed accordingly. 1450 * 1451 * Parameters: 1452 * 1453 * state[0] = state of source (host that initiated connection) 1454 * state[1] = state of dest (host that accepted the connection) 1455 * 1456 * dir == 0 : a packet from source to dest 1457 * dir == 1 : a packet from dest to source 1458 * 1459 */ 1460void fr_tcp_age(age, state, fin, dir) 1461u_long *age; 1462u_char *state; 1463fr_info_t *fin; 1464int dir; 1465{ 1466 tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; 1467 u_char flags = tcp->th_flags; 1468 int dlen, ostate; 1469 1470 ostate = state[1 - dir]; 1471 1472 dlen = fin->fin_plen - fin->fin_hlen - (tcp->th_off << 2); 1473 1474 if (flags & TH_RST) { 1475 if (!(tcp->th_flags & TH_PUSH) && !dlen) { 1476 *age = fr_tcpclosed; 1477 state[dir] = TCPS_CLOSED; 1478 } else { 1479 *age = fr_tcpclosewait; 1480 state[dir] = TCPS_CLOSE_WAIT; 1481 } 1482 return; 1483 } 1484 1485 *age = fr_tcptimeout; /* default 4 mins */ 1486 1487 switch(state[dir]) 1488 { 1489 case TCPS_CLOSED: /* 0 */ 1490 if ((flags & TH_OPENING) == TH_OPENING) { 1491 /* 1492 * 'dir' received an S and sends SA in response, 1493 * CLOSED -> SYN_RECEIVED 1494 */ 1495 state[dir] = TCPS_SYN_RECEIVED; 1496 *age = fr_tcptimeout; 1497 } else if ((flags & (TH_SYN|TH_ACK)) == TH_SYN) { 1498 /* 'dir' sent S, CLOSED -> SYN_SENT */ 1499 state[dir] = TCPS_SYN_SENT; 1500 *age = fr_tcptimeout; 1501 } 1502 /* 1503 * The next piece of code makes it possible to get 1504 * already established connections into the state table 1505 * after a restart or reload of the filter rules; this 1506 * does not work when a strict 'flags S keep state' is 1507 * used for tcp connections of course 1508 */ 1509 if ((flags & (TH_FIN|TH_SYN|TH_RST|TH_ACK)) == TH_ACK) { 1510 /* we saw an A, guess 'dir' is in ESTABLISHED mode */ 1511 state[dir] = TCPS_ESTABLISHED; 1512 *age = fr_tcpidletimeout; 1513 } 1514 /* 1515 * TODO: besides regular ACK packets we can have other 1516 * packets as well; it is yet to be determined how we 1517 * should initialize the states in those cases 1518 */ 1519 break; 1520 1521 case TCPS_LISTEN: /* 1 */ 1522 /* NOT USED */ 1523 break; 1524 1525 case TCPS_SYN_SENT: /* 2 */ 1526 if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) { 1527 /* 1528 * We see an A from 'dir' which is in SYN_SENT 1529 * state: 'dir' sent an A in response to an SA 1530 * which it received, SYN_SENT -> ESTABLISHED 1531 */ 1532 state[dir] = TCPS_ESTABLISHED; 1533 *age = fr_tcpidletimeout; 1534 } else if (flags & TH_FIN) { 1535 /* 1536 * We see an F from 'dir' which is in SYN_SENT 1537 * state and wants to close its side of the 1538 * connection; SYN_SENT -> FIN_WAIT_1 1539 */ 1540 state[dir] = TCPS_FIN_WAIT_1; 1541 *age = fr_tcpidletimeout; /* or fr_tcptimeout? */ 1542 } else if ((flags & TH_OPENING) == TH_OPENING) { 1543 /* 1544 * We see an SA from 'dir' which is already in 1545 * SYN_SENT state, this means we have a 1546 * simultaneous open; SYN_SENT -> SYN_RECEIVED 1547 */ 1548 state[dir] = TCPS_SYN_RECEIVED; 1549 *age = fr_tcptimeout; 1550 } 1551 break; 1552 1553 case TCPS_SYN_RECEIVED: /* 3 */ 1554 if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) { 1555 /* 1556 * We see an A from 'dir' which was in SYN_RECEIVED 1557 * state so it must now be in established state, 1558 * SYN_RECEIVED -> ESTABLISHED 1559 */ 1560 state[dir] = TCPS_ESTABLISHED; 1561 *age = fr_tcpidletimeout; 1562 } else if (flags & TH_FIN) { 1563 /* 1564 * We see an F from 'dir' which is in SYN_RECEIVED 1565 * state and wants to close its side of the connection; 1566 * SYN_RECEIVED -> FIN_WAIT_1 1567 */ 1568 state[dir] = TCPS_FIN_WAIT_1; 1569 *age = fr_tcpidletimeout; /* or fr_tcptimeout? */ 1570 } 1571 break; 1572 1573 case TCPS_ESTABLISHED: /* 4 */ 1574 if (flags & TH_FIN) { 1575 /* 1576 * 'dir' closed its side of the connection; this 1577 * gives us a half-closed connection; 1578 * ESTABLISHED -> FIN_WAIT_1 1579 */ 1580 state[dir] = TCPS_FIN_WAIT_1; 1581 *age = fr_tcpidletimeout; 1582 } else if (flags & TH_ACK) { 1583 /* an ACK, should we exclude other flags here? */ 1584 if (ostate == TCPS_FIN_WAIT_1) { 1585 /* 1586 * We know the other side did an active close, 1587 * so we are ACKing the recvd FIN packet (does 1588 * the window matching code guarantee this?) 1589 * and go into CLOSE_WAIT state; this gives us 1590 * a half-closed connection 1591 */ 1592 state[dir] = TCPS_CLOSE_WAIT; 1593 *age = fr_tcpidletimeout; 1594 } else if (ostate < TCPS_CLOSE_WAIT) 1595 /* 1596 * Still a fully established connection, 1597 * reset timeout 1598 */ 1599 *age = fr_tcpidletimeout; 1600 } 1601 break; 1602 1603 case TCPS_CLOSE_WAIT: /* 5 */ 1604 if (flags & TH_FIN) { 1605 /* 1606 * Application closed and 'dir' sent a FIN, we're now 1607 * going into LAST_ACK state 1608 */ 1609 *age = fr_tcplastack; 1610 state[dir] = TCPS_LAST_ACK; 1611 } else { 1612 /* 1613 * We remain in CLOSE_WAIT because the other side has 1614 * closed already and we did not close our side yet; 1615 * reset timeout 1616 */ 1617 *age = fr_tcpidletimeout; 1618 } 1619 break; 1620 1621 case TCPS_FIN_WAIT_1: /* 6 */ 1622 if ((flags & TH_ACK) && ostate > TCPS_CLOSE_WAIT) { 1623 /* 1624 * If the other side is not active anymore it has sent 1625 * us a FIN packet that we are ack'ing now with an ACK; 1626 * this means both sides have now closed the connection 1627 * and we go into TIME_WAIT 1628 */ 1629 /* 1630 * XXX: how do we know we really are ACKing the FIN 1631 * packet here? does the window code guarantee that? 1632 */ 1633 state[dir] = TCPS_TIME_WAIT; 1634 *age = fr_tcptimeout; 1635 } else 1636 /* 1637 * We closed our side of the connection already but the 1638 * other side is still active (ESTABLISHED/CLOSE_WAIT); 1639 * continue with this half-closed connection 1640 */ 1641 *age = fr_tcpidletimeout; 1642 break; 1643 1644 case TCPS_CLOSING: /* 7 */ 1645 /* NOT USED */ 1646 break; 1647 1648 case TCPS_LAST_ACK: /* 8 */ 1649 if (flags & TH_ACK) { 1650 if ((flags & TH_PUSH) || dlen) 1651 /* 1652 * There is still data to be delivered, reset 1653 * timeout 1654 */ 1655 *age = fr_tcplastack; 1656 } 1657 /* 1658 * We cannot detect when we go out of LAST_ACK state to CLOSED 1659 * because that is based on the reception of ACK packets; 1660 * ipfilter can only detect that a packet has been sent by a 1661 * host 1662 */ 1663 break; 1664 1665 case TCPS_FIN_WAIT_2: /* 9 */ 1666 /* NOT USED */ 1667 break; 1668 1669 case TCPS_TIME_WAIT: /* 10 */ 1670 /* we're in 2MSL timeout now */ 1671 break; 1672 } 1673} 1674 1675 1676#ifdef IPFILTER_LOG 1677void ipstate_log(is, type) 1678struct ipstate *is; 1679u_int type; 1680{ 1681 struct ipslog ipsl; 1682 void *items[1]; 1683 size_t sizes[1]; 1684 int types[1]; 1685 1686 ipsl.isl_type = type; 1687 ipsl.isl_pkts = is->is_pkts; 1688 ipsl.isl_bytes = is->is_bytes; 1689 ipsl.isl_src = is->is_src; 1690 ipsl.isl_dst = is->is_dst; 1691 ipsl.isl_p = is->is_p; 1692 ipsl.isl_v = is->is_v; 1693 ipsl.isl_flags = is->is_flags; 1694 if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) { 1695 ipsl.isl_sport = is->is_sport; 1696 ipsl.isl_dport = is->is_dport; 1697 if (ipsl.isl_p == IPPROTO_TCP) { 1698 ipsl.isl_state[0] = is->is_state[0]; 1699 ipsl.isl_state[1] = is->is_state[1]; 1700 } 1701 } else if (ipsl.isl_p == IPPROTO_ICMP) 1702 ipsl.isl_itype = is->is_icmp.ics_type; 1703 else { 1704 ipsl.isl_ps.isl_filler[0] = 0; 1705 ipsl.isl_ps.isl_filler[1] = 0; 1706 } 1707 items[0] = &ipsl; 1708 sizes[0] = sizeof(ipsl); 1709 types[0] = 0; 1710 1711 (void) ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1); 1712} 1713#endif 1714 1715 1716#ifdef USE_INET6 1717frentry_t *fr_checkicmp6matchingstate(ip, fin) 1718ip6_t *ip; 1719fr_info_t *fin; 1720{ 1721 register ipstate_t *is, **isp; 1722 register u_short sport, dport; 1723 register u_char pr; 1724 struct icmp6_hdr *ic, *oic; 1725 union i6addr dst, src; 1726 u_short savelen; 1727 fr_info_t ofin; 1728 tcphdr_t *tcp; 1729 frentry_t *fr; 1730 ip6_t *oip; 1731 int type; 1732 u_int hv; 1733 1734 /* 1735 * Does it at least have the return (basic) IP header ? 1736 * Only a basic IP header (no options) should be with 1737 * an ICMP error header. 1738 */ 1739 if ((fin->fin_v != 6) || (fin->fin_plen < ICMP6ERR_MINPKTLEN)) 1740 return NULL; 1741 ic = (struct icmp6_hdr *)fin->fin_dp; 1742 type = ic->icmp6_type; 1743 /* 1744 * If it's not an error type, then return 1745 */ 1746 if ((type != ICMP6_DST_UNREACH) && (type != ICMP6_PACKET_TOO_BIG) && 1747 (type != ICMP6_TIME_EXCEEDED) && (type != ICMP6_PARAM_PROB)) 1748 return NULL; 1749 1750 oip = (ip6_t *)((char *)ic + ICMPERR_ICMPHLEN); 1751 if (fin->fin_plen < sizeof(*oip)) 1752 return NULL; 1753 1754 if (oip->ip6_nxt == IPPROTO_ICMPV6) { 1755 oic = (struct icmp6_hdr *)(oip + 1); 1756 /* 1757 * a ICMP error can only be generated as a result of an 1758 * ICMP query, not as the response on an ICMP error 1759 * 1760 * XXX theoretically ICMP_ECHOREP and the other reply's are 1761 * ICMP query's as well, but adding them here seems strange XXX 1762 */ 1763 if (!(oic->icmp6_type & ICMP6_INFOMSG_MASK)) 1764 return NULL; 1765 1766 /* 1767 * perform a lookup of the ICMP packet in the state table 1768 */ 1769 hv = (pr = oip->ip6_nxt); 1770 src.in6 = oip->ip6_src; 1771 hv += src.in4.s_addr; 1772 dst.in6 = oip->ip6_dst; 1773 hv += dst.in4.s_addr; 1774 hv += oic->icmp6_id; 1775 hv += oic->icmp6_seq; 1776 hv %= fr_statesize; 1777 1778 oip->ip6_plen = ntohs(oip->ip6_plen); 1779 ofin.fin_v = 6; 1780 fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin); 1781 oip->ip6_plen = htons(oip->ip6_plen); 1782 ofin.fin_ifp = fin->fin_ifp; 1783 ofin.fin_out = !fin->fin_out; 1784 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ 1785 1786 READ_ENTER(&ipf_state); 1787 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) 1788 if ((is->is_p == pr) && 1789 (oic->icmp6_id == is->is_icmp.ics_id) && 1790 (oic->icmp6_seq == is->is_icmp.ics_seq) && 1791 fr_matchsrcdst(is, src, dst, &ofin, NULL)) { 1792 /* 1793 * in the state table ICMP query's are stored 1794 * with the type of the corresponding ICMP 1795 * response. Correct here 1796 */ 1797 if (((is->is_type == ICMP6_ECHO_REPLY) && 1798 (oic->icmp6_type == ICMP6_ECHO_REQUEST)) || 1799 (is->is_type - 1 == oic->icmp6_type )) { 1800 ips_stats.iss_hits++; 1801 is->is_pkts++; 1802 is->is_bytes += fin->fin_plen; 1803 return is->is_rule; 1804 } 1805 } 1806 RWLOCK_EXIT(&ipf_state); 1807 1808 return NULL; 1809 }; 1810 1811 if ((oip->ip6_nxt != IPPROTO_TCP) && (oip->ip6_nxt != IPPROTO_UDP)) 1812 return NULL; 1813 tcp = (tcphdr_t *)(oip + 1); 1814 dport = tcp->th_dport; 1815 sport = tcp->th_sport; 1816 1817 hv = (pr = oip->ip6_nxt); 1818 src.in6 = oip->ip6_src; 1819 hv += src.in4.s_addr; 1820 dst.in6 = oip->ip6_dst; 1821 hv += dst.in4.s_addr; 1822 hv += dport; 1823 hv += sport; 1824 hv %= fr_statesize; 1825 /* 1826 * we make an fin entry to be able to feed it to 1827 * matchsrcdst note that not all fields are encessary 1828 * but this is the cleanest way. Note further we fill 1829 * in fin_mp such that if someone uses it we'll get 1830 * a kernel panic. fr_matchsrcdst does not use this. 1831 * 1832 * watch out here, as ip is in host order and oip in network 1833 * order. Any change we make must be undone afterwards. 1834 */ 1835 savelen = oip->ip6_plen; 1836 oip->ip6_plen = ip->ip6_plen - sizeof(*ip) - ICMPERR_ICMPHLEN; 1837 ofin.fin_v = 6; 1838 fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin); 1839 oip->ip6_plen = savelen; 1840 ofin.fin_ifp = fin->fin_ifp; 1841 ofin.fin_out = !fin->fin_out; 1842 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ 1843 READ_ENTER(&ipf_state); 1844 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) { 1845 /* 1846 * Only allow this icmp though if the 1847 * encapsulated packet was allowed through the 1848 * other way around. Note that the minimal amount 1849 * of info present does not allow for checking against 1850 * tcp internals such as seq and ack numbers. 1851 */ 1852 if ((is->is_p == pr) && (is->is_v == 6) && 1853 fr_matchsrcdst(is, src, dst, &ofin, tcp)) { 1854 fr = is->is_rule; 1855 ips_stats.iss_hits++; 1856 /* 1857 * we must swap src and dst here because the icmp 1858 * comes the other way around 1859 */ 1860 is->is_pkts++; 1861 is->is_bytes += fin->fin_plen; 1862 /* 1863 * we deliberately do not touch the timeouts 1864 * for the accompanying state table entry. 1865 * It remains to be seen if that is correct. XXX 1866 */ 1867 RWLOCK_EXIT(&ipf_state); 1868 return fr; 1869 } 1870 } 1871 RWLOCK_EXIT(&ipf_state); 1872 return NULL; 1873} 1874#endif 1875