Crossbow - transition to Mercurial

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 /*
  29  * This file contains consumer routines of the IPv4 forwarding engine
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <sys/stream.h>
  34 #include <sys/stropts.h>
  35 #include <sys/strlog.h>
  36 #include <sys/dlpi.h>
  37 #include <sys/ddi.h>
  38 #include <sys/cmn_err.h>
  39 #include <sys/policy.h>
  40 
  41 #include <sys/systm.h>
  42 #include <sys/strsun.h>
  43 #include <sys/kmem.h>
  44 #include <sys/param.h>
  45 #include <sys/socket.h>
  46 #include <sys/strsubr.h>
  47 #include <sys/pattr.h>
  48 #include <net/if.h>
  49 #include <net/route.h>
  50 #include <netinet/in.h>
  51 #include <net/if_dl.h>
  52 #include <netinet/ip6.h>
  53 #include <netinet/icmp6.h>
  54 
  55 #include <inet/common.h>
  56 #include <inet/mi.h>
  57 #include <inet/mib2.h>
  58 #include <inet/ip.h>
  59 #include <inet/ip_impl.h>
  60 #include <inet/ip6.h>
  61 #include <inet/ip_ndp.h>
  62 #include <inet/arp.h>
  63 #include <inet/ip_if.h>
  64 #include <inet/ip_ire.h>
  65 #include <inet/ip_ftable.h>
  66 #include <inet/ip_rts.h>
  67 #include <inet/nd.h>
  68 
  69 #include <net/pfkeyv2.h>
  70 #include <inet/ipsec_info.h>
  71 #include <inet/sadb.h>
  72 #include <sys/kmem.h>
  73 #include <inet/tcp.h>
  74 #include <inet/ipclassifier.h>
  75 #include <sys/zone.h>
  76 #include <net/radix.h>
  77 #include <sys/tsol/label.h>
  78 #include <sys/tsol/tnet.h>
  79 
  80 #define IS_DEFAULT_ROUTE(ire)   \
  81         (((ire)->ire_type & IRE_DEFAULT) || \
  82             (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
  83 
  84 /*
  85  * structure for passing args between ire_ftable_lookup and ire_find_best_route
  86  */
  87 typedef struct ire_ftable_args_s {
  88         ipaddr_t        ift_addr;
  89         ipaddr_t        ift_mask;
  90         ipaddr_t        ift_gateway;
  91         int             ift_type;
  92         const ipif_t            *ift_ipif;
  93         zoneid_t        ift_zoneid;
  94         uint32_t        ift_ihandle;
  95         const ts_label_t        *ift_tsl;
  96         int             ift_flags;
  97         ire_t           *ift_best_ire;
  98 } ire_ftable_args_t;
  99 
 100 static ire_t    *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
 101 static ire_t    *ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *,
 102     ip_stack_t *);
 103 static void             ire_del_host_redir(ire_t *, char *);
 104 static boolean_t        ire_find_best_route(struct radix_node *, void *);
 105 static int      ip_send_align_hcksum_flags(mblk_t *, ill_t *);
 106 static ire_t    *ire_ftable_lookup_simple(ipaddr_t,
 107         ire_t **, zoneid_t,  int, ip_stack_t *);
 108 
 109 /*
 110  * Lookup a route in forwarding table. A specific lookup is indicated by
 111  * passing the required parameters and indicating the match required in the
 112  * flag field.
 113  *
 114  * Looking for default route can be done in three ways
 115  * 1) pass mask as 0 and set MATCH_IRE_MASK in flags field
 116  *    along with other matches.
 117  * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
 118  *    field along with other matches.
 119  * 3) if the destination and mask are passed as zeros.
 120  *
 121  * A request to return a default route if no route
 122  * is found, can be specified by setting MATCH_IRE_DEFAULT
 123  * in flags.
 124  *
 125  * It does not support recursion more than one level. It
 126  * will do recursive lookup only when the lookup maps to
 127  * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
 128  *
 129  * If the routing table is setup to allow more than one level
 130  * of recursion, the cleaning up cache table will not work resulting
 131  * in invalid routing.
 132  *
 133  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 134  *
 135  * NOTE : When this function returns NULL, pire has already been released.
 136  *        pire is valid only when this function successfully returns an
 137  *        ire.
 138  */
 139 ire_t *
 140 ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
 141     int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
 142     uint32_t ihandle, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
 143 {
 144         ire_t *ire = NULL;
 145         ipaddr_t gw_addr;
 146         struct rt_sockaddr rdst, rmask;
 147         struct rt_entry *rt;
 148         ire_ftable_args_t margs;
 149         boolean_t found_incomplete = B_FALSE;
 150 
 151         ASSERT(ipif == NULL || !ipif->ipif_isv6);
 152 
 153         /*
 154          * When we return NULL from this function, we should make
 155          * sure that *pire is NULL so that the callers will not
 156          * wrongly REFRELE the pire.
 157          */
 158         if (pire != NULL)
 159                 *pire = NULL;
 160         /*
 161          * ire_match_args() will dereference ipif MATCH_IRE_SRC or
 162          * MATCH_IRE_ILL is set.
 163          */
 164         if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
 165             (ipif == NULL))
 166                 return (NULL);
 167 
 168         (void) memset(&rdst, 0, sizeof (rdst));
 169         rdst.rt_sin_len = sizeof (rdst);
 170         rdst.rt_sin_family = AF_INET;
 171         rdst.rt_sin_addr.s_addr = addr;
 172 
 173         (void) memset(&rmask, 0, sizeof (rmask));
 174         rmask.rt_sin_len = sizeof (rmask);
 175         rmask.rt_sin_family = AF_INET;
 176         rmask.rt_sin_addr.s_addr = mask;
 177 
 178         (void) memset(&margs, 0, sizeof (margs));
 179         margs.ift_addr = addr;
 180         margs.ift_mask = mask;
 181         margs.ift_gateway = gateway;
 182         margs.ift_type = type;
 183         margs.ift_ipif = ipif;
 184         margs.ift_zoneid = zoneid;
 185         margs.ift_ihandle = ihandle;
 186         margs.ift_tsl = tsl;
 187         margs.ift_flags = flags;
 188 
 189         /*
 190          * The flags argument passed to ire_ftable_lookup may cause the
 191          * search to return, not the longest matching prefix, but the
 192          * "best matching prefix", i.e., the longest prefix that also
 193          * satisfies constraints imposed via the permutation of flags
 194          * passed in. To achieve this, we invoke ire_match_args() on
 195          * each matching leaf in the  radix tree. ire_match_args is
 196          * invoked by the callback function ire_find_best_route()
 197          * We hold the global tree lock in read mode when calling
 198          * rn_match_args.Before dropping the global tree lock, ensure
 199          * that the radix node can't be deleted by incrementing ire_refcnt.
 200          */
 201         RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
 202         rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
 203             ipst->ips_ip_ftable, ire_find_best_route, &margs);
 204         ire = margs.ift_best_ire;
 205         RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
 206 
 207         if (rt == NULL) {
 208                 return (NULL);
 209         } else {
 210                 ASSERT(ire != NULL);
 211         }
 212 
 213         DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
 214 
 215         if (!IS_DEFAULT_ROUTE(ire))
 216                 goto found_ire_held;
 217         /*
 218          * If default route is found, see if default matching criteria
 219          * are satisfied.
 220          */
 221         if (flags & MATCH_IRE_MASK) {
 222                 /*
 223                  * we were asked to match a 0 mask, and came back with
 224                  * a default route. Ok to return it.
 225                  */
 226                 goto found_default_ire;
 227         }
 228         if ((flags & MATCH_IRE_TYPE) &&
 229             (type & (IRE_DEFAULT | IRE_INTERFACE))) {
 230                 /*
 231                  * we were asked to match a default ire type. Ok to return it.
 232                  */
 233                 goto found_default_ire;
 234         }
 235         if (flags & MATCH_IRE_DEFAULT) {
 236                 goto found_default_ire;
 237         }
 238         /*
 239          * we found a default route, but default matching criteria
 240          * are not specified and we are not explicitly looking for
 241          * default.
 242          */
 243         IRE_REFRELE(ire);
 244         return (NULL);
 245 found_default_ire:
 246         /*
 247          * round-robin only if we have more than one route in the bucket.
 248          */
 249         if ((ire->ire_bucket->irb_ire_cnt > 1) &&
 250             IS_DEFAULT_ROUTE(ire) &&
 251             ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
 252             MATCH_IRE_DEFAULT)) {
 253                 ire_t *next_ire;
 254 
 255                 next_ire = ire_round_robin(ire->ire_bucket, zoneid, &margs,
 256                     ipst);
 257                 IRE_REFRELE(ire);
 258                 if (next_ire != NULL) {
 259                         ire = next_ire;
 260                 } else {
 261                         /* no route */
 262                         return (NULL);
 263                 }
 264         }
 265 found_ire_held:
 266         if ((flags & MATCH_IRE_RJ_BHOLE) &&
 267             (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
 268                 return (ire);
 269         }
 270         /*
 271          * At this point, IRE that was found must be an IRE_FORWARDTABLE
 272          * type.  If this is a recursive lookup and an IRE_INTERFACE type was
 273          * found, return that.  If it was some other IRE_FORWARDTABLE type of
 274          * IRE (one of the prefix types), then it is necessary to fill in the
 275          * parent IRE pointed to by pire, and then lookup the gateway address of
 276          * the parent.  For backwards compatiblity, if this lookup returns an
 277          * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
 278          * of lookup is done.
 279          */
 280         if (flags & MATCH_IRE_RECURSIVE) {
 281                 ipif_t  *gw_ipif;
 282                 int match_flags = MATCH_IRE_DSTONLY;
 283                 ire_t *save_ire;
 284 
 285                 if (ire->ire_type & IRE_INTERFACE)
 286                         return (ire);
 287                 if (pire != NULL)
 288                         *pire = ire;
 289                 /*
 290                  * If we can't find an IRE_INTERFACE or the caller has not
 291                  * asked for pire, we need to REFRELE the save_ire.
 292                  */
 293                 save_ire = ire;
 294 
 295                 /*
 296                  * Currently MATCH_IRE_ILL is never used with
 297                  * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
 298                  * sending out packets as MATCH_IRE_ILL is used only
 299                  * for communicating with on-link hosts. We can't assert
 300                  * that here as RTM_GET calls this function with
 301                  * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
 302                  * We have already used the MATCH_IRE_ILL in determining
 303                  * the right prefix route at this point. To match the
 304                  * behavior of how we locate routes while sending out
 305                  * packets, we don't want to use MATCH_IRE_ILL below
 306                  * while locating the interface route.
 307                  *
 308                  * ire_ftable_lookup may end up with an incomplete IRE_CACHE
 309                  * entry for the gateway (i.e., one for which the
 310                  * ire_nce->nce_state is not yet ND_REACHABLE). If the caller
 311                  * has specified MATCH_IRE_COMPLETE, such entries will not
 312                  * be returned; instead, we return the IF_RESOLVER ire.
 313                  */
 314                 if (ire->ire_ipif != NULL)
 315                         match_flags |= MATCH_IRE_ILL_GROUP;
 316 
 317                 ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0,
 318                     ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
 319                 DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
 320                     (ire_t *), save_ire);
 321                 if (ire == NULL ||
 322                     ((ire->ire_type & IRE_CACHE) && ire->ire_nce &&
 323                     ire->ire_nce->nce_state != ND_REACHABLE &&
 324                     (flags & MATCH_IRE_COMPLETE))) {
 325                         /*
 326                          * Do not release the parent ire if MATCH_IRE_PARENT
 327                          * is set. Also return it via ire.
 328                          */
 329                         if (ire != NULL) {
 330                                 ire_refrele(ire);
 331                                 ire = NULL;
 332                                 found_incomplete = B_TRUE;
 333                         }
 334                         if (flags & MATCH_IRE_PARENT) {
 335                                 if (pire != NULL) {
 336                                         /*
 337                                          * Need an extra REFHOLD, if the parent
 338                                          * ire is returned via both ire and
 339                                          * pire.
 340                                          */
 341                                         IRE_REFHOLD(save_ire);
 342                                 }
 343                                 ire = save_ire;
 344                         } else {
 345                                 ire_refrele(save_ire);
 346                                 if (pire != NULL)
 347                                         *pire = NULL;
 348                         }
 349                         if (!found_incomplete)
 350                                 return (ire);
 351                 }
 352                 if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
 353                         /*
 354                          * If the caller did not ask for pire, release
 355                          * it now.
 356                          */
 357                         if (pire == NULL) {
 358                                 ire_refrele(save_ire);
 359                         }
 360                         return (ire);
 361                 }
 362                 match_flags |= MATCH_IRE_TYPE;
 363                 gw_addr = ire->ire_gateway_addr;
 364                 gw_ipif = ire->ire_ipif;
 365                 ire_refrele(ire);
 366                 ire = ire_route_lookup(gw_addr, 0, 0,
 367                     (found_incomplete? IRE_INTERFACE :
 368                     (IRE_CACHETABLE | IRE_INTERFACE)),
 369                     gw_ipif, NULL, zoneid, tsl, match_flags, ipst);
 370                 DTRACE_PROBE2(ftable__route__lookup2, (ire_t *), ire,
 371                     (ire_t *), save_ire);
 372                 if (ire == NULL ||
 373                     ((ire->ire_type & IRE_CACHE) && ire->ire_nce &&
 374                     ire->ire_nce->nce_state != ND_REACHABLE &&
 375                     (flags & MATCH_IRE_COMPLETE))) {
 376                         /*
 377                          * Do not release the parent ire if MATCH_IRE_PARENT
 378                          * is set. Also return it via ire.
 379                          */
 380                         if (ire != NULL) {
 381                                 ire_refrele(ire);
 382                                 ire = NULL;
 383                         }
 384                         if (flags & MATCH_IRE_PARENT) {
 385                                 if (pire != NULL) {
 386                                         /*
 387                                          * Need an extra REFHOLD, if the
 388                                          * parent ire is returned via both
 389                                          * ire and pire.
 390                                          */
 391                                         IRE_REFHOLD(save_ire);
 392                                 }
 393                                 ire = save_ire;
 394                         } else {
 395                                 ire_refrele(save_ire);
 396                                 if (pire != NULL)
 397                                         *pire = NULL;
 398                         }
 399                         return (ire);
 400                 } else if (pire == NULL) {
 401                         /*
 402                          * If the caller did not ask for pire, release
 403                          * it now.
 404                          */
 405                         ire_refrele(save_ire);
 406                 }
 407                 return (ire);
 408         }
 409         ASSERT(pire == NULL || *pire == NULL);
 410         return (ire);
 411 }
 412 
 413 /*
 414  * This function is called by
 415  * ip_fast_forward->ire_forward_simple
 416  * The optimizations of this function over ire_ftable_lookup are:
 417  *      o removing unnecessary flag matching
 418  *      o doing longest prefix match instead of overloading it further
 419  *        with the unnecessary "best_prefix_match"
 420  *      o Does not do round robin of default route for every packet
 421  *      o inlines code of ire_ctable_lookup to look for nexthop cache
 422  *        entry before calling ire_route_lookup
 423  */
 424 static ire_t *
 425 ire_ftable_lookup_simple(ipaddr_t addr,
 426     ire_t **pire, zoneid_t zoneid, int flags,
 427     ip_stack_t *ipst)
 428 {
 429         ire_t *ire = NULL;
 430         ire_t *tmp_ire = NULL;
 431         struct rt_sockaddr rdst;
 432         struct rt_entry *rt;
 433         irb_t *irb_ptr;
 434         ire_t *save_ire;
 435         int match_flags;
 436 
 437         rdst.rt_sin_len = sizeof (rdst);
 438         rdst.rt_sin_family = AF_INET;
 439         rdst.rt_sin_addr.s_addr = addr;
 440 
 441         /*
 442          * This is basically inlining  a simpler version of ire_match_args
 443          */
 444         RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
 445 
 446         rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
 447             ipst->ips_ip_ftable, NULL, NULL);
 448 
 449         if (rt == NULL) {
 450                 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
 451                 return (NULL);
 452         }
 453         irb_ptr = &rt->rt_irb;
 454         if (irb_ptr == NULL || irb_ptr->irb_ire_cnt == 0) {
 455                 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
 456                 return (NULL);
 457         }
 458 
 459         rw_enter(&irb_ptr->irb_lock, RW_READER);
 460         ire = irb_ptr->irb_ire;
 461         if (ire == NULL || (ire->ire_marks & IRE_MARK_CONDEMNED)) {
 462                 rw_exit(&irb_ptr->irb_lock);
 463                 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
 464                 return (NULL);
 465         }
 466         /* we have a ire that matches */
 467         if (ire != NULL)
 468                 IRE_REFHOLD(ire);
 469         rw_exit(&irb_ptr->irb_lock);
 470         RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
 471 
 472         if ((flags & MATCH_IRE_RJ_BHOLE) &&
 473             (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
 474                 return (ire);
 475         }
 476         /*
 477          * At this point, IRE that was found must be an IRE_FORWARDTABLE
 478          * type.  If this is a recursive lookup and an IRE_INTERFACE type was
 479          * found, return that.  If it was some other IRE_FORWARDTABLE type of
 480          * IRE (one of the prefix types), then it is necessary to fill in the
 481          * parent IRE pointed to by pire, and then lookup the gateway address of
 482          * the parent.  For backwards compatiblity, if this lookup returns an
 483          * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
 484          * of lookup is done.
 485          */
 486         match_flags = MATCH_IRE_DSTONLY;
 487 
 488         if (ire->ire_type & IRE_INTERFACE)
 489                 return (ire);
 490         *pire = ire;
 491         /*
 492          * If we can't find an IRE_INTERFACE or the caller has not
 493          * asked for pire, we need to REFRELE the save_ire.
 494          */
 495         save_ire = ire;
 496 
 497         /*
 498          * Currently MATCH_IRE_ILL is never used with
 499          * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
 500          * sending out packets as MATCH_IRE_ILL is used only
 501          * for communicating with on-link hosts. We can't assert
 502          * that here as RTM_GET calls this function with
 503          * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
 504          * We have already used the MATCH_IRE_ILL in determining
 505          * the right prefix route at this point. To match the
 506          * behavior of how we locate routes while sending out
 507          * packets, we don't want to use MATCH_IRE_ILL below
 508          * while locating the interface route.
 509          *
 510          * ire_ftable_lookup may end up with an incomplete IRE_CACHE
 511          * entry for the gateway (i.e., one for which the
 512          * ire_nce->nce_state is not yet ND_REACHABLE). If the caller
 513          * has specified MATCH_IRE_COMPLETE, such entries will not
 514          * be returned; instead, we return the IF_RESOLVER ire.
 515          */
 516 
 517         if (ire->ire_ipif == NULL) {
 518                 tmp_ire = ire;
 519                 /*
 520                  * Look to see if the nexthop entry is in the
 521                  * cachetable (I am inlining a simpler ire_cache_lookup
 522                  * here).
 523                  */
 524                 ire = ire_cache_lookup_simple(ire->ire_gateway_addr, ipst);
 525                 if (ire == NULL) {
 526                         /* Try ire_route_lookup */
 527                         ire = tmp_ire;
 528                 } else {
 529                         goto solved;
 530                 }
 531         }
 532         if (ire->ire_ipif != NULL)
 533                 match_flags |= MATCH_IRE_ILL_GROUP;
 534 
 535         ire = ire_route_lookup(ire->ire_gateway_addr, 0,
 536             0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst);
 537 solved:
 538         DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
 539             (ire_t *), save_ire);
 540         if (ire == NULL) {
 541                 /*
 542                  * Do not release the parent ire if MATCH_IRE_PARENT
 543                  * is set. Also return it via ire.
 544                  */
 545                 ire_refrele(save_ire);
 546                 *pire = NULL;
 547                 return (ire);
 548         }
 549         if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
 550                 /*
 551                  * If the caller did not ask for pire, release
 552                  * it now.
 553                  */
 554                 if (pire == NULL) {
 555                         ire_refrele(save_ire);
 556                 }
 557         }
 558         return (ire);
 559 }
 560 
 561 /*
 562  * Find an IRE_OFFSUBNET IRE entry for the multicast address 'group'
 563  * that goes through 'ipif'. As a fallback, a route that goes through
 564  * ipif->ipif_ill can be returned.
 565  */
 566 ire_t *
 567 ipif_lookup_multi_ire(ipif_t *ipif, ipaddr_t group)
 568 {
 569         ire_t   *ire;
 570         ire_t   *save_ire = NULL;
 571         ire_t   *gw_ire;
 572         irb_t   *irb;
 573         ipaddr_t gw_addr;
 574         int     match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
 575         ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
 576 
 577         ASSERT(CLASSD(group));
 578 
 579         ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
 580             NULL, MATCH_IRE_DEFAULT, ipst);
 581 
 582         if (ire == NULL)
 583                 return (NULL);
 584 
 585         irb = ire->ire_bucket;
 586         ASSERT(irb);
 587 
 588         IRB_REFHOLD(irb);
 589         ire_refrele(ire);
 590         for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
 591                 if (ire->ire_addr != group ||
 592                     ipif->ipif_zoneid != ire->ire_zoneid &&
 593                     ire->ire_zoneid != ALL_ZONES) {
 594                         continue;
 595                 }
 596 
 597                 switch (ire->ire_type) {
 598                 case IRE_DEFAULT:
 599                 case IRE_PREFIX:
 600                 case IRE_HOST:
 601                         gw_addr = ire->ire_gateway_addr;
 602                         gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
 603                             ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
 604 
 605                         if (gw_ire != NULL) {
 606                                 if (save_ire != NULL) {
 607                                         ire_refrele(save_ire);
 608                                 }
 609                                 IRE_REFHOLD(ire);
 610                                 if (gw_ire->ire_ipif == ipif) {
 611                                         ire_refrele(gw_ire);
 612 
 613                                         IRB_REFRELE(irb);
 614                                         return (ire);
 615                                 }
 616                                 ire_refrele(gw_ire);
 617                                 save_ire = ire;
 618                         }
 619                         break;
 620                 case IRE_IF_NORESOLVER:
 621                 case IRE_IF_RESOLVER:
 622                         if (ire->ire_ipif == ipif) {
 623                                 if (save_ire != NULL) {
 624                                         ire_refrele(save_ire);
 625                                 }
 626                                 IRE_REFHOLD(ire);
 627 
 628                                 IRB_REFRELE(irb);
 629                                 return (ire);
 630                         }
 631                         break;
 632                 }
 633         }
 634         IRB_REFRELE(irb);
 635 
 636         return (save_ire);
 637 }
 638 
 639 /*
 640  * Find an IRE_INTERFACE for the multicast group.
 641  * Allows different routes for multicast addresses
 642  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
 643  * which point at different interfaces. This is used when IP_MULTICAST_IF
 644  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
 645  * specify the interface to join on.
 646  *
 647  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 648  */
 649 ire_t *
 650 ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
 651 {
 652         ire_t   *ire;
 653         ipif_t  *ipif = NULL;
 654         int     match_flags = MATCH_IRE_TYPE;
 655         ipaddr_t gw_addr;
 656 
 657         ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, zoneid,
 658             0, NULL, MATCH_IRE_DEFAULT, ipst);
 659 
 660         /* We search a resolvable ire in case of multirouting. */
 661         if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
 662                 ire_t *cire = NULL;
 663                 /*
 664                  * If the route is not resolvable, the looked up ire
 665                  * may be changed here. In that case, ire_multirt_lookup()
 666                  * IRE_REFRELE the original ire and change it.
 667                  */
 668                 (void) ire_multirt_lookup(&cire, &ire, MULTIRT_CACHEGW,
 669                     NULL, ipst);
 670                 if (cire != NULL)
 671                         ire_refrele(cire);
 672         }
 673         if (ire == NULL)
 674                 return (NULL);
 675         /*
 676          * Make sure we follow ire_ipif.
 677          *
 678          * We need to determine the interface route through
 679          * which the gateway will be reached. We don't really
 680          * care which interface is picked if the interface is
 681          * part of a group.
 682          */
 683         if (ire->ire_ipif != NULL) {
 684                 ipif = ire->ire_ipif;
 685                 match_flags |= MATCH_IRE_ILL_GROUP;
 686         }
 687 
 688         switch (ire->ire_type) {
 689         case IRE_DEFAULT:
 690         case IRE_PREFIX:
 691         case IRE_HOST:
 692                 gw_addr = ire->ire_gateway_addr;
 693                 ire_refrele(ire);
 694                 ire = ire_ftable_lookup(gw_addr, 0, 0,
 695                     IRE_INTERFACE, ipif, NULL, zoneid, 0,
 696                     NULL, match_flags, ipst);
 697                 return (ire);
 698         case IRE_IF_NORESOLVER:
 699         case IRE_IF_RESOLVER:
 700                 return (ire);
 701         default:
 702                 ire_refrele(ire);
 703                 return (NULL);
 704         }
 705 }
 706 
 707 /*
 708  * Delete the passed in ire if the gateway addr matches
 709  */
 710 void
 711 ire_del_host_redir(ire_t *ire, char *gateway)
 712 {
 713         if ((ire->ire_flags & RTF_DYNAMIC) &&
 714             (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
 715                 ire_delete(ire);
 716 }
 717 
 718 /*
 719  * Search for all HOST REDIRECT routes that are
 720  * pointing at the specified gateway and
 721  * delete them. This routine is called only
 722  * when a default gateway is going away.
 723  */
 724 void
 725 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
 726 {
 727         struct rtfuncarg rtfarg;
 728 
 729         (void) memset(&rtfarg, 0, sizeof (rtfarg));
 730         rtfarg.rt_func = ire_del_host_redir;
 731         rtfarg.rt_arg = (void *)&gateway;
 732         (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
 733             rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
 734 }
 735 
 736 struct ihandle_arg {
 737         uint32_t ihandle;
 738         ire_t    *ire;
 739 };
 740 
 741 static int
 742 ire_ihandle_onlink_match(struct radix_node *rn, void *arg)
 743 {
 744         struct rt_entry *rt;
 745         irb_t *irb;
 746         ire_t *ire;
 747         struct ihandle_arg *ih = arg;
 748 
 749         rt = (struct rt_entry *)rn;
 750         ASSERT(rt != NULL);
 751         irb = &rt->rt_irb;
 752         for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
 753                 if ((ire->ire_type & IRE_INTERFACE) &&
 754                     (ire->ire_ihandle == ih->ihandle)) {
 755                         ih->ire = ire;
 756                         IRE_REFHOLD(ire);
 757                         return (1);
 758                 }
 759         }
 760         return (0);
 761 }
 762 
 763 /*
 764  * Locate the interface ire that is tied to the cache ire 'cire' via
 765  * cire->ire_ihandle.
 766  *
 767  * We are trying to create the cache ire for an onlink destn. or
 768  * gateway in 'cire'. We are called from ire_add_v4() in the IRE_IF_RESOLVER
 769  * case, after the ire has come back from ARP.
 770  */
 771 ire_t *
 772 ire_ihandle_lookup_onlink(ire_t *cire)
 773 {
 774         ire_t   *ire;
 775         int     match_flags;
 776         struct ihandle_arg ih;
 777         ip_stack_t *ipst;
 778 
 779         ASSERT(cire != NULL);
 780         ipst = cire->ire_ipst;
 781 
 782         /*
 783          * We don't need to specify the zoneid to ire_ftable_lookup() below
 784          * because the ihandle refers to an ipif which can be in only one zone.
 785          */
 786         match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
 787         /*
 788          * We know that the mask of the interface ire equals cire->ire_cmask.
 789          * (When ip_newroute() created 'cire' for an on-link destn. it set its
 790          * cmask from the interface ire's mask)
 791          */
 792         ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
 793             IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
 794             NULL, match_flags, ipst);
 795         if (ire != NULL)
 796                 return (ire);
 797         /*
 798          * If we didn't find an interface ire above, we can't declare failure.
 799          * For backwards compatibility, we need to support prefix routes
 800          * pointing to next hop gateways that are not on-link.
 801          *
 802          * In the resolver/noresolver case, ip_newroute() thinks it is creating
 803          * the cache ire for an onlink destination in 'cire'. But 'cire' is
 804          * not actually onlink, because ire_ftable_lookup() cheated it, by
 805          * doing ire_route_lookup() twice and returning an interface ire.
 806          *
 807          * Eg. default  -       gw1                     (line 1)
 808          *      gw1     -       gw2                     (line 2)
 809          *      gw2     -       hme0                    (line 3)
 810          *
 811          * In the above example, ip_newroute() tried to create the cache ire
 812          * 'cire' for gw1, based on the interface route in line 3. The
 813          * ire_ftable_lookup() above fails, because there is no interface route
 814          * to reach gw1. (it is gw2). We fall thru below.
 815          *
 816          * Do a brute force search based on the ihandle in a subset of the
 817          * forwarding tables, corresponding to cire->ire_cmask. Otherwise
 818          * things become very complex, since we don't have 'pire' in this
 819          * case. (Also note that this method is not possible in the offlink
 820          * case because we don't know the mask)
 821          */
 822         (void) memset(&ih, 0, sizeof (ih));
 823         ih.ihandle = cire->ire_ihandle;
 824         (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
 825             ire_ihandle_onlink_match, &ih, irb_refhold_rn, irb_refrele_rn);
 826         return (ih.ire);
 827 }
 828 
 829 /*
 830  * IRE iterator used by ire_ftable_lookup[_v6]() to process multiple default
 831  * routes. Given a starting point in the hash list (ire_origin), walk the IREs
 832  * in the bucket skipping default interface routes and deleted entries.
 833  * Returns the next IRE (unheld), or NULL when we're back to the starting point.
 834  * Assumes that the caller holds a reference on the IRE bucket.
 835  */
 836 ire_t *
 837 ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin)
 838 {
 839         ASSERT(ire_origin->ire_bucket != NULL);
 840         ASSERT(ire != NULL);
 841 
 842         do {
 843                 ire = ire->ire_next;
 844                 if (ire == NULL)
 845                         ire = ire_origin->ire_bucket->irb_ire;
 846                 if (ire == ire_origin)
 847                         return (NULL);
 848         } while ((ire->ire_type & IRE_INTERFACE) ||
 849             (ire->ire_marks & IRE_MARK_CONDEMNED));
 850         ASSERT(ire != NULL);
 851         return (ire);
 852 }
 853 
 854 static ipif_t *
 855 ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill,
 856     int zoneid, ushort_t *marks)
 857 {
 858         ipif_t *src_ipif;
 859         ip_stack_t *ipst = dst_ill->ill_ipst;
 860 
 861         /*
 862          * Pick the best source address from dst_ill.
 863          *
 864          * 1) If it is part of a multipathing group, we would
 865          *    like to spread the inbound packets across different
 866          *    interfaces. ipif_select_source picks a random source
 867          *    across the different ills in the group.
 868          *
 869          * 2) If it is not part of a multipathing group, we try
 870          *    to pick the source address from the destination
 871          *    route. Clustering assumes that when we have multiple
 872          *    prefixes hosted on an interface, the prefix of the
 873          *    source address matches the prefix of the destination
 874          *    route. We do this only if the address is not
 875          *    DEPRECATED.
 876          *
 877          * 3) If the conn is in a different zone than the ire, we
 878          *    need to pick a source address from the right zone.
 879          *
 880          * NOTE : If we hit case (1) above, the prefix of the source
 881          *        address picked may not match the prefix of the
 882          *        destination routes prefix as ipif_select_source
 883          *        does not look at "dst" while picking a source
 884          *        address.
 885          *        If we want the same behavior as (2), we will need
 886          *        to change the behavior of ipif_select_source.
 887          */
 888 
 889         if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
 890                 /*
 891                  * The RTF_SETSRC flag is set in the parent ire (sire).
 892                  * Check that the ipif matching the requested source
 893                  * address still exists.
 894                  */
 895                 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL,
 896                     zoneid, NULL, NULL, NULL, NULL, ipst);
 897                 return (src_ipif);
 898         }
 899         *marks |= IRE_MARK_USESRC_CHECK;
 900         if ((dst_ill->ill_group != NULL) ||
 901             (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
 902             (dst_ill->ill_usesrc_ifindex != 0)) {
 903                 src_ipif = ipif_select_source(dst_ill, dst, zoneid);
 904                 if (src_ipif == NULL)
 905                         return (NULL);
 906 
 907         } else {
 908                 src_ipif = ire->ire_ipif;
 909                 ASSERT(src_ipif != NULL);
 910                 /* hold src_ipif for uniformity */
 911                 ipif_refhold(src_ipif);
 912         }
 913         return (src_ipif);
 914 }
 915 
/*
 * This function is called by ip_rput_noire() and ip_fast_forward()
 * to resolve the route of an incoming packet that needs to be forwarded.
 * If the ire of the nexthop is not already in the cachetable, this
 * routine will insert it into the table, but won't trigger ARP resolution
 * yet. Thus, unlike ip_newroute, this function adds incomplete ires to
 * the cachetable. ARP resolution for these ires is delayed until
 * after all of the packet processing is completed and it is ready to
 * be sent out on the wire. Eventually, the packet transmit routine
 * ip_xmit_v4() attempts to send a packet to the driver. If it finds
 * that there is no link layer information, it will do the arp
 * resolution and queue the packet in ire->ire_nce->nce_qd_mp and
 * then send it out once the arp resolution is over
 * (see ip_xmit_v4()->ire_arpresolve()). This scheme is similar to
 * the model of BSD/SunOS 4.
 *
 * In future, the insertion of incomplete ires in the cachetable should
 * be implemented in hostpath as well, as doing so will greatly reduce
 * the existing complexity for code paths that depend on the context of
 * the sender (such as IPsec).
 *
 * Thus this scheme of adding incomplete ires in cachetable in forwarding
 * path can be used as a template for simplifying the hostpath.
 *
 * Arguments:
 *   dst           - destination address to route
 *   ret_action    - out: Forward_ok, Forward_check_multirt,
 *                   Forward_ret_icmp_err or Forward_blackhole
 *   supplied_ire  - if non-NULL, skip the table lookup and build the
 *                   cache entry from this ire
 *   supplied_sire - parent ire that goes with supplied_ire, may be NULL
 *   tsl           - label passed to ire_ftable_lookup()
 *   ipst          - IP stack instance
 *
 * Returns the ire to forward with, or NULL. NULL comes with
 * Forward_check_multirt when CGTP multirouting was detected, with
 * Forward_ret_icmp_err / Forward_blackhole on the error exit, and with
 * Forward_ok when ire_create() fails (that path only breaks out of the
 * switch). The function releases its references on sire and on the
 * interface ire it consumed, including caller-supplied ones.
 */

ire_t *
ire_forward(ipaddr_t dst, enum ire_forward_action *ret_action,
    ire_t *supplied_ire, ire_t *supplied_sire, const struct ts_label_s *tsl,
    ip_stack_t *ipst)
{
        ipaddr_t gw = 0;
        ire_t   *ire = NULL;
        ire_t   *sire = NULL, *save_ire;
        ill_t *dst_ill = NULL;
        int error;
        zoneid_t zoneid;
        ipif_t *src_ipif = NULL;
        mblk_t *res_mp;
        ushort_t ire_marks = 0;
        tsol_gcgrp_t *gcgrp = NULL;
        tsol_gcgrp_addr_t ga;

        zoneid = GLOBAL_ZONEID;

        if (supplied_ire != NULL) {
                /* We have arrived here from ipfil_sendpkt */
                ire = supplied_ire;
                sire = supplied_sire;
                goto create_irecache;
        }

        ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0,
            tsl, MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
            MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT|MATCH_IRE_SECATTR, ipst);

        if (ire == NULL) {
                /* No route at all: report the miss, then take the error exit */
                ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
                goto icmp_err_ret;
        }

        /*
         * If we encounter CGTP, we should have the caller use
         * ip_newroute to resolve multirt instead of this function.
         * CGTP specs explicitly state that it can't be used with routers.
         * This essentially prevents insertion of incomplete RTF_MULTIRT
         * ires in cachetable.
         */
        if (ipst->ips_ip_cgtp_filter &&
            ((ire->ire_flags & RTF_MULTIRT) ||
            ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) {
                ip3dbg(("ire_forward: packet is to be multirouted- "
                    "handing it to ip_newroute\n"));
                if (sire != NULL)
                        ire_refrele(sire);
                ire_refrele(ire);
                /*
                 * Inform caller about encountering of multirt so that
                 * ip_newroute() can be called.
                 */
                *ret_action = Forward_check_multirt;
                return (NULL);
        }

        /*
         * Verify that the returned IRE does not have either
         * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
         * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
         */
        if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
            (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) {
                ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n",
                    (void *)ire));
                goto icmp_err_ret;
        }

        /*
         * If we already have a fully resolved IRE CACHE of the
         * nexthop router, just hand over the cache entry
         * and we are done.
         */

        if (ire->ire_type & IRE_CACHE) {

                /*
                 * If we are using this ire cache entry as a
                 * gateway to forward packets, chances are we
                 * will be using it again. So turn off
                 * the temporary flag, thus reducing its
                 * chances of getting deleted frequently.
                 */
                if (ire->ire_marks & IRE_MARK_TEMPORARY) {
                        irb_t *irb = ire->ire_bucket;
                        rw_enter(&irb->irb_lock, RW_WRITER);
                        /*
                         * We need to recheck for IRE_MARK_TEMPORARY after
                         * acquiring the lock in order to guarantee that
                         * irb_tmp_ire_cnt is decremented only once per
                         * ire.
                         */
                        if (ire->ire_marks & IRE_MARK_TEMPORARY) {
                                ire->ire_marks &= ~IRE_MARK_TEMPORARY;
                                irb->irb_tmp_ire_cnt--;
                        }
                        rw_exit(&irb->irb_lock);
                }

                if (sire != NULL) {
                        UPDATE_OB_PKT_COUNT(sire);
                        sire->ire_last_used_time = lbolt;
                        ire_refrele(sire);
                }
                *ret_action = Forward_ok;
                return (ire);
        }
create_irecache:
        /*
         * Increment the ire_ob_pkt_count field for ire if it is an
         * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
         * increment the same for the parent IRE, sire, if it is some
         * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST).
         */
        if ((ire->ire_type & IRE_INTERFACE) != 0) {
                UPDATE_OB_PKT_COUNT(ire);
                ire->ire_last_used_time = lbolt;
        }

        /*
         * sire, when present, must be neither an IRE_CACHETABLE nor an
         * IRE_INTERFACE type (see the ASSERT); its gateway address
         * becomes the nexthop to resolve.
         */
        if (sire != NULL) {
                gw = sire->ire_gateway_addr;
                ASSERT((sire->ire_type &
                    (IRE_CACHETABLE | IRE_INTERFACE)) == 0);
                UPDATE_OB_PKT_COUNT(sire);
                sire->ire_last_used_time = lbolt;
        }

        /* Obtain dst_ill */
        dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
        if (dst_ill == NULL) {
                ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
                    (void *)ire));
                goto icmp_err_ret;
        }

        ASSERT(src_ipif == NULL);
        /* Now obtain the src_ipif */
        src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
            zoneid, &ire_marks);
        if (src_ipif == NULL)
                goto icmp_err_ret;

        switch (ire->ire_type) {
        case IRE_IF_NORESOLVER:
                /* create ire_cache for ire_addr endpoint */
                if (dst_ill->ill_phys_addr_length != IP_ADDR_LEN &&
                    dst_ill->ill_resolver_mp == NULL) {
                        ip1dbg(("ire_forward: dst_ill %p "
                            "for IRE_IF_NORESOLVER ire %p has "
                            "no ill_resolver_mp\n",
                            (void *)dst_ill, (void *)ire));
                        goto icmp_err_ret;
                }
                /* FALLTHRU */
        case IRE_IF_RESOLVER:
                /*
                 * We have the IRE_IF_RESOLVER of the nexthop gateway
                 * and now need to build a IRE_CACHE for it.
                 * In this case, we have the following :
                 *
                 * 1) src_ipif - used for getting a source address.
                 *
                 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
                 *    means packets using the IRE_CACHE that we will build
                 *    here will go out on dst_ill.
                 *
                 * 3) sire may or may not be NULL. But, the IRE_CACHE that is
                 *    to be created will only be tied to the IRE_INTERFACE
                 *    that was derived from the ire_ihandle field.
                 *
                 *    If sire is non-NULL, it means the destination is
                 *    off-link and we will first create the IRE_CACHE for the
                 *    gateway.
                 */
                res_mp = dst_ill->ill_resolver_mp;
                if (ire->ire_type == IRE_IF_RESOLVER &&
                    (!OK_RESOLVER_MP(res_mp))) {
                        goto icmp_err_ret;
                }
                /*
                 * To be at this point in the code with a non-zero gw
                 * means that dst is reachable through a gateway that
                 * we have never resolved.  By changing dst to the gw
                 * addr we resolve the gateway first.
                 */
                if (gw != INADDR_ANY) {
                        /*
                         * The source ipif that was determined above was
                         * relative to the destination address, not the
                         * gateway's. If src_ipif was not taken out of
                         * the IRE_IF_RESOLVER entry, we'll need to call
                         * ipif_select_source() again.
                         */
                        if (src_ipif != ire->ire_ipif) {
                                ipif_refrele(src_ipif);
                                src_ipif = ipif_select_source(dst_ill,
                                    gw, zoneid);
                                if (src_ipif == NULL)
                                        goto icmp_err_ret;
                        }
                        dst = gw;
                        gw = INADDR_ANY;
                }
                /*
                 * dst has been set to the address of the nexthop.
                 *
                 * TSol note: get security attributes of the nexthop;
                 * Note that the nexthop may either be a gateway, or the
                 * packet destination itself; Detailed explanation of
                 * issues involved is provided in the IRE_IF_NORESOLVER
                 * logic in ip_newroute().
                 */
                ga.ga_af = AF_INET;
                IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr);
                gcgrp = gcgrp_lookup(&ga, B_FALSE);

                if (ire->ire_type == IRE_IF_NORESOLVER)
                        dst = ire->ire_addr; /* ire_cache for tunnel endpoint */

                /*
                 * Remember the interface ire; 'ire' is about to be
                 * replaced by the new cache entry.
                 */
                save_ire = ire;
                /*
                 * create an incomplete IRE_CACHE.
                 * An areq_mp will be generated in ire_arpresolve() for
                 * RESOLVER interfaces.
                 *
                 * The ire->... arguments below are evaluated before the
                 * assignment, so they still read the interface ire
                 * (save_ire).
                 */
                ire = ire_create(
                    (uchar_t *)&dst,                /* dest address */
                    (uchar_t *)&ip_g_all_ones,      /* mask */
                    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
                    (uchar_t *)&gw,         /* gateway address */
                    (save_ire->ire_type == IRE_IF_RESOLVER ?  NULL:
                    &save_ire->ire_max_frag),
                    NULL,
                    dst_ill->ill_rq,         /* recv-from queue */
                    dst_ill->ill_wq,         /* send-to queue */
                    IRE_CACHE,                  /* IRE type */
                    src_ipif,
                    ire->ire_mask,           /* Parent mask */
                    0,
                    ire->ire_ihandle,        /* Interface handle */
                    0,
                    &(ire->ire_uinfo),
                    NULL,
                    gcgrp,
                    ipst);
                ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire));
                if (ire != NULL) {
                        gcgrp = NULL; /* reference now held by IRE */
                        ire->ire_marks |= ire_marks;
                        /* add the incomplete ire: */
                        error = ire_add(&ire, NULL, NULL, NULL, B_TRUE);
                        if (error == 0 && ire != NULL) {
                                ire->ire_max_frag = save_ire->ire_max_frag;
                                ip1dbg(("setting max_frag to %d in ire 0x%p\n",
                                    ire->ire_max_frag, (void *)ire));
                        } else {
                                /*
                                 * Drop the interface ire here; the error
                                 * exit only releases whatever 'ire' now
                                 * holds.
                                 */
                                ire_refrele(save_ire);
                                goto icmp_err_ret;
                        }
                } else {
                        /* ire_create() failed: give back the gcgrp hold */
                        if (gcgrp != NULL) {
                                GCGRP_REFRELE(gcgrp);
                                gcgrp = NULL;
                        }
                }

                ire_refrele(save_ire);
                break;
        default:
                break;
        }

        /* Normal exit: drop the helper references and hand back 'ire' */
        *ret_action = Forward_ok;
        if (sire != NULL)
                ire_refrele(sire);
        if (dst_ill != NULL)
                ill_refrele(dst_ill);
        if (src_ipif != NULL)
                ipif_refrele(src_ipif);
        return (ire);
icmp_err_ret:
        /*
         * Error exit: release everything still held. An RTF_BLACKHOLE
         * ire changes the reported action to Forward_blackhole.
         */
        *ret_action = Forward_ret_icmp_err;
        if (sire != NULL)
                ire_refrele(sire);
        if (dst_ill != NULL)
                ill_refrele(dst_ill);
        if (src_ipif != NULL)
                ipif_refrele(src_ipif);
        if (ire != NULL) {
                if (ire->ire_flags & RTF_BLACKHOLE)
                        *ret_action = Forward_blackhole;
                ire_refrele(ire);
        }
        return (NULL);
}
1240 
1241 /*
1242  * Since caller is ip_fast_forward, there is no CGTP or Tsol test
1243  * Also we dont call ftable lookup with MATCH_IRE_PARENT
1244  */
1245 
1246 ire_t *
1247 ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
1248     ip_stack_t *ipst)
1249 {
1250         ipaddr_t gw = 0;
1251         ire_t   *ire = NULL;
1252         ire_t   *sire = NULL, *save_ire;
1253         ill_t *dst_ill = NULL;
1254         int error;
1255         zoneid_t zoneid;
1256         ipif_t *src_ipif = NULL;
1257         mblk_t *res_mp;
1258         ushort_t ire_marks = 0;
1259 
1260         zoneid = GLOBAL_ZONEID;
1261 
1262 
1263         ire = ire_ftable_lookup_simple(dst, &sire, zoneid,
1264             MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
1265             MATCH_IRE_RJ_BHOLE, ipst);
1266 
1267         if (ire == NULL) {
1268                 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
1269                 goto icmp_err_ret;
1270         }
1271 
1272         /*
1273          * Verify that the returned IRE does not have either
1274          * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
1275          * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
1276          */
1277         if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
1278                 ASSERT(ire->ire_type & (IRE_CACHE | IRE_INTERFACE));
1279                 ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n",
1280                     (void *)ire));
1281                 goto icmp_err_ret;
1282         }
1283 
1284         /*
1285          * If we already have a fully resolved IRE CACHE of the
1286          * nexthop router, just hand over the cache entry
1287          * and we are done.
1288          */
1289 
1290         if (ire->ire_type & IRE_CACHE) {
1291 
1292                 /*
1293                  * If we are using this ire cache entry as a
1294                  * gateway to forward packets, chances are we
1295                  * will be using it again. So turn off
1296                  * the temporary flag, thus reducing its
1297                  * chances of getting deleted frequently.
1298                  */
1299                 if (ire->ire_marks & IRE_MARK_TEMPORARY) {
1300                         irb_t *irb = ire->ire_bucket;
1301                         rw_enter(&irb->irb_lock, RW_WRITER);
1302                         ire->ire_marks &= ~IRE_MARK_TEMPORARY;
1303                         irb->irb_tmp_ire_cnt--;
1304                         rw_exit(&irb->irb_lock);
1305                 }
1306 
1307                 if (sire != NULL) {
1308                         UPDATE_OB_PKT_COUNT(sire);
1309                         ire_refrele(sire);
1310                 }
1311                 *ret_action = Forward_ok;
1312                 return (ire);
1313         }
1314         /*
1315          * Increment the ire_ob_pkt_count field for ire if it is an
1316          * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
1317          * increment the same for the parent IRE, sire, if it is some
1318          * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST).
1319          */
1320         if ((ire->ire_type & IRE_INTERFACE) != 0) {
1321                 UPDATE_OB_PKT_COUNT(ire);
1322                 ire->ire_last_used_time = lbolt;
1323         }
1324 
1325         /*
1326          * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type
1327          */
1328         if (sire != NULL) {
1329                 gw = sire->ire_gateway_addr;
1330                 ASSERT((sire->ire_type &
1331                     (IRE_CACHETABLE | IRE_INTERFACE)) == 0);
1332                 UPDATE_OB_PKT_COUNT(sire);
1333         }
1334 
1335         /* Obtain dst_ill */
1336         dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
1337         if (dst_ill == NULL) {
1338                 ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
1339                     (void *)ire));
1340                 goto icmp_err_ret;
1341         }
1342 
1343         ASSERT(src_ipif == NULL);
1344         /* Now obtain the src_ipif */
1345         src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
1346             zoneid, &ire_marks);
1347         if (src_ipif == NULL)
1348                 goto icmp_err_ret;
1349 
1350         switch (ire->ire_type) {
1351         case IRE_IF_NORESOLVER:
1352                 /* create ire_cache for ire_addr endpoint */
1353         case IRE_IF_RESOLVER:
1354                 /*
1355                  * We have the IRE_IF_RESOLVER of the nexthop gateway
1356                  * and now need to build a IRE_CACHE for it.
1357                  * In this case, we have the following :
1358                  *
1359                  * 1) src_ipif - used for getting a source address.
1360                  *
1361                  * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
1362                  *    means packets using the IRE_CACHE that we will build
1363                  *    here will go out on dst_ill.
1364                  *
1365                  * 3) sire may or may not be NULL. But, the IRE_CACHE that is
1366                  *    to be created will only be tied to the IRE_INTERFACE
1367                  *    that was derived from the ire_ihandle field.
1368                  *
1369                  *    If sire is non-NULL, it means the destination is
1370                  *    off-link and we will first create the IRE_CACHE for the
1371                  *    gateway.
1372                  */
1373                 res_mp = dst_ill->ill_resolver_mp;
1374                 if (ire->ire_type == IRE_IF_RESOLVER &&
1375                     (!OK_RESOLVER_MP(res_mp))) {
1376                         ire_refrele(ire);
1377                         ire = NULL;
1378                         goto out;
1379                 }
1380                 /*
1381                  * To be at this point in the code with a non-zero gw
1382                  * means that dst is reachable through a gateway that
1383                  * we have never resolved.  By changing dst to the gw
1384                  * addr we resolve the gateway first.
1385                  */
1386                 if (gw != INADDR_ANY) {
1387                         /*
1388                          * The source ipif that was determined above was
1389                          * relative to the destination address, not the
1390                          * gateway's. If src_ipif was not taken out of
1391                          * the IRE_IF_RESOLVER entry, we'll need to call
1392                          * ipif_select_source() again.
1393                          */
1394                         if (src_ipif != ire->ire_ipif) {
1395                                 ipif_refrele(src_ipif);
1396                                 src_ipif = ipif_select_source(dst_ill,
1397                                     gw, zoneid);
1398                                 if (src_ipif == NULL)
1399                                         goto icmp_err_ret;
1400                         }
1401                         dst = gw;
1402                         gw = INADDR_ANY;
1403                 }
1404 
1405                 if (ire->ire_type == IRE_IF_NORESOLVER)
1406                         dst = ire->ire_addr; /* ire_cache for tunnel endpoint */
1407 
1408                 save_ire = ire;
1409                 /*
1410                  * create an incomplete IRE_CACHE.
1411                  * An areq_mp will be generated in ire_arpresolve() for
1412                  * RESOLVER interfaces.
1413                  */
1414                 ire = ire_create(
1415                     (uchar_t *)&dst,                /* dest address */
1416                     (uchar_t *)&ip_g_all_ones,      /* mask */
1417                     (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
1418                     (uchar_t *)&gw,         /* gateway address */
1419                     (save_ire->ire_type == IRE_IF_RESOLVER ?  NULL:
1420                     &save_ire->ire_max_frag),
1421                     NULL,
1422                     dst_ill->ill_rq,         /* recv-from queue */
1423                     dst_ill->ill_wq,         /* send-to queue */
1424                     IRE_CACHE,                  /* IRE type */
1425                     src_ipif,
1426                     ire->ire_mask,           /* Parent mask */
1427                     0,
1428                     ire->ire_ihandle,        /* Interface handle */
1429                     0,
1430                     &(ire->ire_uinfo),
1431                     NULL,
1432                     NULL,
1433                     ipst);
1434                 ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire));
1435                 if (ire != NULL) {
1436                         ire->ire_marks |= ire_marks;
1437                         /* add the incomplete ire: */
1438                         error = ire_add(&ire, NULL, NULL, NULL, B_TRUE);
1439                         if (error == 0 && ire != NULL) {
1440                                 ire->ire_max_frag = save_ire->ire_max_frag;
1441                                 ip1dbg(("setting max_frag to %d in ire 0x%p\n",
1442                                     ire->ire_max_frag, (void *)ire));
1443                         } else {
1444                                 ire_refrele(save_ire);
1445                                 goto icmp_err_ret;
1446                         }
1447                 }
1448 
1449                 ire_refrele(save_ire);
1450                 break;
1451         default:
1452                 break;
1453         }
1454 
1455 out:
1456         *ret_action = Forward_ok;
1457         if (sire != NULL)
1458                 ire_refrele(sire);
1459         if (dst_ill != NULL)
1460                 ill_refrele(dst_ill);
1461         if (src_ipif != NULL)
1462                 ipif_refrele(src_ipif);
1463         return (ire);
1464 icmp_err_ret:
1465         *ret_action = Forward_ret_icmp_err;
1466         if (src_ipif != NULL)
1467                 ipif_refrele(src_ipif);
1468         if (dst_ill != NULL)
1469                 ill_refrele(dst_ill);
1470         if (sire != NULL)
1471                 ire_refrele(sire);
1472         if (ire != NULL) {
1473                 if (ire->ire_flags & RTF_BLACKHOLE)
1474                         *ret_action = Forward_blackhole;
1475                 ire_refrele(ire);
1476         }
1477         /* caller needs to send icmp error message */
1478         return (NULL);
1479 
1480 }
1481 
1482 /*
1483  * Return, with a reference held, the ftable bucket (rt_irb) under which
1484  * 'ire' is to be added to ips_ip_ftable; NULL if none could be obtained.
1485  * A freshly allocated rt_entry is first offered to the radix tree through
1486  * its rnh_addaddr method.  If the route already exists, the new entry is
1487  * thrown away and the bucket of the existing route is returned instead.
1488  *
1489  * Locking notes: Need to hold the global radix tree lock in write mode to
1490  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
1491  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
1492  * while holding the irb_lock, but not the radix tree lock.
1493  */
1494 irb_t *
1495 ire_get_bucket(ire_t *ire)
1496 {
1497         ip_stack_t *ipst = ire->ire_ipst;
1498         struct rt_sockaddr rdst, rmask;
1499         struct rt_entry *rt;
1500         struct radix_node *rn;
1501         irb_t *irb = NULL;
1502 
1503         ASSERT(ipst->ips_ip_ftable != NULL);
1504 
1505         /* Build the radix key (destination) and netmask from the ire. */
1506         (void) memset(&rdst, 0, sizeof (rdst));
1507         rdst.rt_sin_len = sizeof (rdst);
1508         rdst.rt_sin_family = AF_INET;
1509         rdst.rt_sin_addr.s_addr = ire->ire_addr;
1510 
1511         (void) memset(&rmask, 0, sizeof (rmask));
1512         rmask.rt_sin_len = sizeof (rmask);
1513         rmask.rt_sin_family = AF_INET;
1514         rmask.rt_sin_addr.s_addr = ire->ire_mask;
1515 
1516         /* add the route. based on BSD's rtrequest1(RTM_ADD) */
1517         R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
1518         if (rt == NULL)
1519                 return (NULL);          /* allocation failed */
1520 
1521         (void) memset(rt, 0, sizeof (*rt));
1522         rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
1523         rt->rt_dst = rdst;
1524         irb = &rt->rt_irb;
1525         irb->irb_marks |= IRB_MARK_FTABLE; /* dynamically allocated/freed */
1526         irb->irb_ipst = ipst;
1527         rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
1528         RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
1529         rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
1530             ipst->ips_ip_ftable, (struct radix_node *)rt);
1531         if (rn == NULL) {
1532                 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1533                 /* rt was never inserted: undo rw_init() before freeing. */
1534                 rw_destroy(&irb->irb_lock);
1535                 Free(rt, rt_entry_cache);
1536                 rt = NULL;
1537                 irb = NULL;
1538                 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
1539                 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
1540                     ipst->ips_ip_ftable);
1541                 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
1542                         /* found a non-root match */
1543                         rt = (struct rt_entry *)rn;
1544                 }
1545         }
1546         if (rt != NULL) {
1547                 irb = &rt->rt_irb;
1548                 IRB_REFHOLD(irb);
1549         }
1550         RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1551         return (irb);
1552 }
1553 
1554 /*
1555  * Report the outbound interface that a packet for 'ipaddr' would use,
1556  * given nothing but that address (AF_INET or AF_INET6) and a zone.
1557  * If this is an offlink IP address and there are multiple routes to
1558  * this destination, the first route found to the address is used.
1559  *
1560  * Return values:
1561  *      0       - FAILURE
1562  *      nonzero - ifindex
1563  */
1564 uint_t
1565 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
1566 {
1567         netstack_t *ns;
1568         ip_stack_t *ipst;
1569         ire_t *route;
1570         ill_t *out_ill;
1571         uint_t result = 0;
1572 
1573         /* ALL_ZONES is resolved through the global zone's netstack. */
1574         ns = netstack_find_by_zoneid(
1575             (zoneid == ALL_ZONES) ? GLOBAL_ZONEID : zoneid);
1576         ASSERT(ns != NULL);
1577 
1578         /*
1579          * IP uses the global zoneid inside exclusive stacks, so any
1580          * stack other than the global one gets GLOBAL_ZONEID.
1581          */
1582         if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1583                 zoneid = GLOBAL_ZONEID;
1584         ipst = ns->netstack_ip;
1585 
1586         ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
1587 
1588         route = route_to_dst(ipaddr, zoneid, ipst);
1589         if (route != NULL) {
1590                 out_ill = ire_to_ill(route);
1591                 if (out_ill != NULL)
1592                         result = out_ill->ill_phyint->phyint_ifindex;
1593                 ire_refrele(route);
1594         }
1595         netstack_rele(ns);
1596         return (result);
1597 }
1598 
1599 /*
1600  * Find the route to dst_addr within the given zone and IP stack, using
1601  * the MATCH_IRE_DSTONLY, _DEFAULT, _RECURSIVE and _RJ_BHOLE match flags.
1602  * An AF_INET address goes through ire_route_lookup(); any other family is
1603  * handled as AF_INET6 through ire_route_lookup_v6().  The lookup result
1604  * is returned as is; ifindex_lookup() NULL-checks and ire_refrele()s it.
1605  */
1606 static  ire_t *
1607 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
1608 {
1609         const int flags = MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
1610             MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE;
1611 
1612         /* XXX pass NULL tsl for now */
1613 
1614         if (dst_addr->sa_family != AF_INET) {
1615                 /* Anything that is not AF_INET is taken to be IPv6. */
1616                 return (ire_route_lookup_v6(
1617                     &((struct sockaddr_in6 *)dst_addr)->sin6_addr,
1618                     0, 0, 0, NULL, NULL, zoneid, NULL, flags, ipst));
1619         }
1620 
1621         return (ire_route_lookup(
1622             ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr,
1623             0, 0, 0, NULL, NULL, zoneid, NULL, flags, ipst));
1624 }
1625 
1626 /*
1627  * This routine is called by IP Filter to send a packet out on the wire
1628  * to a specified V4 dst (which may be onlink or offlink). The ifindex may or
1629  * may not be 0. A non-zero ifindex indicates IP Filter has stipulated
1630  * an outgoing interface and requires the nexthop to be on that interface.
1631  * IP WILL NOT DO the following to the data packet before sending it out:
1632  *      a. manipulate ttl
1633  *      b. ipsec work
1634  *      c. fragmentation
1635  *
1636  * If the packet has been prepared for hardware checksum then it will be
1637  * passed off to ip_send_align_hcksum_flags() to check that the flags set
1638  * on the packet are in alignment with the capabilities of the new
1639  * outgoing NIC.
1640  *
1641  * Every failure detected here before the packet is handed to ip_xmit_v4()
1642  * frees mp and bumps ipIfStatsOutDiscards.
1643  *
1644  * Return values:
1645  *      0:              IP was able to send off the data pkt
1646  *      ECOMM:          Could not send packet
1647  *      ENONET          No route to dst. It is up to the caller
1648  *                      to send icmp unreachable error message,
1649  *      EINPROGRESS     The macaddr of the onlink dst or that
1650  *                      of the offlink dst's nexthop needs to get
1651  *                      resolved before packet can be sent to dst.
1652  *                      Thus transmission is not guaranteed.
1653  */
1654 int
1655 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
1656     zoneid_t zoneid)
1657 {
1658         ire_t *ire = NULL, *sire = NULL;
1659         ire_t *ire_cache = NULL;
1660         int value;
1661         int match_flags;
1662         ipaddr_t dst;
1663         netstack_t *ns;
1664         ip_stack_t *ipst;
1665         enum ire_forward_action ret_action;
1666 
1667         ASSERT(mp != NULL);
1668 
1669         if (zoneid == ALL_ZONES)
1670                 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
1671         else
1672                 ns = netstack_find_by_zoneid(zoneid);
1673         ASSERT(ns != NULL);
1674 
1675         /*
1676          * For exclusive stacks we set the zoneid to zero
1677          * since IP uses the global zoneid in the exclusive stacks.
1678          */
1679         if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1680                 zoneid = GLOBAL_ZONEID;
1681         ipst = ns->netstack_ip;
1682 
1683         ASSERT(dst_addr->sa_family == AF_INET ||
1684             dst_addr->sa_family == AF_INET6);
1685 
1686         if (dst_addr->sa_family == AF_INET) {
1687                 dst = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
1688         } else {
1689                 /*
1690                  * We don't have support for V6 yet. It will be provided
1691                  * once RFE  6399103  has been delivered.
1692                  * Until then, for V6 dsts, IP Filter will not call
1693                  * this function. Instead the netinfo framework provides
1694                  * its own code path, in ip_inject_impl(), to achieve
1695                  * what it needs to do, for the time being.
1696                  */
1697                 ip1dbg(("ipfil_sendpkt: no V6 support \n"));
1698                 value = ECOMM;
1699                 freemsg(mp);
1700                 goto discard;
1701         }
1702 
1703         /*
1704          * Lets get the ire. We might get the ire cache entry,
1705          * or the ire,sire pair needed to create the cache entry.
1706          * The packet's label, MBLK_GETLABEL(mp), is passed as the tsl.
1707          */
1708         if (ifindex == 0) {
1709                 /* There is no supplied index. So use the FIB info */
1710                 match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
1711                     MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE);
1712                 ire = ire_route_lookup(dst,
1713                     0, 0, 0, NULL, &sire, zoneid, MBLK_GETLABEL(mp),
1714                     match_flags, ipst);
1715         } else {
1716                 ipif_t *supplied_ipif;
1717                 ill_t *ill;
1718 
1719                 match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
1720                     MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE|
1721                     MATCH_IRE_SECATTR);
1722 
1723                 /*
1724                  * If supplied ifindex is non-zero, the only valid
1725                  * nexthop is one off of the interface or group corresponding
1726                  * to the specified ifindex.
1727                  */
1728                 ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
1729                     NULL, NULL, NULL, NULL, ipst);
1730                 if (ill != NULL) {
1731                         match_flags |= MATCH_IRE_ILL;
1732                 } else {
1733                         /* Fallback to group names if hook_emulation set */
1734                         if (ipst->ips_ipmp_hook_emulation) {
1735                                 ill = ill_group_lookup_on_ifindex(ifindex,
1736                                     B_FALSE, ipst);
1737                         }
1738                         if (ill == NULL) {
1739                                 ip1dbg(("ipfil_sendpkt: Could not find"
1740                                     " route to dst\n"));
1741                                 value = ECOMM;
1742                                 freemsg(mp);
1743                                 goto discard;
1744                         }
1745                         match_flags |= MATCH_IRE_ILL_GROUP;
1746                 }
1747                 supplied_ipif = ipif_get_next_ipif(NULL, ill);
1748 
1749                 ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif,
1750                     &sire, zoneid, MBLK_GETLABEL(mp), match_flags, ipst);
1751                 /* Only drop the ipif hold if one was actually returned. */
1752                 if (supplied_ipif != NULL)
1753                         ipif_refrele(supplied_ipif);
1754                 ill_refrele(ill);
1755         }
1756 
1757         /*
1758          * Verify that the returned IRE is non-null and does
1759          * not have either the RTF_REJECT or RTF_BLACKHOLE
1760          * flags set and that the IRE is  either an IRE_CACHE,
1761          * IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
1762          */
1763         if (ire == NULL ||
1764             ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
1765             (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)) {
1766                 /* Either ire could not be found or we got an invalid one */
1767                 ip1dbg(("ipfil_sendpkt: Could not find route to dst\n"));
1768                 value = ENONET;
1769                 freemsg(mp);
1770                 goto discard;
1771         }
1772 
1773         /* IP Filter and CGTP dont mix. So bail out if CGTP is on */
1774         if (ipst->ips_ip_cgtp_filter &&
1775             ((ire->ire_flags & RTF_MULTIRT) ||
1776             ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) {
1777                 ip1dbg(("ipfil_sendpkt: IPFilter does not work with CGTP\n"));
1778                 value = ECOMM;
1779                 freemsg(mp);
1780                 goto discard;
1781         }
1782 
1783         ASSERT(ire->ire_type != IRE_CACHE || ire->ire_nce != NULL);
1784 
1785         /*
1786          * If needed, we will create the ire cache entry for the
1787          * nexthop, resolve its link-layer address and then send
1788          * the packet out without ttl or IPSec processing.
1789          */
1790         switch (ire->ire_type) {
1791         case IRE_CACHE:
1792                 if (sire != NULL) {
1793                         UPDATE_OB_PKT_COUNT(sire);
1794                         sire->ire_last_used_time = lbolt;
1795                         ire_refrele(sire);
1796                 }
1797                 ire_cache = ire;
1798                 break;
1799         case IRE_IF_NORESOLVER:
1800         case IRE_IF_RESOLVER:
1801                 /*
1802                  * Call ire_forward(). This function will create the
1803                  * ire cache entry of the nexthop and add this
1804                  * incomplete ire to the ire cache table. On failure it
1805                  * is relied upon to have dropped the ire and sire holds.
1806                  */
1807                 ire_cache = ire_forward(dst, &ret_action, ire, sire,
1808                     MBLK_GETLABEL(mp), ipst);
1809                 if (ire_cache == NULL) {
1810                         ip1dbg(("ipfil_sendpkt: failed to create the"
1811                             " ire cache entry \n"));
1812                         value = ENONET;
1813                         freemsg(mp);
1814                         sire = NULL;
1815                         ire = NULL;
1816                         goto discard;
1817                 }
1818                 break;
1819         }
1820 
1821         /* On failure the helper has already freed mp; drop our hold. */
1822         if (DB_CKSUMFLAGS(mp) &&
1823             ip_send_align_hcksum_flags(mp, ire_to_ill(ire_cache))) {
1824                 value = ECOMM;
1825                 ire_refrele(ire_cache);
1826                 sire = NULL;
1827                 ire = NULL;
1828                 goto discard;
1829         }
1830 
1831         /*
1832          * With the nexthop's ire cache entry in hand, call ip_xmit_v4() to
1833          * trigger mac addr resolution if necessary and send once ready.
1834          */
1835         value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE);
1836         ire_refrele(ire_cache);
1837         /*
1838          * The holds on ire and sire have already been released within
1839          * ire_forward() and/or ip_xmit_v4(). NULL them so that the
1840          * discard path below does not drop them a second time when
1841          * ip_xmit_v4() returned SEND_FAILED or LLHDR_RESLV_FAILED.
1842          */
1843         sire = NULL;
1844         ire = NULL;
1845 
1846         switch (value) {
1847         case SEND_FAILED:
1848                 ip1dbg(("ipfil_sendpkt: Send failed\n"));
1849                 value = ECOMM;
1850                 break;
1851         case LLHDR_RESLV_FAILED:
1852                 ip1dbg(("ipfil_sendpkt: Link-layer resolution"
1853                     "  failed\n"));
1854                 value = ECOMM;
1855                 break;
1856         case LOOKUP_IN_PROGRESS:
1857                 netstack_rele(ns);
1858                 return (EINPROGRESS);
1859         case SEND_PASSED:
1860                 netstack_rele(ns);
1861                 return (0);
1862         }
1863 discard:
1864         if (dst_addr->sa_family == AF_INET) {
1865                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1866         } else {
1867                 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
1868         }
1869         if (ire != NULL)
1870                 ire_refrele(ire);
1871         if (sire != NULL)
1872                 ire_refrele(sire);
1873         netstack_rele(ns);
1874         return (value);
1875 }
1876 
1877 
1878 /*
1879  * Reconcile the checksum offload flags on mp with the transmit checksum
1880  * capabilities of ill.  Returns 0 when done, or -1 after freeing mp when
1881  * it carries flags other than the HCK_* checksum ones.  dohwcksum is not
1882  * checked in here: if DB_CKSUMFLAGS() is non-zero we assume it was true.
1883  *
1884  * This function assumes that it is *only* being called for TCP or UDP
1885  * packets and nothing else.
1886  */
1887 static int
1888 ip_send_align_hcksum_flags(mblk_t *mp, ill_t *ill)
1889 {
1890         int illhckflags;
1891         int mbhckflags;
1892         uint16_t *up;
1893         uint32_t cksum;
1894         ipha_t *ipha;
1895         ip6_t *ip6;
1896         int proto;
1897         int ipversion;
1898         int length;
1899         int start;
1900         ip6_pkt_t ipp;
1901 
1902         mbhckflags = DB_CKSUMFLAGS(mp);
1903         ASSERT(mbhckflags != 0);
1904         ASSERT(mp->b_datap->db_type == M_DATA);
1905         /*
1906          * Since this function only knows how to manage the hardware checksum
1907          * issue, reject any packets that have flags set aside from the
1908          * checksum related attributes as we cannot necessarily safely map
1909          * that packet onto the new NIC.  Packets that can be potentially
1910          * dropped here include those marked for LSO.
1911          */
1912         if ((mbhckflags &
1913             ~(HCK_FULLCKSUM|HCK_PARTIALCKSUM|HCK_IPV4_HDRCKSUM)) != 0) {
1914                 DTRACE_PROBE2(pbr__incapable, (mblk_t *), mp, (ill_t *), ill);
1915                 freemsg(mp);
1916                 return (-1);
1917         }
1918 
1919         ipha = (ipha_t *)mp->b_rptr;
1920 
1921         /*
1922          * Find out what the new NIC is capable of, if anything, and
1923          * only allow it to be used with M_DATA mblks being sent out.
1924          */
1925         if (ILL_HCKSUM_CAPABLE(ill)) {
1926                 illhckflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
1927         } else {
1928                 /*
1929                  * No capabilities, so turn off everything.
1930                  */
1931                 illhckflags = 0;
1932                 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 0, 0);
1933                 mp->b_datap->db_struioflag &= ~STRUIO_IP;
1934         }
1935 
1936         DTRACE_PROBE4(pbr__info__a, (mblk_t *), mp, (ill_t *), ill,
1937             uint32_t, illhckflags, uint32_t, mbhckflags);
1938         /*
1939          * This block of code that looks for the position of the TCP/UDP
1940          * checksum is early in this function because we need to know
1941          * what needs to be blanked out for the hardware checksum case.
1942          *
1943          * That we're in this function implies that the packet is either
1944          * TCP or UDP on Solaris, so checks are made for one protocol and
1945          * if that fails, the other is therefore implied.
1946          */
1947         ipversion = IPH_HDR_VERSION(ipha);
1948 
1949         if (ipversion == IPV4_VERSION) {
1950                 proto = ipha->ipha_protocol;
1951                 if (proto == IPPROTO_TCP) {
1952                         up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1953                 } else {
1954                         up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1955                 }
1956         } else {
1957                 uint8_t lasthdr;
1958 
1959                 /*
1960                  * Nothing I've seen indicates that IPv6 checksum'ing
1961                  * precludes the presence of extension headers, so we
1962                  * can't just look at the next header value in the IPv6
1963                  * packet header to see if it is TCP/UDP.
1964                  */
1965                 ip6 = (ip6_t *)ipha;
1966                 (void) memset(&ipp, 0, sizeof (ipp));
1967                 start = ip_find_hdr_v6(mp, ip6, &ipp, &lasthdr);
1968                 proto = lasthdr;
1969 
1970                 if (proto == IPPROTO_TCP) {
1971                         up = IPH_TCPH_CHECKSUMP(ipha, start);
1972                 } else {
1973                         up = IPH_UDPH_CHECKSUMP(ipha, start);
1974                 }
1975         }
1976 
1977         /*
1978          * The first case here is easiest:
1979          * the mblk asked for full or partial checksum and the card supports
1980          * full checksum for this IP version, so mark it for full checksum.
1981          * In addition, check for IPv4 header capability.  Note that only
1982          * the mblk flag is checked and not ipversion.
1983          */
1984         if ((((illhckflags & HCKSUM_INET_FULL_V4) && (ipversion == 4)) ||
1985             (((illhckflags & HCKSUM_INET_FULL_V6) && (ipversion == 6)))) &&
1986             ((mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) != 0)) {
1987                 int newflags = HCK_FULLCKSUM;
1988 
1989                 if ((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) {
1990                         if ((illhckflags & HCKSUM_IPHDRCKSUM) != 0) {
1991                                 newflags |= HCK_IPV4_HDRCKSUM;
1992                         } else {
1993                                 /*
1994                                  * Rather than call a function, just inline
1995                                  * the computation of the basic IPv4 header.
1996                                  */
1997                                 cksum = (ipha->ipha_dst >> 16) +
1998                                     (ipha->ipha_dst & 0xFFFF) +
1999                                     (ipha->ipha_src >> 16) +
2000                                     (ipha->ipha_src & 0xFFFF);
2001                                 IP_HDR_CKSUM(ipha, cksum,
2002                                     ((uint32_t *)ipha)[0],
2003                                     ((uint16_t *)ipha)[4]);
2004                         }
2005                 }
2006 
2007                 *up = 0;
2008                 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
2009                     newflags, 0);
2010                 return (0);
2011         }
2012 
2013         DTRACE_PROBE2(pbr__info__b, int, ipversion, int, proto);
2014 
2015         /*
2016          * Start calculating the pseudo checksum over the IP packet header.
2017          * Although the final pseudo checksum used by TCP/UDP consists of
2018          * more than just the address fields, we can use the result of
2019          * adding those together a little bit further down for IPv4.
2020          */
2021         if (ipversion == IPV4_VERSION) {
2022                 cksum = (ipha->ipha_dst >> 16) + (ipha->ipha_dst & 0xFFFF) +
2023                     (ipha->ipha_src >> 16) + (ipha->ipha_src & 0xFFFF);
2024                 start = IP_SIMPLE_HDR_LENGTH;
2025                 length = ntohs(ipha->ipha_length);
2026                 DTRACE_PROBE3(pbr__info__e, uint32_t, ipha->ipha_src,
2027                     uint32_t, ipha->ipha_dst, int, cksum);
2028         } else {
2029                 uint16_t *pseudo;
2030 
2031                 pseudo = (uint16_t *)&ip6->ip6_src;
2032 
2033                 /* calculate pseudo-header checksum */
2034                 cksum = pseudo[0] + pseudo[1] + pseudo[2] + pseudo[3] +
2035                     pseudo[4] + pseudo[5] + pseudo[6] + pseudo[7] +
2036                     pseudo[8] + pseudo[9] + pseudo[10] + pseudo[11] +
2037                     pseudo[12] + pseudo[13] + pseudo[14] + pseudo[15];
2038 
2039                 length = ntohs(ip6->ip6_plen) + sizeof (ip6_t);
2040         }
2041 
2042         /* Fold the initial sum */
2043         cksum = (cksum & 0xffff) + (cksum >> 16);
2044 
2045         /*
2046          * If the packet was asking for an IPv4 header checksum to be
2047          * calculated but the interface doesn't support that, fill it in
2048          * using our pseudo checksum as a starting point.
2049          */
2050         if (((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) &&
2051             ((illhckflags & HCKSUM_IPHDRCKSUM) == 0)) {
2052                 /*
2053                  * IP_HDR_CKSUM uses the 2nd arg to the macro in a destructive
2054                  * way so pass in a copy of the checksum calculated thus far.
2055                  */
2056                 uint32_t ipsum = cksum;
2057 
2058                 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
2059 
2060                 IP_HDR_CKSUM(ipha, ipsum, ((uint32_t *)ipha)[0],
2061                     ((uint16_t *)ipha)[4]);
2062         }
2063 
2064         DTRACE_PROBE3(pbr__info__c, int, start, int, length, int, cksum);
2065 
2066         if (proto == IPPROTO_TCP) {
2067                 cksum += IP_TCP_CSUM_COMP;
2068         } else {
2069                 cksum += IP_UDP_CSUM_COMP;
2070         }
2071         cksum += htons(length - start);
2072         cksum = (cksum & 0xffff) + (cksum >> 16);
2073 
2074         /*
2075          * For TCP/UDP, we either want to setup the packet for partial
2076          * checksum or we want to do it all ourselves because the NIC
2077          * offers no support for either partial or full checksum.
2078          */
2079         if ((illhckflags & HCKSUM_INET_PARTIAL) != 0) {
2080                 /*
2081                  * The only case we care about here is if the mblk was
2082                  * previously set for full checksum offload.  If it was
2083                  * marked for partial (and the NIC does partial), then
2084                  * we have nothing to do.  Similarly if the packet was
2085                  * not set for partial or full, we do nothing as this
2086                  * is cheaper than more work to set something up.
2087                  */
2088                 if ((mbhckflags & HCK_FULLCKSUM) != 0) {
2089                         uint32_t offset;
2090 
2091                         if (proto == IPPROTO_TCP) {
2092                                 offset = TCP_CHECKSUM_OFFSET;
2093                         } else {
2094                                 offset = UDP_CHECKSUM_OFFSET;
2095                         }
2096                         *up = cksum;
2097 
2098                         DTRACE_PROBE3(pbr__info__f, int, length - start, int,
2099                             cksum, int, offset);
2100 
2101                         (void) hcksum_assoc(mp, NULL, NULL, start,
2102                             start + offset, length, 0,
2103                             DB_CKSUMFLAGS(mp) | HCK_PARTIALCKSUM, 0);
2104                 }
2105 
2106         } else if (mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) {
2107                 DB_CKSUMFLAGS(mp) &= ~(HCK_PARTIALCKSUM|HCK_FULLCKSUM);
2108 
2109                 *up = 0;
2110                 *up = IP_CSUM(mp, start, cksum);
2111         }
2112 
2113         DTRACE_PROBE4(pbr__info__d, (mblk_t *), mp, (ipha_t *), ipha,
2114             (uint16_t *), up, int, cksum);
2115         return (0);
2116 }
2117 
2118 /*
2119  * Callback handed by ire_ftable_lookup to rn_match_args(); 'arg' is the
2120  * ire_ftable_args_t describing the lookup.  Runs ire_match_args on the
2121  * ires of the matched leaf node and, for the first non-condemned ire
2122  * that matches, takes a hold on it, stores it in ift_best_ire and
2123  * returns B_TRUE.  Returns B_FALSE if the bucket has no match.
2124  */
2125 boolean_t
2126 ire_find_best_route(struct radix_node *rn, void *arg)
2127 {
2128         struct rt_entry *rt = (struct rt_entry *)rn;
2129         ire_ftable_args_t *margs = arg;
2130         irb_t *bucket;
2131         ire_t *ire;
2132         ipaddr_t mask;
2133         boolean_t found = B_FALSE;
2134 
2135         ASSERT(rt != NULL);
2136 
2137         bucket = &rt->rt_irb;
2138         if (bucket->irb_ire_cnt == 0)
2139                 return (B_FALSE);
2140 
2141         rw_enter(&bucket->irb_lock, RW_READER);
2142         for (ire = bucket->irb_ire; ire != NULL; ire = ire->ire_next) {
2143                 if (ire->ire_marks & IRE_MARK_CONDEMNED)
2144                         continue;
2145                 mask = (margs->ift_flags & MATCH_IRE_MASK) ?
2146                     margs->ift_mask : ire->ire_mask;
2147                 if (!ire_match_args(ire, margs->ift_addr, mask,
2148                     margs->ift_gateway, margs->ift_type, margs->ift_ipif,
2149                     margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl,
2150                     margs->ift_flags))
2151                         continue;
2152                 IRE_REFHOLD(ire);
2153                 found = B_TRUE;
2154                 break;
2155         }
2156         rw_exit(&bucket->irb_lock);
2157         if (found)
2158                 margs->ift_best_ire = ire;
2159         return (found);
2160 }
2161 
2162 /*
2163  * ftable irb_t structures are dynamically allocated, and we need to
2164  * check if the irb_t (and associated ftable tree attachment) needs to
2165  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
2166  * to be verified are:
2167  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
2168  * - no other threads holding references to ire's in the bucket,
2169  *   i.e., irb_nire == 0
2170  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
2171  * - need to hold the global tree lock and irb_lock in write mode.
2172  */
2173 void
2174 irb_refrele_ftable(irb_t *irb)
2175 {
2176         for (;;) {
2177                 rw_enter(&irb->irb_lock, RW_WRITER);
2178                 ASSERT(irb->irb_refcnt != 0);
2179                 if (irb->irb_refcnt != 1) {
2180                         /*
2181                          * Someone has a reference to this radix node
2182                          * or there is some bucket walker.
2183                          */
2184                         irb->irb_refcnt--;
2185                         rw_exit(&irb->irb_lock);
2186                         return;
2187                 } else {
2188                         /*
2189                          * There is no other walker, nor is there any
2190                          * other thread that holds a direct ref to this
2191                          * radix node. Do the clean up if needed. Call
2192                          * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
2193                          */
2194                         if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
2195                                 ire_t *ire_list;
2196 
2197                                 ire_list = ire_unlink(irb);
2198                                 rw_exit(&irb->irb_lock);
2199 
2200                                 if (ire_list != NULL)
2201                                         ire_cleanup(ire_list);
2202                                 /*
2203                                  * more CONDEMNED entries could have
2204                                  * been added while we dropped the lock,
2205                                  * so we have to re-check.
2206                                  */
2207                                 continue;
2208                         }
2209 
2210                         /*
2211                          * Now check if there are still any ires
2212                          * associated with this radix node.
2213                          */
2214                         if (irb->irb_nire != 0) {
2215                                 /*
2216                                  * someone is still holding on
2217                                  * to ires in this bucket
2218                                  */
2219                                 irb->irb_refcnt--;
2220                                 rw_exit(&irb->irb_lock);
2221                                 return;
2222                         } else {
2223                                 /*
2224                                  * Everything is clear. Zero walkers,
2225                                  * Zero threads with a ref to this
2226                                  * radix node, Zero ires associated with
2227                                  * this radix node. Due to lock order,
2228                                  * check the above conditions again
2229                                  * after grabbing all locks in the right order
2230                                  */
2231                                 rw_exit(&irb->irb_lock);
2232                                 if (irb_inactive(irb))
2233                                         return;
2234                                 /*
2235                                  * irb_inactive could not free the irb.
2236                                  * See if there are any walkers, if not
2237                                  * try to clean up again.
2238                                  */
2239                         }
2240                 }
2241         }
2242 }
2243 
2244 /*
2245  * IRE iterator used by ire_ftable_lookup() to process multiple default
2246  * routes. Given a starting point in the hash list (ire_origin), walk the IREs
2247  * in the bucket skipping default interface routes and deleted entries.
2248  * Returns the next IRE (unheld), or NULL when we're back to the starting point.
2249  * Assumes that the caller holds a reference on the IRE bucket.
2250  *
2251  * In the absence of good IRE_DEFAULT routes, this function will return
2252  * the first IRE_INTERFACE route found (if any).
2253  */
2254 ire_t *
2255 ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs,
2256         ip_stack_t *ipst)
2257 {
2258         ire_t   *ire_origin;
2259         ire_t   *ire, *maybe_ire = NULL;
2260 
2261         rw_enter(&irb_ptr->irb_lock, RW_WRITER);
2262         ire_origin = irb_ptr->irb_rr_origin;
2263         if (ire_origin != NULL) {
2264                 ire_origin = ire_origin->ire_next;
2265                 IRE_FIND_NEXT_ORIGIN(ire_origin);
2266         }
2267 
2268         if (ire_origin == NULL) {
2269                 /*
2270                  * first time through routine, or we dropped off the end
2271                  * of list.
2272                  */
2273                 ire_origin = irb_ptr->irb_ire;
2274                 IRE_FIND_NEXT_ORIGIN(ire_origin);
2275         }
2276         irb_ptr->irb_rr_origin = ire_origin;
2277         IRB_REFHOLD_LOCKED(irb_ptr);
2278         rw_exit(&irb_ptr->irb_lock);
2279 
2280         DTRACE_PROBE2(ire__rr__origin, (irb_t *), irb_ptr,
2281             (ire_t *), ire_origin);
2282 
2283         /*
2284          * Round-robin the routers list looking for a route that
2285          * matches the passed in parameters.
2286          * We start with the ire we found above and we walk the hash
2287          * list until we're back where we started. It doesn't matter if
2288          * routes are added or deleted by other threads - we know this
2289          * ire will stay in the list because we hold a reference on the
2290          * ire bucket.
2291          */
2292         ire = ire_origin;
2293         while (ire != NULL) {
2294                 int match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
2295                 ire_t *rire;
2296 
2297                 if (ire->ire_marks & IRE_MARK_CONDEMNED)
2298                         goto next_ire;
2299 
2300                 if (!ire_match_args(ire, margs->ift_addr, (ipaddr_t)0,
2301                     margs->ift_gateway, margs->ift_type, margs->ift_ipif,
2302                     margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl,
2303                     margs->ift_flags))
2304                         goto next_ire;
2305 
2306                 if (ire->ire_type & IRE_INTERFACE) {
2307                         /*
2308                          * keep looking to see if there is a non-interface
2309                          * default ire, but save this one as a last resort.
2310                          */
2311                         if (maybe_ire == NULL)
2312                                 maybe_ire = ire;
2313                         goto next_ire;
2314                 }
2315 
2316                 if (zoneid == ALL_ZONES) {
2317                         IRE_REFHOLD(ire);
2318                         IRB_REFRELE(irb_ptr);
2319                         return (ire);
2320                 }
2321                 /*
2322                  * When we're in a non-global zone, we're only
2323                  * interested in routers that are
2324                  * reachable through ipifs within our zone.
2325                  */
2326                 if (ire->ire_ipif != NULL) {
2327                         match_flags |= MATCH_IRE_ILL_GROUP;
2328                 }
2329                 rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0,
2330                     IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl,
2331                     match_flags, ipst);
2332                 if (rire != NULL) {
2333                         ire_refrele(rire);
2334                         IRE_REFHOLD(ire);
2335                         IRB_REFRELE(irb_ptr);
2336                         return (ire);
2337                 }
2338 next_ire:
2339                 ire = (ire->ire_next ?  ire->ire_next : irb_ptr->irb_ire);
2340                 if (ire == ire_origin)
2341                         break;
2342         }
2343         if (maybe_ire != NULL)
2344                 IRE_REFHOLD(maybe_ire);
2345         IRB_REFRELE(irb_ptr);
2346         return (maybe_ire);
2347 }
2348 
2349 void
2350 irb_refhold_rn(struct radix_node *rn)
2351 {
2352         if ((rn->rn_flags & RNF_ROOT) == 0)
2353                 IRB_REFHOLD(&((rt_t *)(rn))->rt_irb);
2354 }
2355 
2356 void
2357 irb_refrele_rn(struct radix_node *rn)
2358 {
2359         if ((rn->rn_flags & RNF_ROOT) == 0)
2360                 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
2361 }
--- EOF ---