14 files changed, 8371 insertions, 0 deletions
diff --git a/arch/tile/include/hv/drv_mpipe_intf.h b/arch/tile/include/hv/drv_mpipe_intf.h
new file mode 100644
index 000000000..c97e416dd
--- /dev/null
+++ b/arch/tile/include/hv/drv_mpipe_intf.h
@@ -0,0 +1,605 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the mpipe driver.
+ */
+
+#ifndef _SYS_HV_DRV_MPIPE_INTF_H
+#define _SYS_HV_DRV_MPIPE_INTF_H
+
+#include <arch/mpipe.h>
+#include <arch/mpipe_constants.h>
+
+
+/** Number of mPIPE instances supported */
+#define HV_MPIPE_INSTANCE_MAX   (2)
+
+/** Number of buffer stacks (32). */
+#define HV_MPIPE_NUM_BUFFER_STACKS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH)
+
+/** Number of NotifRings (256). */
+#define HV_MPIPE_NUM_NOTIF_RINGS (MPIPE_NUM_NOTIF_RINGS)
+
+/** Number of NotifGroups (32). */
+#define HV_MPIPE_NUM_NOTIF_GROUPS (MPIPE_NUM_NOTIF_GROUPS)
+
+/** Number of buckets (4160). */
+#define HV_MPIPE_NUM_BUCKETS (MPIPE_NUM_BUCKETS)
+
+/** Number of "lo" buckets (4096). */
+#define HV_MPIPE_NUM_LO_BUCKETS 4096
+
+/** Number of "hi" buckets (64). */
+#define HV_MPIPE_NUM_HI_BUCKETS \
+  (HV_MPIPE_NUM_BUCKETS - HV_MPIPE_NUM_LO_BUCKETS)
+
+/** Number of edma rings (24). */
+#define HV_MPIPE_NUM_EDMA_RINGS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH)
+
+
+
+
+/** A flag bit indicating a fixed resource allocation. */
+#define HV_MPIPE_ALLOC_FIXED 0x01
+
+/** Offset for the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_CFG << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_SIZE (64 * 1024)
+
+/** Offset for the config register MMIO region. */
+#define HV_MPIPE_FAST_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_IDMA << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the fast register MMIO region (IDMA, EDMA, buffer stack). */
+#define HV_MPIPE_FAST_MMIO_SIZE \
+  ((MPIPE_MMIO_ADDR__REGION_VAL_BSM + 1 - MPIPE_MMIO_ADDR__REGION_VAL_IDMA) \
+   << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+
+/*
+ * Each type of resource allocation comes in quantized chunks, where
+ * XXX_BITS is the number of chunks, and XXX_RES_PER_BIT is the number
+ * of resources in each chunk.
+ */
+
+/** Number of buffer stack chunks available (32). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH
+
+/** Granularity of buffer stack allocation (1). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_RES_PER_BIT \
+  (HV_MPIPE_NUM_BUFFER_STACKS / HV_MPIPE_ALLOC_BUFFER_STACKS_BITS)
+
+/** Number of NotifRing chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__NOTIF_RING_MASK_WIDTH
+
+/** Granularity of NotifRing allocation (8). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_RINGS / HV_MPIPE_ALLOC_NOTIF_RINGS_BITS)
+
+/** Number of NotifGroup chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS \
+  HV_MPIPE_NUM_NOTIF_GROUPS
+
+/** Granularity of NotifGroup allocation (1). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_GROUPS / HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS)
+
+/** Number of lo bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_LO_WIDTH
+
+/** Granularity of lo bucket allocation (256). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_LO_BUCKETS / HV_MPIPE_ALLOC_LO_BUCKETS_BITS)
+
+/** Number of hi bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_HI_WIDTH
+
+/** Granularity of hi bucket allocation (4). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_HI_BUCKETS / HV_MPIPE_ALLOC_HI_BUCKETS_BITS)
+
+/** Number of eDMA ring chunks available (24). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH
+
+/** Granularity of eDMA ring allocation (1). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_EDMA_RINGS / HV_MPIPE_ALLOC_EDMA_RINGS_BITS)
+
+
+
+
+/** Bit vector encoding which NotifRings are in a NotifGroup. */
+typedef struct
+{
+  /** The actual bits. */
+  uint64_t ring_mask[4];
+
+} gxio_mpipe_notif_group_bits_t;
+
+
+/** Another name for MPIPE_LBL_INIT_DAT_BSTS_TBL_t. */
+typedef MPIPE_LBL_INIT_DAT_BSTS_TBL_t gxio_mpipe_bucket_info_t;
+
+
+
+/** Eight buffer stack ids. */
+typedef struct
+{
+  /** The stacks. */
+  uint8_t stacks[8];
+
+} gxio_mpipe_rules_stacks_t;
+
+
+/** A destination mac address. */
+typedef struct
+{
+  /** The octets. */
+  uint8_t octets[6];
+
+} gxio_mpipe_rules_dmac_t;
+
+
+/** A vlan. */
+typedef uint16_t gxio_mpipe_rules_vlan_t;
+
+
+
+/** Maximum number of characters in a link name. */
+#define GXIO_MPIPE_LINK_NAME_LEN  32
+
+
+/** Structure holding a link name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_LINK_NAME_LEN];
+}
+_gxio_mpipe_link_name_t;
+
+/** Maximum number of characters in a symbol name. */
+#define GXIO_MPIPE_SYMBOL_NAME_LEN  128
+
+
+/** Structure holding a symbol name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_SYMBOL_NAME_LEN];
+}
+_gxio_mpipe_symbol_name_t;
+
+
+/** Structure holding a MAC address. */
+typedef struct
+{
+  /** The address. */
+  uint8_t mac[6];
+}
+_gxio_mpipe_link_mac_t;
+
+
+
+/** Request shared data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  Other processes may also
+ *  request shared data permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_DATA               0x00000001UL
+
+/** Do not request data permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_DATA            0x00000002UL
+
+/** Request exclusive data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  No other processes may
+ *  request data permission on this link, and if any process already has
+ *  data permission on it, this open will fail.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_DATA          0x00000004UL
+
+/** Request shared stats permission -- that is, the ability to read and write
+ *  registers which contain link statistics, and to get link attributes --
+ *  on the specified link.  Other processes may also request shared stats
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_STATS              0x00000008UL
+
+/** Do not request stats permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_STATS           0x00000010UL
+
+/** Request exclusive stats permission -- that is, the ability to read and
+ *  write registers which contain link statistics, and to get link
+ *  attributes -- on the specified link.  No other processes may request
+ *  stats permission on this link, and if any process already
+ *  has stats permission on it, this open will fail.
+ *
+ *  Requesting exclusive stats permission is normally a very bad idea, since
+ *  it prevents programs like mpipe-stat from providing information on this
+ *  link.  Applications should only do this if they use MAC statistics
+ *  registers, and cannot tolerate any of the clear-on-read registers being
+ *  reset by other statistics programs.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_STATS         0x00000020UL
+
+/** Request shared control permission -- that is, the ability to modify link
+ *  attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  Other processes may also request shared control
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_CTL                0x00000040UL
+
+/** Do not request control permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_CTL             0x00000080UL
+
+/** Request exclusive control permission -- that is, the ability to modify
+ *  link attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  No other processes may request control permission on
+ *  this link, and if any process already has control permission on it,
+ *  this open will fail.
+ *
+ *  Requesting exclusive control permission is not always a good idea, since
+ *  it prevents programs like mpipe-link from configuring the link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_CTL           0x00000100UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; do not
+ *  change the desired state of the link when it is closed or the process
+ *  exits.  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UP            0x00000200UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; when the
+ *  link is closed or this process exits, if no other process has the link
+ *  open, set the desired state of the link to down.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specifed in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UPDOWN        0x00000400UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; when the link is closed or this process exits, if no other
+ *  process has the link open, set the desired state of the link to down.
+ *  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_DOWN          0x00000800UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; do not change the desired state of the link when it is
+ *  closed or the process exits.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specifed in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_NONE          0x00001000UL
+
+/** Request that this open call not complete until the network link is up.
+ *  The process will wait as long as necessary for this to happen;
+ *  applications which wish to abandon waiting for the link after a
+ *  specific time period should not specify this flag when opening a link,
+ *  but should instead call gxio_mpipe_link_wait() afterward.  The link
+ *  must be opened with stats permission.  Note that this flag by itself
+ *  does not change the desired link state; if other open flags or previous
+ *  link state changes have not requested a desired state of up, the open
+ *  call will never complete.  This flag is not available to kernel
+ *  clients.
+ */
+#define GXIO_MPIPE_LINK_WAIT               0x00002000UL
+
+
+/*
+ * Note: link attributes must fit in 24 bits, since we use the top 8 bits
+ * of the IORPC offset word for the channel number.
+ */
+
+/** Determine whether jumbo frames may be received.  If this attribute's
+ *  value value is nonzero, the MAC will accept frames of up to 10240 bytes.
+ *  If the value is zero, the MAC will only accept frames of up to 1544
+ *  bytes.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_JUMBO      0x010000
+
+/** Determine whether to send pause frames on this link if the mPIPE packet
+ *  FIFO is nearly full.  If the value is zero, pause frames are not sent.
+ *  If the value is nonzero, it is the delay value which will be sent in any
+ *  pause frames which are output, in units of 512 bit times.
+ *
+ *  Bear in mind that in almost all circumstances, the mPIPE packet FIFO
+ *  will never fill up, since mPIPE will empty it as fast as or faster than
+ *  the incoming data rate, by either delivering or dropping packets.  The
+ *  only situation in which this is not true is if the memory and cache
+ *  subsystem is extremely heavily loaded, and mPIPE cannot perform DMA of
+ *  packet data to memory in a timely fashion.  In particular, pause frames
+ *  will <em>not</em> be sent if packets cannot be delivered because
+ *  NotifRings are full, buckets are full, or buffers are not available in
+ *  a buffer stack. */
+#define GXIO_MPIPE_LINK_SEND_PAUSE         0x020000
+
+/** Determine whether to suspend output on the receipt of pause frames.
+ *  If the value is nonzero, mPIPE shim will suspend output on the link's
+ *  channel when a pause frame is received.  If the value is zero, pause
+ *  frames will be ignored.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_PAUSE      0x030000
+
+/** Interface MAC address.  The value is a 6-byte MAC address, in the least
+ *  significant 48 bits of the value; in other words, an address which would
+ *  be printed as '12:34:56:78:90:AB' in IEEE 802 canonical format would
+ *  be returned as 0x12345678ab.
+ *
+ *  Depending upon the overall system design, a MAC address may or may not
+ *  be available for each interface.  Note that the interface's MAC address
+ *  does not limit the packets received on its channel, although the
+ *  classifier's rules could be configured to do that.  Similarly, the MAC
+ *  address is not used when transmitting packets, although applications
+ *  could certainly decide to use the assigned address as a source MAC
+ *  address when doing so.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified.
+ */
+#define GXIO_MPIPE_LINK_MAC                0x040000
+
+/** Determine whether to discard egress packets on link down. If this value
+ *  is nonzero, packets sent on this link while the link is down will be
+ *  discarded.  If this value is zero, no packets will be sent on this link
+ *  while it is down.  The default value is one. */
+#define GXIO_MPIPE_LINK_DISCARD_IF_DOWN    0x050000
+
+/** Possible link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate link modes which are actually supported by
+ *  the hardware.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified. */
+#define GXIO_MPIPE_LINK_POSSIBLE_STATE     0x060000
+
+/** Current link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate the current state of the hardware.  If the
+ *  link is down, the value ANDed with ::GXIO_MPIPE_LINK_SPEED will be zero;
+ *  if the link is up, the value ANDed with ::GXIO_MPIPE_LINK_SPEED will
+ *  result in exactly one of the speed values, indicating the current speed.
+ *  This attribute may only be retrieved with gxio_mpipe_link_get_attr(); it
+ *  may not be modified. */
+#define GXIO_MPIPE_LINK_CURRENT_STATE      0x070000
+
+/** Desired link state. The value is a conbination of flags, which specify
+ *  the desired state for the link.  With gxio_mpipe_link_set_attr(), this
+ *  will, in the background, attempt to bring up the link using whichever of
+ *  the requested flags are reasonable, or take down the link if the flags
+ *  are zero.  The actual link up or down operation may happen after this
+ *  call completes.  If the link state changes in the future, the system
+ *  will continue to try to get back to the desired link state; for
+ *  instance, if the link is brought up successfully, and then the network
+ *  cable is disconnected, the link will go down.  However, the desired
+ *  state of the link is still up, so if the cable is reconnected, the link
+ *  will be brought up again.
+ *
+ *  With gxio_mpipe_link_set_attr(), this will indicate the desired state
+ *  for the link, as set with a previous gxio_mpipe_link_set_attr() call,
+ *  or implicitly by a gxio_mpipe_link_open() or link close operation.
+ *  This may not reflect the current state of the link; to get that, use
+ *  ::GXIO_MPIPE_LINK_CURRENT_STATE.
+ */
+#define GXIO_MPIPE_LINK_DESIRED_STATE      0x080000
+
+
+
+/** Link can run, should run, or is running at 10 Mbps. */
+#define GXIO_MPIPE_LINK_10M        0x0000000000000001UL
+
+/** Link can run, should run, or is running at 100 Mbps. */
+#define GXIO_MPIPE_LINK_100M       0x0000000000000002UL
+
+/** Link can run, should run, or is running at 1 Gbps. */
+#define GXIO_MPIPE_LINK_1G         0x0000000000000004UL
+
+/** Link can run, should run, or is running at 10 Gbps. */
+#define GXIO_MPIPE_LINK_10G        0x0000000000000008UL
+
+/** Link can run, should run, or is running at 20 Gbps. */
+#define GXIO_MPIPE_LINK_20G        0x0000000000000010UL
+
+/** Link can run, should run, or is running at 25 Gbps. */
+#define GXIO_MPIPE_LINK_25G        0x0000000000000020UL
+
+/** Link can run, should run, or is running at 50 Gbps. */
+#define GXIO_MPIPE_LINK_50G        0x0000000000000040UL
+
+/** Link should run at the highest speed supported by the link and by
+ *  the device connected to the link.  Only usable as a value for
+ *  the link's desired state; never returned as a value for the current
+ *  or possible states. */
+#define GXIO_MPIPE_LINK_ANYSPEED   0x0000000000000800UL
+
+/** All legal link speeds.  This value is provided for use in extracting
+ *  the speed-related subset of the link state flags; it is not intended
+ *  to be set directly as a value for one of the GXIO_MPIPE_LINK_xxx_STATE
+ *  attributes.  A link is up or is requested to be up if its current or
+ *  desired state, respectively, ANDED with this value, is nonzero. */
+#define GXIO_MPIPE_LINK_SPEED_MASK 0x0000000000000FFFUL
+
+/** Link can run, should run, or is running in MAC loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the Tile
+ *  Processor. */
+#define GXIO_MPIPE_LINK_LOOP_MAC   0x0000000000001000UL
+
+/** Link can run, should run, or is running in PHY loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the external
+ *  PHY chip. */
+#define GXIO_MPIPE_LINK_LOOP_PHY   0x0000000000002000UL
+
+/** Link can run, should run, or is running in external loopback mode.
+ *  This requires that an external loopback plug be installed on the
+ *  Ethernet port.  Note that only some links require that this be
+ *  configured via the gxio_mpipe_link routines; other links can do
+ *  external loopack with the plug and no special configuration. */
+#define GXIO_MPIPE_LINK_LOOP_EXT   0x0000000000004000UL
+
+/** All legal loopback types. */
+#define GXIO_MPIPE_LINK_LOOP_MASK  0x000000000000F000UL
+
+/** Link can run, should run, or is running in full-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_FDX        0x0000000000010000UL
+
+/** Link can run, should run, or is running in half-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_HDX        0x0000000000020000UL
+
+
+/** An individual rule. */
+typedef struct
+{
+  /** The total size. */
+  uint16_t size;
+
+  /** The priority. */
+  int16_t priority;
+
+  /** The "headroom" in each buffer. */
+  uint8_t headroom;
+
+  /** The "tailroom" in each buffer. */
+  uint8_t tailroom;
+
+  /** The "capacity" of the largest buffer. */
+  uint16_t capacity;
+
+  /** The mask for converting a flow hash into a bucket. */
+  uint16_t bucket_mask;
+
+  /** The offset for converting a flow hash into a bucket. */
+  uint16_t bucket_first;
+
+  /** The buffer stack ids. */
+  gxio_mpipe_rules_stacks_t stacks;
+
+  /** The actual channels. */
+  uint32_t channel_bits;
+
+  /** The number of dmacs. */
+  uint16_t num_dmacs;
+
+  /** The number of vlans. */
+  uint16_t num_vlans;
+
+  /** The actual dmacs and vlans. */
+  uint8_t dmacs_and_vlans[];
+
+} gxio_mpipe_rules_rule_t;
+
+
+/** A list of classifier rules. */
+typedef struct
+{
+  /** The offset to the end of the current rule. */
+  uint16_t tail;
+
+  /** The offset to the start of the current rule. */
+  uint16_t head;
+
+  /** The actual rules. */
+  uint8_t rules[4096 - 4];
+
+} gxio_mpipe_rules_list_t;
+
+
+
+
+/** mPIPE statistics structure. These counters include all relevant
+ *  events occurring on all links within the mPIPE shim. */
+typedef struct
+{
+  /** Number of ingress packets dropped for any reason. */
+  uint64_t ingress_drops;
+  /** Number of ingress packets dropped because a buffer stack was empty. */
+  uint64_t ingress_drops_no_buf;
+  /** Number of ingress packets dropped or truncated due to lack of space in
+   *  the iPkt buffer. */
+  uint64_t ingress_drops_ipkt;
+  /** Number of ingress packets dropped by the classifier or load balancer */
+  uint64_t ingress_drops_cls_lb;
+  /** Total number of ingress packets. */
+  uint64_t ingress_packets;
+  /** Total number of egress packets. */
+  uint64_t egress_packets;
+  /** Total number of ingress bytes. */
+  uint64_t ingress_bytes;
+  /** Total number of egress bytes. */
+  uint64_t egress_bytes;
+}
+gxio_mpipe_stats_t;
+
+
+#endif /* _SYS_HV_DRV_MPIPE_INTF_H */
diff --git a/arch/tile/include/hv/drv_mshim_intf.h b/arch/tile/include/hv/drv_mshim_intf.h
new file mode 100644
index 000000000..c6ef3bdc5
--- /dev/null
+++ b/arch/tile/include/hv/drv_mshim_intf.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drv_mshim_intf.h
+ * Interface definitions for the Linux EDAC memory controller driver.
+ */
+
+#ifndef _SYS_HV_INCLUDE_DRV_MSHIM_INTF_H
+#define _SYS_HV_INCLUDE_DRV_MSHIM_INTF_H
+
+/** Number of memory controllers in the public API. */
+#define TILE_MAX_MSHIMS 4
+
+/** Memory info under each memory controller. */
+struct mshim_mem_info
+{
+  uint64_t mem_size;     /**< Total memory size in bytes. */
+  uint8_t mem_type;      /**< Memory type, DDR2 or DDR3. */
+  uint8_t mem_ecc;       /**< Memory supports ECC. */
+};
+
+/**
+ * DIMM error structure.
+ * For now, only correctable errors are counted and the mshim doesn't record
+ * the error PA. HV takes panic upon uncorrectable errors.
+ */
+struct mshim_mem_error
+{
+  uint32_t sbe_count;     /**< Number of single-bit errors. */
+};
+
+/** Read this offset to get the memory info per mshim. */
+#define MSHIM_MEM_INFO_OFF 0x100
+
+/** Read this offset to check DIMM error. */
+#define MSHIM_MEM_ERROR_OFF 0x200
+
+#endif /* _SYS_HV_INCLUDE_DRV_MSHIM_INTF_H */
diff --git a/arch/tile/include/hv/drv_pcie_rc_intf.h b/arch/tile/include/hv/drv_pcie_rc_intf.h
new file mode 100644
index 000000000..9bd2243be
--- /dev/null
+++ b/arch/tile/include/hv/drv_pcie_rc_intf.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drv_pcie_rc_intf.h
+ * Interface definitions for the PCIE Root Complex.
+ */
+
+#ifndef _SYS_HV_DRV_PCIE_RC_INTF_H
+#define _SYS_HV_DRV_PCIE_RC_INTF_H
+
+/** File offset for reading the interrupt base number used for PCIE legacy
+    interrupts and PLX Gen 1 requirement flag */
+#define PCIE_RC_CONFIG_MASK_OFF 0
+
+
+/**
+ * Structure used for obtaining PCIe config information, read from the PCIE
+ * subsystem /ctl file at initialization
+ */
+typedef struct pcie_rc_config
+{
+  int intr;                     /**< interrupt number used for downcall */
+  int plx_gen1;                 /**< flag for PLX Gen 1 configuration */
+} pcie_rc_config_t;
+
+#endif  /* _SYS_HV_DRV_PCIE_RC_INTF_H */
diff --git a/arch/tile/include/hv/drv_srom_intf.h b/arch/tile/include/hv/drv_srom_intf.h
new file mode 100644
index 000000000..6395faa6d
--- /dev/null
+++ b/arch/tile/include/hv/drv_srom_intf.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drv_srom_intf.h
+ * Interface definitions for the SPI Flash ROM driver.
+ */
+
+#ifndef _SYS_HV_INCLUDE_DRV_SROM_INTF_H
+#define _SYS_HV_INCLUDE_DRV_SROM_INTF_H
+
+/** Read this offset to get the total device size. */
+#define SROM_TOTAL_SIZE_OFF   0xF0000000
+
+/** Read this offset to get the device sector size. */
+#define SROM_SECTOR_SIZE_OFF  0xF0000004
+
+/** Read this offset to get the device page size. */
+#define SROM_PAGE_SIZE_OFF    0xF0000008
+
+/** Write this offset to flush any pending writes. */
+#define SROM_FLUSH_OFF        0xF1000000
+
+/** Write this offset, plus the byte offset of the start of a sector, to
+ *  erase a sector.  Any write data is ignored, but there must be at least
+ *  one byte of write data.  Only applies when the driver is in MTD mode.
+ */
+#define SROM_ERASE_OFF        0xF2000000
+
+#endif /* _SYS_HV_INCLUDE_DRV_SROM_INTF_H */
diff --git a/arch/tile/include/hv/drv_trio_intf.h b/arch/tile/include/hv/drv_trio_intf.h
new file mode 100644
index 000000000..237e04dee
--- /dev/null
+++ b/arch/tile/include/hv/drv_trio_intf.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the trio driver.
+ */
+
+#ifndef _SYS_HV_DRV_TRIO_INTF_H
+#define _SYS_HV_DRV_TRIO_INTF_H
+
+#include <arch/trio.h>
+
+/** The vendor ID for all Tilera processors. */
+#define TILERA_VENDOR_ID 0x1a41
+
+/** The device ID for the Gx36 processor. */
+#define TILERA_GX36_DEV_ID 0x0200
+
+/** Device ID for our internal bridge when running as RC. */
+#define TILERA_GX36_RC_DEV_ID 0x2000
+
+/** Maximum number of TRIO interfaces. */
+#define TILEGX_NUM_TRIO         2
+
+/** Gx36 has max 3 PCIe MACs per TRIO interface. */
+#define TILEGX_TRIO_PCIES       3
+
+/** Specify port properties for a PCIe MAC. */
+struct pcie_port_property
+{
+  /** If true, the link can be configured in PCIe root complex mode. */
+  uint8_t allow_rc: 1;
+
+  /** If true, the link can be configured in PCIe endpoint mode. */
+  uint8_t allow_ep: 1;
+
+  /** If true, the link can be configured in StreamIO mode. */
+  uint8_t allow_sio: 1;
+
+  /** If true, the link is allowed to support 1-lane operation. Software
+   *  will not consider it an error if the link comes up as a x1 link. */
+  uint8_t allow_x1: 1;
+
+  /** If true, the link is allowed to support 2-lane operation. Software
+   *  will not consider it an error if the link comes up as a x2 link. */
+  uint8_t allow_x2: 1;
+
+  /** If true, the link is allowed to support 4-lane operation. Software
+   *  will not consider it an error if the link comes up as a x4 link. */
+  uint8_t allow_x4: 1;
+
+  /** If true, the link is allowed to support 8-lane operation. Software
+   *  will not consider it an error if the link comes up as a x8 link. */
+  uint8_t allow_x8: 1;
+
+  /** If true, this link is connected to a device which may or may not
+   *  be present. */
+  uint8_t removable: 1;
+
+};
+
+/** Configurations can be issued to configure a char stream interrupt. */
+typedef enum pcie_stream_intr_config_sel_e
+{
+  /** Interrupt configuration for memory map regions. */
+  MEM_MAP_SEL,
+
+  /** Interrupt configuration for push DMAs. */
+  PUSH_DMA_SEL,
+
+  /** Interrupt configuration for pull DMAs. */
+  PULL_DMA_SEL,
+}
+pcie_stream_intr_config_sel_t;
+
+
+/** The mmap file offset (PA) of the TRIO config region. */
+#define HV_TRIO_CONFIG_OFFSET                                        \
+  ((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_CFG <<   \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT)
+
+/** The maximum size of the TRIO config region. */
+#define HV_TRIO_CONFIG_SIZE                                 \
+  (1ULL << TRIO_CFG_REGION_ADDR__REGION_SHIFT)
+
+/** Size of the config region mapped into client. We can't use
+ *  TRIO_MMIO_ADDRESS_SPACE__OFFSET_WIDTH because it
+ *  will require the kernel to allocate 4GB VA space
+ *  from the VMALLOC region which has a total range
+ *  of 4GB.
+ */
+#define HV_TRIO_CONFIG_IOREMAP_SIZE                            \
+  ((uint64_t) 1 << TRIO_CFG_REGION_ADDR__PROT_SHIFT)
+
+/** The mmap file offset (PA) of a scatter queue region. */
+#define HV_TRIO_SQ_OFFSET(queue)                                        \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_MAP_SQ <<   \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((queue) << TRIO_MAP_SQ_REGION_ADDR__SQ_SEL_SHIFT))
+
+/** The maximum size of a scatter queue region. */
+#define HV_TRIO_SQ_SIZE                                 \
+  (1ULL << TRIO_MAP_SQ_REGION_ADDR__SQ_SEL_SHIFT)
+
+
+/** The "hardware MMIO region" of the first PIO region. */
+#define HV_TRIO_FIRST_PIO_REGION 8
+
+/** The mmap file offset (PA) of a PIO region. */
+#define HV_TRIO_PIO_OFFSET(region)                           \
+  (((unsigned long long)(region) + HV_TRIO_FIRST_PIO_REGION) \
+   << TRIO_PIO_REGIONS_ADDR__REGION_SHIFT)
+
+/** The maximum size of a PIO region. */
+#define HV_TRIO_PIO_SIZE (1ULL << TRIO_PIO_REGIONS_ADDR__ADDR_WIDTH)
+
+
+/** The mmap file offset (PA) of a push DMA region. */
+#define HV_TRIO_PUSH_DMA_OFFSET(ring)                                   \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_PUSH_DMA << \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((ring) << TRIO_PUSH_DMA_REGION_ADDR__RING_SEL_SHIFT))
+
+/** The mmap file offset (PA) of a pull DMA region. */
+#define HV_TRIO_PULL_DMA_OFFSET(ring)                                   \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_PULL_DMA << \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((ring) << TRIO_PULL_DMA_REGION_ADDR__RING_SEL_SHIFT))
+
+/** The maximum size of a DMA region. */
+#define HV_TRIO_DMA_REGION_SIZE                         \
+  (1ULL << TRIO_PUSH_DMA_REGION_ADDR__RING_SEL_SHIFT)
+
+
+/** The mmap file offset (PA) of a Mem-Map interrupt region. */
+#define HV_TRIO_MEM_MAP_INTR_OFFSET(map)                                 \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_MAP_MEM <<   \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((map) << TRIO_MAP_MEM_REGION_ADDR__MAP_SEL_SHIFT))
+
+/** The maximum size of a Mem-Map interrupt region. */
+#define HV_TRIO_MEM_MAP_INTR_SIZE                                 \
+  (1ULL << TRIO_MAP_MEM_REGION_ADDR__MAP_SEL_SHIFT)
+
+
+/** A flag bit indicating a fixed resource allocation. */
+#define HV_TRIO_ALLOC_FIXED 0x01
+
+/** TRIO requires that all mappings have 4kB aligned start addresses. */
+#define HV_TRIO_PAGE_SHIFT 12
+
+/** TRIO requires that all mappings have 4kB aligned start addresses. */
+#define HV_TRIO_PAGE_SIZE (1ull << HV_TRIO_PAGE_SHIFT)
+
+
+/* Specify all PCIe port properties for a TRIO. */
+struct pcie_trio_ports_property
+{
+  struct pcie_port_property ports[TILEGX_TRIO_PCIES];
+
+  /** Set if this TRIO belongs to a Gx72 device. */
+  uint8_t is_gx72;
+};
+
+/* Flags indicating traffic class. */
+#define HV_TRIO_FLAG_TC_SHIFT 4
+#define HV_TRIO_FLAG_TC_RMASK 0xf
+#define HV_TRIO_FLAG_TC(N) \
+  ((((N) & HV_TRIO_FLAG_TC_RMASK) + 1) << HV_TRIO_FLAG_TC_SHIFT)
+
+/* Flags indicating virtual functions. */
+#define HV_TRIO_FLAG_VFUNC_SHIFT 8
+#define HV_TRIO_FLAG_VFUNC_RMASK 0xff
+#define HV_TRIO_FLAG_VFUNC(N) \
+  ((((N) & HV_TRIO_FLAG_VFUNC_RMASK) + 1) << HV_TRIO_FLAG_VFUNC_SHIFT)
+
+
+/* Flag indicating an ordered PIO region. */
+#define HV_TRIO_PIO_FLAG_ORDERED (1 << 16)
+
+/* Flags indicating special types of PIO regions. */
+#define HV_TRIO_PIO_FLAG_SPACE_SHIFT 17
+#define HV_TRIO_PIO_FLAG_SPACE_MASK (0x3 << HV_TRIO_PIO_FLAG_SPACE_SHIFT)
+#define HV_TRIO_PIO_FLAG_CONFIG_SPACE (0x1 << HV_TRIO_PIO_FLAG_SPACE_SHIFT)
+#define HV_TRIO_PIO_FLAG_IO_SPACE (0x2 << HV_TRIO_PIO_FLAG_SPACE_SHIFT)
+
+
+#endif /* _SYS_HV_DRV_TRIO_INTF_H */
diff --git a/arch/tile/include/hv/drv_uart_intf.h b/arch/tile/include/hv/drv_uart_intf.h
new file mode 100644
index 000000000..f5379e240
--- /dev/null
+++ b/arch/tile/include/hv/drv_uart_intf.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the UART driver.
+ */
+
+#ifndef _SYS_HV_DRV_UART_INTF_H
+#define _SYS_HV_DRV_UART_INTF_H
+
+#include <arch/uart.h>
+
+/** Number of UART ports supported. */
+#define TILEGX_UART_NR        2
+
+/** The mmap file offset (PA) of the UART MMIO region. */
+#define HV_UART_MMIO_OFFSET   0
+
+/** The maximum size of the UARTs MMIO region (64K Bytes). */
+#define HV_UART_MMIO_SIZE     (1UL << 16)
+
+#endif /* _SYS_HV_DRV_UART_INTF_H */
diff --git a/arch/tile/include/hv/drv_usb_host_intf.h b/arch/tile/include/hv/drv_usb_host_intf.h
new file mode 100644
index 000000000..24ce774a3
--- /dev/null
+++ b/arch/tile/include/hv/drv_usb_host_intf.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the USB host driver.
+ */
+
+#ifndef _SYS_HV_DRV_USB_HOST_INTF_H
+#define _SYS_HV_DRV_USB_HOST_INTF_H
+
+#include <arch/usb_host.h>
+
+
+/** Offset for the EHCI register MMIO region. */
+#define HV_USB_HOST_MMIO_OFFSET_EHCI ((uint64_t) USB_HOST_HCCAPBASE_REG)
+
+/** Offset for the OHCI register MMIO region. */
+#define HV_USB_HOST_MMIO_OFFSET_OHCI ((uint64_t) USB_HOST_OHCD_HC_REVISION_REG)
+
+/** Size of the register MMIO region.  This turns out to be the same for
+ *  both EHCI and OHCI. */
+#define HV_USB_HOST_MMIO_SIZE ((uint64_t) 0x1000)
+
+/** The number of service domains supported by the USB host shim. */
+#define HV_USB_HOST_NUM_SVC_DOM 1
+
+
+#endif /* _SYS_HV_DRV_USB_HOST_INTF_H */
diff --git a/arch/tile/include/hv/drv_xgbe_impl.h b/arch/tile/include/hv/drv_xgbe_impl.h
new file mode 100644
index 000000000..3a73b2b44
--- /dev/null
+++ b/arch/tile/include/hv/drv_xgbe_impl.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drivers/xgbe/impl.h
+ * Implementation details for the NetIO library.
+ */
+
+#ifndef __DRV_XGBE_IMPL_H__
+#define __DRV_XGBE_IMPL_H__
+
+#include <hv/netio_errors.h>
+#include <hv/netio_intf.h>
+#include <hv/drv_xgbe_intf.h>
+
+
+/** How many groups we have (log2). */
+#define LOG2_NUM_GROUPS (12)
+/** How many groups we have. */
+#define NUM_GROUPS (1 << LOG2_NUM_GROUPS)
+
+/** Number of output requests we'll buffer per tile. */
+#define EPP_REQS_PER_TILE (32)
+
+/** Words used in an eDMA command without checksum acceleration. */
+#define EDMA_WDS_NO_CSUM      8
+/** Words used in an eDMA command with checksum acceleration. */
+#define EDMA_WDS_CSUM        10
+/** Total available words in the eDMA command FIFO. */
+#define EDMA_WDS_TOTAL      128
+
+
+/*
+ * FIXME: These definitions are internal and should have underscores!
+ * NOTE: The actual numeric values here are intentional and allow us to
+ * optimize the concept "if small ... else if large ... else ...", by
+ * checking for the low bit being set, and then for non-zero.
+ * These are used as array indices, so they must have the values (0, 1, 2)
+ * in some order.
+ */
+#define SIZE_SMALL (1)       /**< Small packet queue. */
+#define SIZE_LARGE (2)       /**< Large packet queue. */
+#define SIZE_JUMBO (0)       /**< Jumbo packet queue. */
+
+/** The number of "SIZE_xxx" values. */
+#define NETIO_NUM_SIZES 3
+
+
+/*
+ * Default numbers of packets for IPP drivers.  These values are chosen
+ * such that CIPP1 will not overflow its L2 cache.
+ */
+
+/** The default number of small packets. */
+#define NETIO_DEFAULT_SMALL_PACKETS 2750
+/** The default number of large packets. */
+#define NETIO_DEFAULT_LARGE_PACKETS 2500
+/** The default number of jumbo packets. */
+#define NETIO_DEFAULT_JUMBO_PACKETS 250
+
+
+/** Log2 of the size of a memory arena. */
+#define NETIO_ARENA_SHIFT      24      /* 16 MB */
+/** Size of a memory arena. */
+#define NETIO_ARENA_SIZE       (1 << NETIO_ARENA_SHIFT)
+
+
+/** A queue of packets.
+ *
+ * This structure partially defines a queue of packets waiting to be
+ * processed.  The queue as a whole is written to by an interrupt handler and
+ * read by non-interrupt code; this data structure is what's touched by the
+ * interrupt handler.  The other part of the queue state, the read offset, is
+ * kept in user space, not in hypervisor space, so it is in a separate data
+ * structure.
+ *
+ * The read offset (__packet_receive_read in the user part of the queue
+ * structure) points to the next packet to be read. When the read offset is
+ * equal to the write offset, the queue is empty; therefore the queue must
+ * contain one more slot than the required maximum queue size.
+ *
+ * Here's an example of all 3 state variables and what they mean.  All
+ * pointers move left to right.
+ *
+ * @code
+ *   I   I   V   V   V   V   I   I   I   I
+ *   0   1   2   3   4   5   6   7   8   9  10
+ *           ^       ^       ^               ^
+ *           |               |               |
+ *           |               |               __last_packet_plus_one
+ *           |               __buffer_write
+ *           __packet_receive_read
+ * @endcode
+ *
+ * This queue has 10 slots, and thus can hold 9 packets (_last_packet_plus_one
+ * = 10).  The read pointer is at 2, and the write pointer is at 6; thus,
+ * there are valid, unread packets in slots 2, 3, 4, and 5.  The remaining
+ * slots are invalid (do not contain a packet).
+ */
+typedef struct {
+  /** Byte offset of the next notify packet to be written: zero for the first
+   *  packet on the queue, sizeof (netio_pkt_t) for the second packet on the
+   *  queue, etc. */
+  volatile uint32_t __packet_write;
+
+  /** Offset of the packet after the last valid packet (i.e., when any
+   *  pointer is incremented to this value, it wraps back to zero). */
+  uint32_t __last_packet_plus_one;
+}
+__netio_packet_queue_t;
+
+
+/** A queue of buffers.
+ *
+ * This structure partially defines a queue of empty buffers which have been
+ * obtained via requests to the IPP.  (The elements of the queue are packet
+ * handles, which are transformed into a full netio_pkt_t when the buffer is
+ * retrieved.)  The queue as a whole is written to by an interrupt handler and
+ * read by non-interrupt code; this data structure is what's touched by the
+ * interrupt handler.  The other parts of the queue state, the read offset and
+ * requested write offset, are kept in user space, not in hypervisor space, so
+ * they are in a separate data structure.
+ *
+ * The read offset (__buffer_read in the user part of the queue structure)
+ * points to the next buffer to be read. When the read offset is equal to the
+ * write offset, the queue is empty; therefore the queue must contain one more
+ * slot than the required maximum queue size.
+ *
+ * The requested write offset (__buffer_requested_write in the user part of
+ * the queue structure) points to the slot which will hold the next buffer we
+ * request from the IPP, once we get around to sending such a request.  When
+ * the requested write offset is equal to the write offset, no requests for
+ * new buffers are outstanding; when the requested write offset is one greater
+ * than the read offset, no more requests may be sent.
+ *
+ * Note that, unlike the packet_queue, the buffer_queue places incoming
+ * buffers at decreasing addresses.  This makes the check for "is it time to
+ * wrap the buffer pointer" cheaper in the assembly code which receives new
+ * buffers, and means that the value which defines the queue size,
+ * __last_buffer, is different than in the packet queue.  Also, the offset
+ * used in the packet_queue is already scaled by the size of a packet; here we
+ * use unscaled slot indices for the offsets.  (These differences are
+ * historical, and in the future it's possible that the packet_queue will look
+ * more like this queue.)
+ *
+ * @code
+ * Here's an example of all 4 state variables and what they mean.  Remember:
+ * all pointers move right to left.
+ *
+ *   V   V   V   I   I   R   R   V   V   V
+ *   0   1   2   3   4   5   6   7   8   9
+ *           ^       ^       ^           ^
+ *           |       |       |           |
+ *           |       |       |           __last_buffer
+ *           |       |       __buffer_write
+ *           |       __buffer_requested_write
+ *           __buffer_read
+ * @endcode
+ *
+ * This queue has 10 slots, and thus can hold 9 buffers (_last_buffer = 9).
+ * The read pointer is at 2, and the write pointer is at 6; thus, there are
+ * valid, unread buffers in slots 2, 1, 0, 9, 8, and 7.  The requested write
+ * pointer is at 4; thus, requests have been made to the IPP for buffers which
+ * will be placed in slots 6 and 5 when they arrive.  Finally, the remaining
+ * slots are invalid (do not contain a buffer).
+ */
+typedef struct
+{
+  /** Ordinal number of the next buffer to be written: 0 for the first slot in
+   *  the queue, 1 for the second slot in the queue, etc. */
+  volatile uint32_t __buffer_write;
+
+  /** Ordinal number of the last buffer (i.e., when any pointer is decremented
+   *  below zero, it is reloaded with this value). */
+  uint32_t __last_buffer;
+}
+__netio_buffer_queue_t;
+
+
+/**
+ * An object for providing Ethernet packets to a process.
+ */
+typedef struct __netio_queue_impl_t
+{
+  /** The queue of packets waiting to be received. */
+  __netio_packet_queue_t __packet_receive_queue;
+  /** The intr bit mask that IDs this device. */
+  unsigned int __intr_id;
+  /** Offset to queues of empty buffers, one per size. */
+  uint32_t __buffer_queue[NETIO_NUM_SIZES];
+  /** The address of the first EPP tile, or -1 if no EPP. */
+  /* ISSUE: Actually this is always "0" or "~0". */
+  uint32_t __epp_location;
+  /** The queue ID that this queue represents. */
+  unsigned int __queue_id;
+  /** Number of acknowledgements received. */
+  volatile uint32_t __acks_received;
+  /** Last completion number received for packet_sendv. */
+  volatile uint32_t __last_completion_rcv;
+  /** Number of packets allowed to be outstanding. */
+  uint32_t __max_outstanding;
+  /** First VA available for packets. */
+  void* __va_0;
+  /** First VA in second range available for packets. */
+  void* __va_1;
+  /** Padding to align the "__packets" field to the size of a netio_pkt_t. */
+  uint32_t __padding[3];
+  /** The packets themselves. */
+  netio_pkt_t __packets[0];
+}
+netio_queue_impl_t;
+
+
+/**
+ * An object for managing the user end of a NetIO queue.
+ */
+typedef struct __netio_queue_user_impl_t
+{
+  /** The next incoming packet to be read. */
+  uint32_t __packet_receive_read;
+  /** The next empty buffers to be read, one index per size. */
+  uint8_t __buffer_read[NETIO_NUM_SIZES];
+  /** Where the empty buffer we next request from the IPP will go, one index
+   * per size. */
+  uint8_t __buffer_requested_write[NETIO_NUM_SIZES];
+  /** PCIe interface flag. */
+  uint8_t __pcie;
+  /** Number of packets left to be received before we send a credit update. */
+  uint32_t __receive_credit_remaining;
+  /** Value placed in __receive_credit_remaining when it reaches zero. */
+  uint32_t __receive_credit_interval;
+  /** First fast I/O routine index. */
+  uint32_t __fastio_index;
+  /** Number of acknowledgements expected. */
+  uint32_t __acks_outstanding;
+  /** Last completion number requested. */
+  uint32_t __last_completion_req;
+  /** File descriptor for driver. */
+  int __fd;
+}
+netio_queue_user_impl_t;
+
+
+#define NETIO_GROUP_CHUNK_SIZE   64   /**< Max # groups in one IPP request */
+#define NETIO_BUCKET_CHUNK_SIZE  64   /**< Max # buckets in one IPP request */
+
+
+/** Internal structure used to convey packet send information to the
+ * hypervisor.  FIXME: Actually, it's not used for that anymore, but
+ * netio_packet_send() still uses it internally.
+ */
+typedef struct
+{
+  uint16_t flags;              /**< Packet flags (__NETIO_SEND_FLG_xxx) */
+  uint16_t transfer_size;      /**< Size of packet */
+  uint32_t va;                 /**< VA of start of packet */
+  __netio_pkt_handle_t handle; /**< Packet handle */
+  uint32_t csum0;              /**< First checksum word */
+  uint32_t csum1;              /**< Second checksum word */
+}
+__netio_send_cmd_t;
+
+
+/** Flags used in two contexts:
+ *  - As the "flags" member in the __netio_send_cmd_t, above; used only
+ *    for netio_pkt_send_{prepare,commit}.
+ *  - As part of the flags passed to the various send packet fast I/O calls.
+ */
+
+/** Need acknowledgement on this packet.  Note that some code in the
+ *  normal send_pkt fast I/O handler assumes that this is equal to 1. */
+#define __NETIO_SEND_FLG_ACK    0x1
+
+/** Do checksum on this packet.  (Only used with the __netio_send_cmd_t;
+ *  normal packet sends use a special fast I/O index to denote checksumming,
+ *  and multi-segment sends test the checksum descriptor.) */
+#define __NETIO_SEND_FLG_CSUM   0x2
+
+/** Get a completion on this packet.  Only used with multi-segment sends.  */
+#define __NETIO_SEND_FLG_COMPLETION 0x4
+
+/** Position of the number-of-extra-segments value in the flags word.
+    Only used with multi-segment sends. */
+#define __NETIO_SEND_FLG_XSEG_SHIFT 3
+
+/** Width of the number-of-extra-segments value in the flags word. */
+#define __NETIO_SEND_FLG_XSEG_WIDTH 2
+
+#endif /* __DRV_XGBE_IMPL_H__ */
diff --git a/arch/tile/include/hv/drv_xgbe_intf.h b/arch/tile/include/hv/drv_xgbe_intf.h
new file mode 100644
index 000000000..2a20b266d
--- /dev/null
+++ b/arch/tile/include/hv/drv_xgbe_intf.h
@@ -0,0 +1,615 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drv_xgbe_intf.h
+ * Interface to the hypervisor XGBE driver.
+ */
+
+#ifndef __DRV_XGBE_INTF_H__
+#define __DRV_XGBE_INTF_H__
+
+/**
+ * An object for forwarding VAs and PAs to the hypervisor.
+ * @ingroup types
+ *
+ * This allows the supervisor to specify a number of areas of memory to
+ * store packet buffers.
+ */
+typedef struct
+{
+  /** The physical address of the memory. */
+  HV_PhysAddr pa;
+  /** Page table entry for the memory.  This is only used to derive the
+   *  memory's caching mode; the PA bits are ignored. */
+  HV_PTE pte;
+  /** The virtual address of the memory. */
+  HV_VirtAddr va;
+  /** Size (in bytes) of the memory area. */
+  int size;
+
+}
+netio_ipp_address_t;
+
+/** The various pread/pwrite offsets into the hypervisor-level driver.
+ * @ingroup types
+ */
+typedef enum
+{
+  /** Inform the Linux driver of the address of the NetIO arena memory.
+   *  This offset is actually only used to convey information from netio
+   *  to the Linux driver; it never makes it from there to the hypervisor.
+   *  Write-only; takes a uint32_t specifying the VA address. */
+  NETIO_FIXED_ADDR               = 0x5000000000000000ULL,
+
+  /** Inform the Linux driver of the size of the NetIO arena memory.
+   *  This offset is actually only used to convey information from netio
+   *  to the Linux driver; it never makes it from there to the hypervisor.
+   *  Write-only; takes a uint32_t specifying the VA size. */
+  NETIO_FIXED_SIZE               = 0x5100000000000000ULL,
+
+  /** Register current tile with IPP.  Write then read: write, takes a
+   *  netio_input_config_t, read returns a pointer to a netio_queue_impl_t. */
+  NETIO_IPP_INPUT_REGISTER_OFF   = 0x6000000000000000ULL,
+
+  /** Unregister current tile from IPP.  Write-only, takes a dummy argument. */
+  NETIO_IPP_INPUT_UNREGISTER_OFF = 0x6100000000000000ULL,
+
+  /** Start packets flowing.  Write-only, takes a dummy argument. */
+  NETIO_IPP_INPUT_INIT_OFF       = 0x6200000000000000ULL,
+
+  /** Stop packets flowing.  Write-only, takes a dummy argument. */
+  NETIO_IPP_INPUT_UNINIT_OFF     = 0x6300000000000000ULL,
+
+  /** Configure group (typically we group on VLAN).  Write-only: takes an
+   *  array of netio_group_t's, low 24 bits of the offset is the base group
+   *  number times the size of a netio_group_t. */
+  NETIO_IPP_INPUT_GROUP_CFG_OFF  = 0x6400000000000000ULL,
+
+  /** Configure bucket.  Write-only: takes an array of netio_bucket_t's, low
+   *  24 bits of the offset is the base bucket number times the size of a
+   *  netio_bucket_t. */
+  NETIO_IPP_INPUT_BUCKET_CFG_OFF = 0x6500000000000000ULL,
+
+  /** Get/set a parameter.  Read or write: read or write data is the parameter
+   *  value, low 32 bits of the offset is a __netio_getset_offset_t. */
+  NETIO_IPP_PARAM_OFF            = 0x6600000000000000ULL,
+
+  /** Get fast I/O index.  Read-only; returns a 4-byte base index value. */
+  NETIO_IPP_GET_FASTIO_OFF       = 0x6700000000000000ULL,
+
+  /** Configure hijack IP address.  Packets with this IPv4 dest address
+   *  go to bucket NETIO_NUM_BUCKETS - 1.  Write-only: takes an IP address
+   *  in some standard form.  FIXME: Define the form! */
+  NETIO_IPP_INPUT_HIJACK_CFG_OFF  = 0x6800000000000000ULL,
+
+  /**
+   * Offsets beyond this point are reserved for the supervisor (although that
+   * enforcement must be done by the supervisor driver itself).
+   */
+  NETIO_IPP_USER_MAX_OFF         = 0x6FFFFFFFFFFFFFFFULL,
+
+  /** Register I/O memory.  Write-only, takes a netio_ipp_address_t. */
+  NETIO_IPP_IOMEM_REGISTER_OFF   = 0x7000000000000000ULL,
+
+  /** Unregister I/O memory.  Write-only, takes a netio_ipp_address_t. */
+  NETIO_IPP_IOMEM_UNREGISTER_OFF = 0x7100000000000000ULL,
+
+  /* Offsets greater than 0x7FFFFFFF can't be used directly from Linux
+   * userspace code due to limitations in the pread/pwrite syscalls. */
+
+  /** Drain LIPP buffers. */
+  NETIO_IPP_DRAIN_OFF              = 0xFA00000000000000ULL,
+
+  /** Supply a netio_ipp_address_t to be used as shared memory for the
+   *  LEPP command queue. */
+  NETIO_EPP_SHM_OFF              = 0xFB00000000000000ULL,
+
+  /* 0xFC... is currently unused. */
+
+  /** Stop IPP/EPP tiles.  Write-only, takes a dummy argument.  */
+  NETIO_IPP_STOP_SHIM_OFF        = 0xFD00000000000000ULL,
+
+  /** Start IPP/EPP tiles.  Write-only, takes a dummy argument.  */
+  NETIO_IPP_START_SHIM_OFF       = 0xFE00000000000000ULL,
+
+  /** Supply packet arena.  Write-only, takes an array of
+    * netio_ipp_address_t values. */
+  NETIO_IPP_ADDRESS_OFF          = 0xFF00000000000000ULL,
+} netio_hv_offset_t;
+
+/** Extract the base offset from an offset */
+#define NETIO_BASE_OFFSET(off)    ((off) & 0xFF00000000000000ULL)
+/** Extract the local offset from an offset */
+#define NETIO_LOCAL_OFFSET(off)   ((off) & 0x00FFFFFFFFFFFFFFULL)
+
+
+/**
+ * Get/set offset.
+ */
+typedef union
+{
+  struct
+  {
+    uint64_t addr:48;        /**< Class-specific address */
+    unsigned int class:8;    /**< Class (e.g., NETIO_PARAM) */
+    unsigned int opcode:8;   /**< High 8 bits of NETIO_IPP_PARAM_OFF */
+  }
+  bits;                      /**< Bitfields */
+  uint64_t word;             /**< Aggregated value to use as the offset */
+}
+__netio_getset_offset_t;
+
+/**
+ * Fast I/O index offsets (must be contiguous).
+ */
+typedef enum
+{
+  NETIO_FASTIO_ALLOCATE         = 0, /**< Get empty packet buffer */
+  NETIO_FASTIO_FREE_BUFFER      = 1, /**< Give buffer back to IPP */
+  NETIO_FASTIO_RETURN_CREDITS   = 2, /**< Give credits to IPP */
+  NETIO_FASTIO_SEND_PKT_NOCK    = 3, /**< Send a packet, no checksum */
+  NETIO_FASTIO_SEND_PKT_CK      = 4, /**< Send a packet, with checksum */
+  NETIO_FASTIO_SEND_PKT_VEC     = 5, /**< Send a vector of packets */
+  NETIO_FASTIO_SENDV_PKT        = 6, /**< Sendv one packet */
+  NETIO_FASTIO_NUM_INDEX        = 7, /**< Total number of fast I/O indices */
+} netio_fastio_index_t;
+
+/** 3-word return type for Fast I/O call. */
+typedef struct
+{
+  int err;            /**< Error code. */
+  uint32_t val0;      /**< Value.  Meaning depends upon the specific call. */
+  uint32_t val1;      /**< Value.  Meaning depends upon the specific call. */
+} netio_fastio_rv3_t;
+
+/** 0-argument fast I/O call */
+int __netio_fastio0(uint32_t fastio_index);
+/** 1-argument fast I/O call */
+int __netio_fastio1(uint32_t fastio_index, uint32_t arg0);
+/** 3-argument fast I/O call, 2-word return value */
+netio_fastio_rv3_t __netio_fastio3_rv3(uint32_t fastio_index, uint32_t arg0,
+                                       uint32_t arg1, uint32_t arg2);
+/** 4-argument fast I/O call */
+int __netio_fastio4(uint32_t fastio_index, uint32_t arg0, uint32_t arg1,
+                    uint32_t arg2, uint32_t arg3);
+/** 6-argument fast I/O call */
+int __netio_fastio6(uint32_t fastio_index, uint32_t arg0, uint32_t arg1,
+                    uint32_t arg2, uint32_t arg3, uint32_t arg4, uint32_t arg5);
+/** 9-argument fast I/O call */
+int __netio_fastio9(uint32_t fastio_index, uint32_t arg0, uint32_t arg1,
+                    uint32_t arg2, uint32_t arg3, uint32_t arg4, uint32_t arg5,
+                    uint32_t arg6, uint32_t arg7, uint32_t arg8);
+
+/** Allocate an empty packet.
+ * @param fastio_index Fast I/O index.
+ * @param size Size of the packet to allocate.
+ */
+#define __netio_fastio_allocate(fastio_index, size) \
+  __netio_fastio1((fastio_index) + NETIO_FASTIO_ALLOCATE, size)
+
+/** Free a buffer.
+ * @param fastio_index Fast I/O index.
+ * @param handle Handle for the packet to free.
+ */
+#define __netio_fastio_free_buffer(fastio_index, handle) \
+  __netio_fastio1((fastio_index) + NETIO_FASTIO_FREE_BUFFER, handle)
+
+/** Increment our receive credits.
+ * @param fastio_index Fast I/O index.
+ * @param credits Number of credits to add.
+ */
+#define __netio_fastio_return_credits(fastio_index, credits) \
+  __netio_fastio1((fastio_index) + NETIO_FASTIO_RETURN_CREDITS, credits)
+
+/** Send packet, no checksum.
+ * @param fastio_index Fast I/O index.
+ * @param ackflag Nonzero if we want an ack.
+ * @param size Size of the packet.
+ * @param va Virtual address of start of packet.
+ * @param handle Packet handle.
+ */
+#define __netio_fastio_send_pkt_nock(fastio_index, ackflag, size, va, handle) \
+  __netio_fastio4((fastio_index) + NETIO_FASTIO_SEND_PKT_NOCK, ackflag, \
+                  size, va, handle)
+
+/** Send packet, calculate checksum.
+ * @param fastio_index Fast I/O index.
+ * @param ackflag Nonzero if we want an ack.
+ * @param size Size of the packet.
+ * @param va Virtual address of start of packet.
+ * @param handle Packet handle.
+ * @param csum0 Shim checksum header.
+ * @param csum1 Checksum seed.
+ */
+#define __netio_fastio_send_pkt_ck(fastio_index, ackflag, size, va, handle, \
+                                   csum0, csum1) \
+  __netio_fastio6((fastio_index) + NETIO_FASTIO_SEND_PKT_CK, ackflag, \
+                  size, va, handle, csum0, csum1)
+
+
+/** Format for the "csum0" argument to the __netio_fastio_send routines
+ * and LEPP.  Note that this is currently exactly identical to the
+ * ShimProtocolOffloadHeader.
+ */
+typedef union
+{
+  struct
+  {
+    unsigned int start_byte:7;       /**< The first byte to be checksummed */
+    unsigned int count:14;           /**< Number of bytes to be checksummed. */
+    unsigned int destination_byte:7; /**< The byte to write the checksum to. */
+    unsigned int reserved:4;         /**< Reserved. */
+  } bits;                            /**< Decomposed method of access. */
+  unsigned int word;                 /**< To send out the IDN. */
+} __netio_checksum_header_t;
+
+
+/** Sendv packet with 1 or 2 segments.
+ * @param fastio_index Fast I/O index.
+ * @param flags Ack/csum/notify flags in low 3 bits; number of segments minus
+ *        1 in next 2 bits; expected checksum in high 16 bits.
+ * @param confno Confirmation number to request, if notify flag set.
+ * @param csum0 Checksum descriptor; if zero, no checksum.
+ * @param va_F Virtual address of first segment.
+ * @param va_L Virtual address of last segment, if 2 segments.
+ * @param len_F_L Length of first segment in low 16 bits; length of last
+ *        segment, if 2 segments, in high 16 bits.
+ */
+#define __netio_fastio_sendv_pkt_1_2(fastio_index, flags, confno, csum0, \
+                                     va_F, va_L, len_F_L) \
+  __netio_fastio6((fastio_index) + NETIO_FASTIO_SENDV_PKT, flags, confno, \
+                  csum0, va_F, va_L, len_F_L)
+
+/** Send packet on PCIe interface.
+ * @param fastio_index Fast I/O index.
+ * @param flags Ack/csum/notify flags in low 3 bits.
+ * @param confno Confirmation number to request, if notify flag set.
+ * @param csum0 Checksum descriptor; Hard wired 0, not needed for PCIe.
+ * @param va_F Virtual address of the packet buffer.
+ * @param va_L Virtual address of last segment, if 2 segments. Hard wired 0.
+ * @param len_F_L Length of the packet buffer in low 16 bits.
+ */
+#define __netio_fastio_send_pcie_pkt(fastio_index, flags, confno, csum0, \
+                                     va_F, va_L, len_F_L) \
+  __netio_fastio6((fastio_index) + PCIE_FASTIO_SENDV_PKT, flags, confno, \
+                  csum0, va_F, va_L, len_F_L)
+
+/** Sendv packet with 3 or 4 segments.
+ * @param fastio_index Fast I/O index.
+ * @param flags Ack/csum/notify flags in low 3 bits; number of segments minus
+ *        1 in next 2 bits; expected checksum in high 16 bits.
+ * @param confno Confirmation number to request, if notify flag set.
+ * @param csum0 Checksum descriptor; if zero, no checksum.
+ * @param va_F Virtual address of first segment.
+ * @param va_L Virtual address of last segment (third segment if 3 segments,
+ *        fourth segment if 4 segments).
+ * @param len_F_L Length of first segment in low 16 bits; length of last
+ *        segment in high 16 bits.
+ * @param va_M0 Virtual address of "middle 0" segment; this segment is sent
+ *        second when there are three segments, and third if there are four.
+ * @param va_M1 Virtual address of "middle 1" segment; this segment is sent
+ *        second when there are four segments.
+ * @param len_M0_M1 Length of middle 0 segment in low 16 bits; length of middle
+ *        1 segment, if 4 segments, in high 16 bits.
+ */
+#define __netio_fastio_sendv_pkt_3_4(fastio_index, flags, confno, csum0, va_F, \
+                                     va_L, len_F_L, va_M0, va_M1, len_M0_M1) \
+  __netio_fastio9((fastio_index) + NETIO_FASTIO_SENDV_PKT, flags, confno, \
+                  csum0, va_F, va_L, len_F_L, va_M0, va_M1, len_M0_M1)
+
+/** Send vector of packets.
+ * @param fastio_index Fast I/O index.
+ * @param seqno Number of packets transmitted so far on this interface;
+ *        used to decide which packets should be acknowledged.
+ * @param nentries Number of entries in vector.
+ * @param va Virtual address of start of vector entry array.
+ * @return 3-word netio_fastio_rv3_t structure.  The structure's err member
+ *         is an error code, or zero if no error.  The val0 member is the
+ *         updated value of seqno; it has been incremented by 1 for each
+ *         packet sent.  That increment may be less than nentries if an
+ *         error occurred, or if some of the entries in the vector contain
+ *         handles equal to NETIO_PKT_HANDLE_NONE.  The val1 member is the
+ *         updated value of nentries; it has been decremented by 1 for each
+ *         vector entry processed.  Again, that decrement may be less than
+ *         nentries (leaving the returned value positive) if an error
+ *         occurred.
+ */
+#define __netio_fastio_send_pkt_vec(fastio_index, seqno, nentries, va) \
+  __netio_fastio3_rv3((fastio_index) + NETIO_FASTIO_SEND_PKT_VEC, seqno, \
+                      nentries, va)
+
+
+/** An egress DMA command for LEPP. */
+typedef struct
+{
+  /** Is this a TSO transfer?
+   *
+   * NOTE: This field is always 0, to distinguish it from
+   * lepp_tso_cmd_t.  It must come first!
+   */
+  uint8_t tso               : 1;
+
+  /** Unused padding bits. */
+  uint8_t _unused           : 3;
+
+  /** Should this packet be sent directly from caches instead of DRAM,
+   * using hash-for-home to locate the packet data?
+   */
+  uint8_t hash_for_home     : 1;
+
+  /** Should we compute a checksum? */
+  uint8_t compute_checksum  : 1;
+
+  /** Is this the final buffer for this packet?
+   *
+   * A single packet can be split over several input buffers (a "gather"
+   * operation).  This flag indicates that this is the last buffer
+   * in a packet.
+   */
+  uint8_t end_of_packet     : 1;
+
+  /** Should LEPP advance 'comp_busy' when this DMA is fully finished? */
+  uint8_t send_completion   : 1;
+
+  /** High bits of Client Physical Address of the start of the buffer
+   *  to be egressed.
+   *
+   *  NOTE: Only 6 bits are actually needed here, as CPAs are
+   *  currently 38 bits.  So two bits could be scavenged from this.
+   */
+  uint8_t cpa_hi;
+
+  /** The number of bytes to be egressed. */
+  uint16_t length;
+
+  /** Low 32 bits of Client Physical Address of the start of the buffer
+   *  to be egressed.
+   */
+  uint32_t cpa_lo;
+
+  /** Checksum information (only used if 'compute_checksum'). */
+  __netio_checksum_header_t checksum_data;
+
+} lepp_cmd_t;
+
+
+/** A chunk of physical memory for a TSO egress. */
+typedef struct
+{
+  /** The low bits of the CPA. */
+  uint32_t cpa_lo;
+  /** The high bits of the CPA. */
+  uint16_t cpa_hi		: 15;
+  /** Should this packet be sent directly from caches instead of DRAM,
+   *  using hash-for-home to locate the packet data?
+   */
+  uint16_t hash_for_home	: 1;
+  /** The length in bytes. */
+  uint16_t length;
+} lepp_frag_t;
+
+
+/** An LEPP command that handles TSO. */
+typedef struct
+{
+  /** Is this a TSO transfer?
+   *
+   *  NOTE: This field is always 1, to distinguish it from
+   *  lepp_cmd_t.  It must come first!
+   */
+  uint8_t tso             : 1;
+
+  /** Unused padding bits. */
+  uint8_t _unused         : 7;
+
+  /** Size of the header[] array in bytes.  It must be in the range
+   *  [40, 127], which are the smallest header for a TCP packet over
+   *  Ethernet and the maximum possible prepend size supported by
+   *  hardware, respectively.  Note that the array storage must be
+   *  padded out to a multiple of four bytes so that the following
+   *  LEPP command is aligned properly.
+   */
+  uint8_t header_size;
+
+  /** Byte offset of the IP header in header[]. */
+  uint8_t ip_offset;
+
+  /** Byte offset of the TCP header in header[]. */
+  uint8_t tcp_offset;
+
+  /** The number of bytes to use for the payload of each packet,
+   *  except of course the last one, which may not have enough bytes.
+   *  This means that each Ethernet packet except the last will have a
+   *  size of header_size + payload_size.
+   */
+  uint16_t payload_size;
+
+  /** The length of the 'frags' array that follows this struct. */
+  uint16_t num_frags;
+
+  /** The actual frags. */
+  lepp_frag_t frags[0 /* Variable-sized; num_frags entries. */];
+
+  /*
+   * The packet header template logically follows frags[],
+   * but you can't declare that in C.
+   *
+   * uint32_t header[header_size_in_words_rounded_up];
+   */
+
+} lepp_tso_cmd_t;
+
+
+/** An LEPP completion ring entry. */
+typedef void* lepp_comp_t;
+
+
+/** Maximum number of frags for one TSO command.  This is adapted from
+ *  linux's "MAX_SKB_FRAGS", and presumably over-estimates by one, for
+ *  our page size of exactly 65536.  We add one for a "body" fragment.
+ */
+#define LEPP_MAX_FRAGS (65536 / HV_DEFAULT_PAGE_SIZE_SMALL + 2 + 1)
+
+/** Total number of bytes needed for an lepp_tso_cmd_t. */
+#define LEPP_TSO_CMD_SIZE(num_frags, header_size) \
+  (sizeof(lepp_tso_cmd_t) + \
+   (num_frags) * sizeof(lepp_frag_t) + \
+   (((header_size) + 3) & -4))
+
+/** The size of the lepp "cmd" queue. */
+#define LEPP_CMD_QUEUE_BYTES \
+ (((CHIP_L2_CACHE_SIZE() - 2 * CHIP_L2_LINE_SIZE()) / \
+  (sizeof(lepp_cmd_t) + sizeof(lepp_comp_t))) * sizeof(lepp_cmd_t))
+
+/** The largest possible command that can go in lepp_queue_t::cmds[]. */
+#define LEPP_MAX_CMD_SIZE LEPP_TSO_CMD_SIZE(LEPP_MAX_FRAGS, 128)
+
+/** The largest possible value of lepp_queue_t::cmd_{head, tail} (inclusive).
+ */
+#define LEPP_CMD_LIMIT \
+  (LEPP_CMD_QUEUE_BYTES - LEPP_MAX_CMD_SIZE)
+
+/** The maximum number of completions in an LEPP queue. */
+#define LEPP_COMP_QUEUE_SIZE \
+  ((LEPP_CMD_LIMIT + sizeof(lepp_cmd_t) - 1) / sizeof(lepp_cmd_t))
+
+/** Increment an index modulo the queue size. */
+#define LEPP_QINC(var) \
+  (var = __insn_mnz(var - (LEPP_COMP_QUEUE_SIZE - 1), var + 1))
+
+/** A queue used to convey egress commands from the client to LEPP. */
+typedef struct
+{
+  /** Index of first completion not yet processed by user code.
+   *  If this is equal to comp_busy, there are no such completions.
+   *
+   *  NOTE: This is only read/written by the user.
+   */
+  unsigned int comp_head;
+
+  /** Index of first completion record not yet completed.
+   *  If this is equal to comp_tail, there are no such completions.
+   *  This index gets advanced (modulo LEPP_QUEUE_SIZE) whenever
+   *  a command with the 'completion' bit set is finished.
+   *
+   *  NOTE: This is only written by LEPP, only read by the user.
+   */
+  volatile unsigned int comp_busy;
+
+  /** Index of the first empty slot in the completion ring.
+   *  Entries from this up to but not including comp_head (in ring order)
+   *  can be filled in with completion data.
+   *
+   *  NOTE: This is only read/written by the user.
+   */
+  unsigned int comp_tail;
+
+  /** Byte index of first command enqueued for LEPP but not yet processed.
+   *
+   *  This is always divisible by sizeof(void*) and always <= LEPP_CMD_LIMIT.
+   *
+   *  NOTE: LEPP advances this counter as soon as it no longer needs
+   *  the cmds[] storage for this entry, but the transfer is not actually
+   *  complete (i.e. the buffer pointed to by the command is no longer
+   *  needed) until comp_busy advances.
+   *
+   *  If this is equal to cmd_tail, the ring is empty.
+   *
+   *  NOTE: This is only written by LEPP, only read by the user.
+   */
+  volatile unsigned int cmd_head;
+
+  /** Byte index of first empty slot in the command ring.  This field can
+   *  be incremented up to but not equal to cmd_head (because that would
+   *  mean the ring is empty).
+   *
+   *  This is always divisible by sizeof(void*) and always <= LEPP_CMD_LIMIT.
+   *
+   *  NOTE: This is read/written by the user, only read by LEPP.
+   */
+  volatile unsigned int cmd_tail;
+
+  /** A ring of variable-sized egress DMA commands.
+   *
+   *  NOTE: Only written by the user, only read by LEPP.
+   */
+  char cmds[LEPP_CMD_QUEUE_BYTES]
+    __attribute__((aligned(CHIP_L2_LINE_SIZE())));
+
+  /** A ring of user completion data.
+   *  NOTE: Only read/written by the user.
+   */
+  lepp_comp_t comps[LEPP_COMP_QUEUE_SIZE]
+    __attribute__((aligned(CHIP_L2_LINE_SIZE())));
+} lepp_queue_t;
+
+
+/** An internal helper function for determining the number of entries
+ *  available in a ring buffer, given that there is one sentinel.
+ */
+static inline unsigned int
+_lepp_num_free_slots(unsigned int head, unsigned int tail)
+{
+  /*
+   * One entry is reserved for use as a sentinel, to distinguish
+   * "empty" from "full".  So we compute
+   * (head - tail - 1) % LEPP_QUEUE_SIZE, but without using a slow % operation.
+   */
+  return (head - tail - 1) + ((head <= tail) ? LEPP_COMP_QUEUE_SIZE : 0);
+}
+
+
+/** Returns how many new comp entries can be enqueued. */
+static inline unsigned int
+lepp_num_free_comp_slots(const lepp_queue_t* q)
+{
+  return _lepp_num_free_slots(q->comp_head, q->comp_tail);
+}
+
+static inline int
+lepp_qsub(int v1, int v2)
+{
+  int delta = v1 - v2;
+  return delta + ((delta >> 31) & LEPP_COMP_QUEUE_SIZE);
+}
+
+
+/** FIXME: Check this from linux, via a new "pwrite()" call. */
+#define LIPP_VERSION 1
+
+
+/** We use exactly two bytes of alignment padding. */
+#define LIPP_PACKET_PADDING 2
+
+/** The minimum size of a "small" buffer (including the padding). */
+#define LIPP_SMALL_PACKET_SIZE 128
+
+/*
+ * NOTE: The following two values should total to less than around
+ * 13582, to keep the total size used for "lipp_state_t" below 64K.
+ */
+
+/** The maximum number of "small" buffers.
+ *  This is enough for 53 network cpus with 128 credits.  Note that
+ *  if these are exhausted, we will fall back to using large buffers.
+ */
+#define LIPP_SMALL_BUFFERS 6785
+
+/** The maximum number of "large" buffers.
+ *  This is enough for 53 network cpus with 128 credits.
+ */
+#define LIPP_LARGE_BUFFERS 6785
+
+#endif /* __DRV_XGBE_INTF_H__ */
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
new file mode 100644
index 000000000..e0e6af4e7
--- /dev/null
+++ b/arch/tile/include/hv/hypervisor.h
@@ -0,0 +1,2598 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file hypervisor.h
+ * The hypervisor's public API.
+ */
+
+#ifndef _HV_HV_H
+#define _HV_HV_H
+
+#include <arch/chip.h>
+
+/* Linux builds want unsigned long constants, but assembler wants numbers */
+#ifdef __ASSEMBLER__
+/** One, for assembler */
+#define __HV_SIZE_ONE 1
+#elif !defined(__tile__) && CHIP_VA_WIDTH() > 32
+/** One, for 64-bit on host */
+#define __HV_SIZE_ONE 1ULL
+#else
+/** One, for Linux */
+#define __HV_SIZE_ONE 1UL
+#endif
+
+/** The log2 of the span of a level-1 page table, in bytes.
+ */
+#define HV_LOG2_L1_SPAN 32
+
+/** The span of a level-1 page table, in bytes.
+ */
+#define HV_L1_SPAN (__HV_SIZE_ONE << HV_LOG2_L1_SPAN)
+
+/** The log2 of the initial size of small pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_SMALL.
+ */
+#define HV_LOG2_DEFAULT_PAGE_SIZE_SMALL 16
+
+/** The initial size of small pages, in bytes. This value should be verified
+ * at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL).
+ * It may also be modified when installing a new context.
+ */
+#define HV_DEFAULT_PAGE_SIZE_SMALL \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_SMALL)
+
+/** The log2 of the initial size of large pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_LARGE.
+ */
+#define HV_LOG2_DEFAULT_PAGE_SIZE_LARGE 24
+
+/** The initial size of large pages, in bytes. This value should be verified
+ * at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE).
+ * It may also be modified when installing a new context.
+ */
+#define HV_DEFAULT_PAGE_SIZE_LARGE \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_LARGE)
+
+#if CHIP_VA_WIDTH() > 32
+
+/** The log2 of the initial size of jumbo pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_JUMBO.
+ */
+#define HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO 32
+
+/** The initial size of jumbo pages, in bytes. This value should
+ * be verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO).
+ * It may also be modified when installing a new context.
+ */
+#define HV_DEFAULT_PAGE_SIZE_JUMBO \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO)
+
+#endif
+
+/** The log2 of the granularity at which page tables must be aligned;
+ *  in other words, the CPA for a page table must have this many zero
+ *  bits at the bottom of the address.
+ */
+#define HV_LOG2_PAGE_TABLE_ALIGN 11
+
+/** The granularity at which page tables must be aligned.
+ */
+#define HV_PAGE_TABLE_ALIGN (__HV_SIZE_ONE << HV_LOG2_PAGE_TABLE_ALIGN)
+
+/** Normal start of hypervisor glue in client physical memory. */
+#define HV_GLUE_START_CPA 0x10000
+
+/** This much space is reserved at HV_GLUE_START_CPA
+ * for the hypervisor glue. The client program must start at
+ * some address higher than this, and in particular the address of
+ * its text section should be equal to zero modulo HV_PAGE_SIZE_LARGE
+ * so that relative offsets to the HV glue are correct.
+ */
+#define HV_GLUE_RESERVED_SIZE 0x10000
+
+/** Each entry in the hv dispatch array takes this many bytes. */
+#define HV_DISPATCH_ENTRY_SIZE 32
+
+/** Version of the hypervisor interface defined by this file */
+#define _HV_VERSION 13
+
+/** Last version of the hypervisor interface with old hv_init() ABI.
+ *
+ * The change from version 12 to version 13 corresponds to launching
+ * the client by default at PL2 instead of PL1 (corresponding to the
+ * hv itself running at PL3 instead of PL2).  To make this explicit,
+ * the hv_init() API was also extended so the client can report its
+ * desired PL, resulting in a more helpful failure diagnostic.  If you
+ * call hv_init() with _HV_VERSION_OLD_HV_INIT and omit the client_pl
+ * argument, the hypervisor will assume client_pl = 1.
+ *
+ * Note that this is a deprecated solution and we do not expect to
+ * support clients of the Tilera hypervisor running at PL1 indefinitely.
+ */
+#define _HV_VERSION_OLD_HV_INIT 12
+
+/* Index into hypervisor interface dispatch code blocks.
+ *
+ * Hypervisor calls are invoked from user space by calling code
+ * at an address HV_BASE_ADDRESS + (index) * HV_DISPATCH_ENTRY_SIZE,
+ * where index is one of these enum values.
+ *
+ * Normally a supervisor is expected to produce a set of symbols
+ * starting at HV_BASE_ADDRESS that obey this convention, but a user
+ * program could call directly through function pointers if desired.
+ *
+ * These numbers are part of the binary API and will not be changed
+ * without updating HV_VERSION, which should be a rare event.
+ */
+
+/** reserved. */
+#define _HV_DISPATCH_RESERVED                     0
+
+/** hv_init  */
+#define HV_DISPATCH_INIT                          1
+
+/** hv_install_context */
+#define HV_DISPATCH_INSTALL_CONTEXT               2
+
+/** hv_sysconf */
+#define HV_DISPATCH_SYSCONF                       3
+
+/** hv_get_rtc */
+#define HV_DISPATCH_GET_RTC                       4
+
+/** hv_set_rtc */
+#define HV_DISPATCH_SET_RTC                       5
+
+/** hv_flush_asid */
+#define HV_DISPATCH_FLUSH_ASID                    6
+
+/** hv_flush_page */
+#define HV_DISPATCH_FLUSH_PAGE                    7
+
+/** hv_flush_pages */
+#define HV_DISPATCH_FLUSH_PAGES                   8
+
+/** hv_restart */
+#define HV_DISPATCH_RESTART                       9
+
+/** hv_halt */
+#define HV_DISPATCH_HALT                          10
+
+/** hv_power_off */
+#define HV_DISPATCH_POWER_OFF                     11
+
+/** hv_inquire_physical */
+#define HV_DISPATCH_INQUIRE_PHYSICAL              12
+
+/** hv_inquire_memory_controller */
+#define HV_DISPATCH_INQUIRE_MEMORY_CONTROLLER     13
+
+/** hv_inquire_virtual */
+#define HV_DISPATCH_INQUIRE_VIRTUAL               14
+
+/** hv_inquire_asid */
+#define HV_DISPATCH_INQUIRE_ASID                  15
+
+/** hv_nanosleep */
+#define HV_DISPATCH_NANOSLEEP                     16
+
+/** hv_console_read_if_ready */
+#define HV_DISPATCH_CONSOLE_READ_IF_READY         17
+
+/** hv_console_write */
+#define HV_DISPATCH_CONSOLE_WRITE                 18
+
+/** hv_downcall_dispatch */
+#define HV_DISPATCH_DOWNCALL_DISPATCH             19
+
+/** hv_inquire_topology */
+#define HV_DISPATCH_INQUIRE_TOPOLOGY              20
+
+/** hv_fs_findfile */
+#define HV_DISPATCH_FS_FINDFILE                   21
+
+/** hv_fs_fstat */
+#define HV_DISPATCH_FS_FSTAT                      22
+
+/** hv_fs_pread */
+#define HV_DISPATCH_FS_PREAD                      23
+
+/** hv_physaddr_read64 */
+#define HV_DISPATCH_PHYSADDR_READ64               24
+
+/** hv_physaddr_write64 */
+#define HV_DISPATCH_PHYSADDR_WRITE64              25
+
+/** hv_get_command_line */
+#define HV_DISPATCH_GET_COMMAND_LINE              26
+
+/** hv_set_caching */
+#define HV_DISPATCH_SET_CACHING                   27
+
+/** hv_bzero_page */
+#define HV_DISPATCH_BZERO_PAGE                    28
+
+/** hv_register_message_state */
+#define HV_DISPATCH_REGISTER_MESSAGE_STATE        29
+
+/** hv_send_message */
+#define HV_DISPATCH_SEND_MESSAGE                  30
+
+/** hv_receive_message */
+#define HV_DISPATCH_RECEIVE_MESSAGE               31
+
+/** hv_inquire_context */
+#define HV_DISPATCH_INQUIRE_CONTEXT               32
+
+/** hv_start_all_tiles */
+#define HV_DISPATCH_START_ALL_TILES               33
+
+/** hv_dev_open */
+#define HV_DISPATCH_DEV_OPEN                      34
+
+/** hv_dev_close */
+#define HV_DISPATCH_DEV_CLOSE                     35
+
+/** hv_dev_pread */
+#define HV_DISPATCH_DEV_PREAD                     36
+
+/** hv_dev_pwrite */
+#define HV_DISPATCH_DEV_PWRITE                    37
+
+/** hv_dev_poll */
+#define HV_DISPATCH_DEV_POLL                      38
+
+/** hv_dev_poll_cancel */
+#define HV_DISPATCH_DEV_POLL_CANCEL               39
+
+/** hv_dev_preada */
+#define HV_DISPATCH_DEV_PREADA                    40
+
+/** hv_dev_pwritea */
+#define HV_DISPATCH_DEV_PWRITEA                   41
+
+/** hv_flush_remote */
+#define HV_DISPATCH_FLUSH_REMOTE                  42
+
+/** hv_console_putc */
+#define HV_DISPATCH_CONSOLE_PUTC                  43
+
+/** hv_inquire_tiles */
+#define HV_DISPATCH_INQUIRE_TILES                 44
+
+/** hv_confstr */
+#define HV_DISPATCH_CONFSTR                       45
+
+/** hv_reexec */
+#define HV_DISPATCH_REEXEC                        46
+
+/** hv_set_command_line */
+#define HV_DISPATCH_SET_COMMAND_LINE              47
+
+#if !CHIP_HAS_IPI()
+
+/** hv_clear_intr */
+#define HV_DISPATCH_CLEAR_INTR                    48
+
+/** hv_enable_intr */
+#define HV_DISPATCH_ENABLE_INTR                   49
+
+/** hv_disable_intr */
+#define HV_DISPATCH_DISABLE_INTR                  50
+
+/** hv_raise_intr */
+#define HV_DISPATCH_RAISE_INTR                    51
+
+/** hv_trigger_ipi */
+#define HV_DISPATCH_TRIGGER_IPI                   52
+
+#endif /* !CHIP_HAS_IPI() */
+
+/** hv_store_mapping */
+#define HV_DISPATCH_STORE_MAPPING                 53
+
+/** hv_inquire_realpa */
+#define HV_DISPATCH_INQUIRE_REALPA                54
+
+/** hv_flush_all */
+#define HV_DISPATCH_FLUSH_ALL                     55
+
+#if CHIP_HAS_IPI()
+/** hv_get_ipi_pte */
+#define HV_DISPATCH_GET_IPI_PTE                   56
+#endif
+
+/** hv_set_pte_super_shift */
+#define HV_DISPATCH_SET_PTE_SUPER_SHIFT           57
+
+/** hv_console_set_ipi */
+#define HV_DISPATCH_CONSOLE_SET_IPI               63
+
+/** One more than the largest dispatch value */
+#define _HV_DISPATCH_END                          64
+
+
+#ifndef __ASSEMBLER__
+
+#ifdef __KERNEL__
+#include <asm/types.h>
+typedef u32 __hv32;        /**< 32-bit value */
+typedef u64 __hv64;        /**< 64-bit value */
+#else
+#include <stdint.h>
+typedef uint32_t __hv32;   /**< 32-bit value */
+typedef uint64_t __hv64;   /**< 64-bit value */
+#endif
+
+
+/** Hypervisor physical address. */
+typedef __hv64 HV_PhysAddr;
+
+#if CHIP_VA_WIDTH() > 32
+/** Hypervisor virtual address. */
+typedef __hv64 HV_VirtAddr;
+#else
+/** Hypervisor virtual address. */
+typedef __hv32 HV_VirtAddr;
+#endif /* CHIP_VA_WIDTH() > 32 */
+
+/** Hypervisor ASID. */
+typedef unsigned int HV_ASID;
+
+/** Hypervisor tile location for a memory access
+ * ("location overridden target").
+ */
+typedef unsigned int HV_LOTAR;
+
+/** Hypervisor size of a page. */
+typedef unsigned long HV_PageSize;
+
+/** A page table entry.
+ */
+typedef struct
+{
+  __hv64 val;                /**< Value of PTE */
+} HV_PTE;
+
+/** Hypervisor error code. */
+typedef int HV_Errno;
+
+#endif /* !__ASSEMBLER__ */
+
+#define HV_OK           0    /**< No error */
+#define HV_EINVAL      -801  /**< Invalid argument */
+#define HV_ENODEV      -802  /**< No such device */
+#define HV_ENOENT      -803  /**< No such file or directory */
+#define HV_EBADF       -804  /**< Bad file number */
+#define HV_EFAULT      -805  /**< Bad address */
+#define HV_ERECIP      -806  /**< Bad recipients */
+#define HV_E2BIG       -807  /**< Message too big */
+#define HV_ENOTSUP     -808  /**< Service not supported */
+#define HV_EBUSY       -809  /**< Device busy */
+#define HV_ENOSYS      -810  /**< Invalid syscall */
+#define HV_EPERM       -811  /**< No permission */
+#define HV_ENOTREADY   -812  /**< Device not ready */
+#define HV_EIO         -813  /**< I/O error */
+#define HV_ENOMEM      -814  /**< Out of memory */
+#define HV_EAGAIN      -815  /**< Try again */
+
+#define HV_ERR_MAX     -801  /**< Largest HV error code */
+#define HV_ERR_MIN     -815  /**< Smallest HV error code */
+
+#ifndef __ASSEMBLER__
+
+/** Pass HV_VERSION to hv_init to request this version of the interface. */
+typedef enum {
+  HV_VERSION = _HV_VERSION,
+  HV_VERSION_OLD_HV_INIT = _HV_VERSION_OLD_HV_INIT,
+
+} HV_VersionNumber;
+
+/** Initializes the hypervisor.
+ *
+ * @param interface_version_number The version of the hypervisor interface
+ * that this program expects, typically HV_VERSION.
+ * @param chip_num Architecture number of the chip the client was built for.
+ * @param chip_rev_num Revision number of the chip the client was built for.
+ * @param client_pl Privilege level the client is built for
+ *   (not required if interface_version_number == HV_VERSION_OLD_HV_INIT).
+ */
+void hv_init(HV_VersionNumber interface_version_number,
+             int chip_num, int chip_rev_num, int client_pl);
+
+
+/** Queries we can make for hv_sysconf().
+ *
+ * These numbers are part of the binary API and guaranteed not to change.
+ */
+typedef enum {
+  /** An invalid value; do not use. */
+  _HV_SYSCONF_RESERVED       = 0,
+
+  /** The length of the glue section containing the hv_ procs, in bytes. */
+  HV_SYSCONF_GLUE_SIZE       = 1,
+
+  /** The size of small pages, in bytes. */
+  HV_SYSCONF_PAGE_SIZE_SMALL = 2,
+
+  /** The size of large pages, in bytes. */
+  HV_SYSCONF_PAGE_SIZE_LARGE = 3,
+
+  /** Processor clock speed, in hertz. */
+  HV_SYSCONF_CPU_SPEED       = 4,
+
+  /** Processor temperature, in degrees Kelvin.  The value
+   *  HV_SYSCONF_TEMP_KTOC may be subtracted from this to get degrees
+   *  Celsius.  If that Celsius value is HV_SYSCONF_OVERTEMP, this indicates
+   *  that the temperature has hit an upper limit and is no longer being
+   *  accurately tracked.
+   */
+  HV_SYSCONF_CPU_TEMP        = 5,
+
+  /** Board temperature, in degrees Kelvin.  The value
+   *  HV_SYSCONF_TEMP_KTOC may be subtracted from this to get degrees
+   *  Celsius.  If that Celsius value is HV_SYSCONF_OVERTEMP, this indicates
+   *  that the temperature has hit an upper limit and is no longer being
+   *  accurately tracked.
+   */
+  HV_SYSCONF_BOARD_TEMP      = 6,
+
+  /** Legal page size bitmask for hv_install_context().
+   * For example, if 16KB and 64KB small pages are supported,
+   * it would return "HV_CTX_PG_SM_16K | HV_CTX_PG_SM_64K".
+   */
+  HV_SYSCONF_VALID_PAGE_SIZES = 7,
+
+  /** The size of jumbo pages, in bytes.
+   * If no jumbo pages are available, zero will be returned.
+   */
+  HV_SYSCONF_PAGE_SIZE_JUMBO = 8,
+
+} HV_SysconfQuery;
+
+/** Offset to subtract from returned Kelvin temperature to get degrees
+    Celsius. */
+#define HV_SYSCONF_TEMP_KTOC 273
+
+/** Pseudo-temperature value indicating that the temperature has
+ *  pegged at its upper limit and is no longer accurate; note that this is
+ *  the value after subtracting HV_SYSCONF_TEMP_KTOC. */
+#define HV_SYSCONF_OVERTEMP 999
+
+/** Query a configuration value from the hypervisor.
+ * @param query Which value is requested (HV_SYSCONF_xxx).
+ * @return The requested value, or -1 the requested value is illegal or
+ *         unavailable.
+ */
+long hv_sysconf(HV_SysconfQuery query);
+
+
+/** Queries we can make for hv_confstr().
+ *
+ * These numbers are part of the binary API and guaranteed not to change.
+ */
+typedef enum {
+  /** An invalid value; do not use. */
+  _HV_CONFSTR_RESERVED        = 0,
+
+  /** Board part number. */
+  HV_CONFSTR_BOARD_PART_NUM   = 1,
+
+  /** Board serial number. */
+  HV_CONFSTR_BOARD_SERIAL_NUM = 2,
+
+  /** Chip serial number. */
+  HV_CONFSTR_CHIP_SERIAL_NUM  = 3,
+
+  /** Board revision level. */
+  HV_CONFSTR_BOARD_REV        = 4,
+
+  /** Hypervisor software version. */
+  HV_CONFSTR_HV_SW_VER        = 5,
+
+  /** The name for this chip model. */
+  HV_CONFSTR_CHIP_MODEL       = 6,
+
+  /** Human-readable board description. */
+  HV_CONFSTR_BOARD_DESC       = 7,
+
+  /** Human-readable description of the hypervisor configuration. */
+  HV_CONFSTR_HV_CONFIG        = 8,
+
+  /** Human-readable version string for the boot image (for instance,
+   *  who built it and when, what configuration file was used). */
+  HV_CONFSTR_HV_CONFIG_VER    = 9,
+
+  /** Mezzanine part number. */
+  HV_CONFSTR_MEZZ_PART_NUM   = 10,
+
+  /** Mezzanine serial number. */
+  HV_CONFSTR_MEZZ_SERIAL_NUM = 11,
+
+  /** Mezzanine revision level. */
+  HV_CONFSTR_MEZZ_REV        = 12,
+
+  /** Human-readable mezzanine description. */
+  HV_CONFSTR_MEZZ_DESC       = 13,
+
+  /** Control path for the onboard network switch. */
+  HV_CONFSTR_SWITCH_CONTROL  = 14,
+
+  /** Chip revision level. */
+  HV_CONFSTR_CHIP_REV        = 15,
+
+  /** CPU module part number. */
+  HV_CONFSTR_CPUMOD_PART_NUM = 16,
+
+  /** CPU module serial number. */
+  HV_CONFSTR_CPUMOD_SERIAL_NUM = 17,
+
+  /** CPU module revision level. */
+  HV_CONFSTR_CPUMOD_REV      = 18,
+
+  /** Human-readable CPU module description. */
+  HV_CONFSTR_CPUMOD_DESC     = 19,
+
+  /** Per-tile hypervisor statistics.  When this identifier is specified,
+   *  the hv_confstr call takes two extra arguments.  The first is the
+   *  HV_XY_TO_LOTAR of the target tile's coordinates.  The second is
+   *  a flag word.  The only current flag is the lowest bit, which means
+   *  "zero out the stats instead of retrieving them"; in this case the
+   *  buffer and buffer length are ignored. */
+  HV_CONFSTR_HV_STATS        = 20
+
+} HV_ConfstrQuery;
+
+/** Query a configuration string from the hypervisor.
+ *
+ * @param query Identifier for the specific string to be retrieved
+ *        (HV_CONFSTR_xxx).  Some strings may require or permit extra
+ *        arguments to be appended which select specific objects to be
+ *        described; see the string descriptions above.
+ * @param buf Buffer in which to place the string.
+ * @param len Length of the buffer.
+ * @return If query is valid, then the length of the corresponding string,
+ *        including the trailing null; if this is greater than len, the string
+ *        was truncated.  If query is invalid, HV_EINVAL.  If the specified
+ *        buffer is not writable by the client, HV_EFAULT.
+ */
+int hv_confstr(HV_ConfstrQuery query, HV_VirtAddr buf, int len, ...);
+
+/** Tile coordinate */
+typedef struct
+{
+  /** X coordinate, relative to supervisor's top-left coordinate */
+  int x;
+
+  /** Y coordinate, relative to supervisor's top-left coordinate */
+  int y;
+} HV_Coord;
+
+
+#if CHIP_HAS_IPI()
+
+/** Get the PTE for sending an IPI to a particular tile.
+ *
+ * @param tile Tile which will receive the IPI.
+ * @param pl Indicates which IPI registers: 0 = IPI_0, 1 = IPI_1.
+ * @param pte Filled with resulting PTE.
+ * @result Zero if no error, non-zero for invalid parameters.
+ */
+int hv_get_ipi_pte(HV_Coord tile, int pl, HV_PTE* pte);
+
+/** Configure the console interrupt.
+ *
+ * When the console client interrupt is enabled, the hypervisor will
+ * deliver the specified IPI to the client in the following situations:
+ *
+ * - The console has at least one character available for input.
+ *
+ * - The console can accept new characters for output, and the last call
+ *   to hv_console_write() did not write all of the characters requested
+ *   by the client.
+ *
+ * Note that in some system configurations, console interrupt will not
+ * be available; clients should be prepared for this routine to fail and
+ * to fall back to periodic console polling in that case.
+ *
+ * @param ipi Index of the IPI register which will receive the interrupt.
+ * @param event IPI event number for console interrupt. If less than 0,
+ *        disable the console IPI interrupt.
+ * @param coord Tile to be targeted for console interrupt.
+ * @return 0 on success, otherwise, HV_EINVAL if illegal parameter,
+ *         HV_ENOTSUP if console interrupt are not available.
+ */
+int hv_console_set_ipi(int ipi, int event, HV_Coord coord);
+
+#else /* !CHIP_HAS_IPI() */
+
+/** A set of interrupts. */
+typedef __hv32 HV_IntrMask;
+
+/** The low interrupt numbers are reserved for use by the client in
+ *  delivering IPIs.  Any interrupt numbers higher than this value are
+ *  reserved for use by HV device drivers. */
+#define HV_MAX_IPI_INTERRUPT 7
+
+/** Enable a set of device interrupts.
+ *
+ * @param enab_mask Bitmap of interrupts to enable.
+ */
+void hv_enable_intr(HV_IntrMask enab_mask);
+
+/** Disable a set of device interrupts.
+ *
+ * @param disab_mask Bitmap of interrupts to disable.
+ */
+void hv_disable_intr(HV_IntrMask disab_mask);
+
+/** Clear a set of device interrupts.
+ *
+ * @param clear_mask Bitmap of interrupts to clear.
+ */
+void hv_clear_intr(HV_IntrMask clear_mask);
+
+/** Raise a set of device interrupts.
+ *
+ * @param raise_mask Bitmap of interrupts to raise.
+ */
+void hv_raise_intr(HV_IntrMask raise_mask);
+
+/** Trigger a one-shot interrupt on some tile
+ *
+ * @param tile Which tile to interrupt.
+ * @param interrupt Interrupt number to trigger; must be between 0 and
+ *        HV_MAX_IPI_INTERRUPT.
+ * @return HV_OK on success, or a hypervisor error code.
+ */
+HV_Errno hv_trigger_ipi(HV_Coord tile, int interrupt);
+
+#endif /* !CHIP_HAS_IPI() */
+
+/** Store memory mapping in debug memory so that external debugger can read it.
+ * A maximum of 16 entries can be stored.
+ *
+ * @param va VA of memory that is mapped.
+ * @param len Length of mapped memory.
+ * @param pa PA of memory that is mapped.
+ * @return 0 on success, -1 if the maximum number of mappings is exceeded.
+ */
+int hv_store_mapping(HV_VirtAddr va, unsigned int len, HV_PhysAddr pa);
+
+/** Given a client PA and a length, return its real (HV) PA.
+ *
+ * @param cpa Client physical address.
+ * @param len Length of mapped memory.
+ * @return physical address, or -1 if cpa or len is not valid.
+ */
+HV_PhysAddr hv_inquire_realpa(HV_PhysAddr cpa, unsigned int len);
+
+/** RTC return flag for no RTC chip present.
+ */
+#define HV_RTC_NO_CHIP     0x1
+
+/** RTC return flag for low-voltage condition, indicating that battery had
+ * died and time read is unreliable.
+ */
+#define HV_RTC_LOW_VOLTAGE 0x2
+
+/** Date/Time of day */
+typedef struct {
+#if CHIP_WORD_SIZE() > 32
+  __hv64 tm_sec;   /**< Seconds, 0-59 */
+  __hv64 tm_min;   /**< Minutes, 0-59 */
+  __hv64 tm_hour;  /**< Hours, 0-23 */
+  __hv64 tm_mday;  /**< Day of month, 0-30 */
+  __hv64 tm_mon;   /**< Month, 0-11 */
+  __hv64 tm_year;  /**< Years since 1900, 0-199 */
+  __hv64 flags;    /**< Return flags, 0 if no error */
+#else
+  __hv32 tm_sec;   /**< Seconds, 0-59 */
+  __hv32 tm_min;   /**< Minutes, 0-59 */
+  __hv32 tm_hour;  /**< Hours, 0-23 */
+  __hv32 tm_mday;  /**< Day of month, 0-30 */
+  __hv32 tm_mon;   /**< Month, 0-11 */
+  __hv32 tm_year;  /**< Years since 1900, 0-199 */
+  __hv32 flags;    /**< Return flags, 0 if no error */
+#endif
+} HV_RTCTime;
+
+/** Read the current time-of-day clock.
+ * @return HV_RTCTime of current time (GMT).
+ */
+HV_RTCTime hv_get_rtc(void);
+
+
+/** Set the current time-of-day clock.
+ * @param time time to reset time-of-day to (GMT).
+ */
+void hv_set_rtc(HV_RTCTime time);
+
+/** Installs a context, comprising a page table and other attributes.
+ *
+ *  Once this service completes, page_table will be used to translate
+ *  subsequent virtual address references to physical memory.
+ *
+ *  Installing a context does not cause an implicit TLB flush.  Before
+ *  reusing an ASID value for a different address space, the client is
+ *  expected to flush old references from the TLB with hv_flush_asid().
+ *  (Alternately, hv_flush_all() may be used to flush many ASIDs at once.)
+ *  After invalidating a page table entry, changing its attributes, or
+ *  changing its target CPA, the client is expected to flush old references
+ *  from the TLB with hv_flush_page() or hv_flush_pages(). Making a
+ *  previously invalid page valid does not require a flush.
+ *
+ *  Specifying an invalid ASID, or an invalid CPA (client physical address)
+ *  (either as page_table_pointer, or within the referenced table),
+ *  or another page table data item documented as above as illegal may
+ *  lead to client termination; since the validation of the table is
+ *  done as needed, this may happen before the service returns, or at
+ *  some later time, or never, depending upon the client's pattern of
+ *  memory references.  Page table entries which supply translations for
+ *  invalid virtual addresses may result in client termination, or may
+ *  be silently ignored.  "Invalid" in this context means a value which
+ *  was not provided to the client via the appropriate hv_inquire_* routine.
+ *
+ *  To support changing the instruction VAs at the same time as
+ *  installing the new page table, this call explicitly supports
+ *  setting the "lr" register to a different address and then jumping
+ *  directly to the hv_install_context() routine.  In this case, the
+ *  new page table does not need to contain any mapping for the
+ *  hv_install_context address itself.
+ *
+ *  At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ *  if multiple flags are specified, HV_EINVAL is returned.
+ *  Specifying none of the flags results in using the default page size.
+ *  All cores participating in a given client must request the same
+ *  page size, or the results are undefined.
+ *
+ * @param page_table Root of the page table.
+ * @param access PTE providing info on how to read the page table.  This
+ *   value must be consistent between multiple tiles sharing a page table,
+ *   and must also be consistent with any virtual mappings the client
+ *   may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for.
+ * @param flags Context flags, denoting attributes or privileges of the
+ *   current context (HV_CTX_xxx).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
+                       __hv32 flags);
+
+#endif /* !__ASSEMBLER__ */
+
+#define HV_CTX_DIRECTIO     0x1   /**< Direct I/O requests are accepted from
+                                       PL0. */
+
+#define HV_CTX_PG_SM_4K     0x10  /**< Use 4K small pages, if available. */
+#define HV_CTX_PG_SM_16K    0x20  /**< Use 16K small pages, if available. */
+#define HV_CTX_PG_SM_64K    0x40  /**< Use 64K small pages, if available. */
+#define HV_CTX_PG_SM_MASK   0xf0  /**< Mask of all possible small pages. */
+
+#ifndef __ASSEMBLER__
+
+
+/** Set the number of pages ganged together by HV_PTE_SUPER at a
+ * particular level of the page table.
+ *
+ * The current TILE-Gx hardware only supports powers of four
+ * (i.e. log2_count must be a multiple of two), and the requested
+ * "super" page size must be less than the span of the next level in
+ * the page table.  The largest size that can be requested is 64GB.
+ *
+ * The shift value is initially "0" for all page table levels,
+ * indicating that the HV_PTE_SUPER bit is effectively ignored.
+ *
+ * If you change the count from one non-zero value to another, the
+ * hypervisor will flush the entire TLB and TSB to avoid confusion.
+ *
+ * @param level Page table level (0, 1, or 2)
+ * @param log2_count Base-2 log of the number of pages to gang together,
+ * i.e. how much to shift left the base page size for the super page size.
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+int hv_set_pte_super_shift(int level, int log2_count);
+
+
+/** Value returned from hv_inquire_context(). */
+typedef struct
+{
+  /** Physical address of page table */
+  HV_PhysAddr page_table;
+
+  /** PTE which defines access method for top of page table */
+  HV_PTE access;
+
+  /** ASID associated with this page table */
+  HV_ASID asid;
+
+  /** Context flags */
+  __hv32 flags;
+} HV_Context;
+
+/** Retrieve information about the currently installed context.
+ * @return The data passed to the last successful hv_install_context call.
+ */
+HV_Context hv_inquire_context(void);
+
+
+/** Flushes all translations associated with the named address space
+ *  identifier from the TLB and any other hypervisor data structures.
+ *  Translations installed with the "global" bit are not flushed.
+ *
+ *  Specifying an invalid ASID may lead to client termination.  "Invalid"
+ *  in this context means a value which was not provided to the client
+ *  via <tt>hv_inquire_asid()</tt>.
+ *
+ * @param asid HV_ASID whose entries are to be flushed.
+ * @return Zero on success, or a hypervisor error code on failure.
+*/
+int hv_flush_asid(HV_ASID asid);
+
+
+/** Flushes all translations associated with the named virtual address
+ *  and page size from the TLB and other hypervisor data structures. Only
+ *  pages visible to the current ASID are affected; note that this includes
+ *  global pages in addition to pages specific to the current ASID.
+ *
+ *  The supplied VA need not be aligned; it may be anywhere in the
+ *  subject page.
+ *
+ *  Specifying an invalid virtual address may lead to client termination,
+ *  or may silently succeed.  "Invalid" in this context means a value
+ *  which was not provided to the client via hv_inquire_virtual.
+ *
+ * @param address Address of the page to flush.
+ * @param page_size Size of pages to assume.
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+int hv_flush_page(HV_VirtAddr address, HV_PageSize page_size);
+
+
+/** Flushes all translations associated with the named virtual address range
+ *  and page size from the TLB and other hypervisor data structures. Only
+ *  pages visible to the current ASID are affected; note that this includes
+ *  global pages in addition to pages specific to the current ASID.
+ *
+ *  The supplied VA need not be aligned; it may be anywhere in the
+ *  subject page.
+ *
+ *  Specifying an invalid virtual address may lead to client termination,
+ *  or may silently succeed.  "Invalid" in this context means a value
+ *  which was not provided to the client via hv_inquire_virtual.
+ *
+ * @param start Address to flush.
+ * @param page_size Size of pages to assume.
+ * @param size The number of bytes to flush. Any page in the range
+ *        [start, start + size) will be flushed from the TLB.
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
+                   unsigned long size);
+
+
+/** Flushes all non-global translations (if preserve_global is true),
+ *  or absolutely all translations (if preserve_global is false).
+ *
+ * @param preserve_global Non-zero if we want to preserve "global" mappings.
+ * @return Zero on success, or a hypervisor error code on failure.
+*/
+int hv_flush_all(int preserve_global);
+
+
+/** Restart machine with optional restart command and optional args.
+ * @param cmd Const pointer to command to restart with, or NULL
+ * @param args Const pointer to argument string to restart with, or NULL
+ */
+void hv_restart(HV_VirtAddr cmd, HV_VirtAddr args);
+
+
+/** Halt machine. */
+void hv_halt(void);
+
+
+/** Power off machine. */
+void hv_power_off(void);
+
+
+/** Re-enter virtual-is-physical memory translation mode and restart
+ *  execution at a given address.
+ * @param entry Client physical address at which to begin execution.
+ * @return A hypervisor error code on failure; if the operation is
+ *         successful the call does not return.
+ */
+int hv_reexec(HV_PhysAddr entry);
+
+
+/** Chip topology */
+typedef struct
+{
+  /** Relative coordinates of the querying tile */
+  HV_Coord coord;
+
+  /** Width of the querying supervisor's tile rectangle. */
+  int width;
+
+  /** Height of the querying supervisor's tile rectangle. */
+  int height;
+
+} HV_Topology;
+
+/** Returns information about the tile coordinate system.
+ *
+ * Each supervisor is given a rectangle of tiles it potentially controls.
+ * These tiles are labeled using a relative coordinate system with (0,0) as
+ * the upper left tile regardless of their physical location on the chip.
+ *
+ * This call returns both the size of that rectangle and the position
+ * within that rectangle of the querying tile.
+ *
+ * Not all tiles within that rectangle may be available to the supervisor;
+ * to get the precise set of available tiles, you must also call
+ * hv_inquire_tiles(HV_INQ_TILES_AVAIL, ...).
+ **/
+HV_Topology hv_inquire_topology(void);
+
+/** Sets of tiles we can retrieve with hv_inquire_tiles().
+ *
+ * These numbers are part of the binary API and guaranteed not to change.
+ */
+typedef enum {
+  /** An invalid value; do not use. */
+  _HV_INQ_TILES_RESERVED       = 0,
+
+  /** All available tiles within the supervisor's tile rectangle. */
+  HV_INQ_TILES_AVAIL           = 1,
+
+  /** The set of tiles used for hash-for-home caching. */
+  HV_INQ_TILES_HFH_CACHE       = 2,
+
+  /** The set of tiles that can be legally used as a LOTAR for a PTE. */
+  HV_INQ_TILES_LOTAR           = 3,
+
+  /** The set of "shared" driver tiles that the hypervisor may
+   *  periodically interrupt. */
+  HV_INQ_TILES_SHARED          = 4
+} HV_InqTileSet;
+
+/** Returns specific information about various sets of tiles within the
+ *  supervisor's tile rectangle.
+ *
+ * @param set Which set of tiles to retrieve.
+ * @param cpumask Pointer to a returned bitmask (in row-major order,
+ *        supervisor-relative) of tiles.  The low bit of the first word
+ *        corresponds to the tile at the upper left-hand corner of the
+ *        supervisor's rectangle.  In order for the supervisor to know the
+ *        buffer length to supply, it should first call hv_inquire_topology.
+ * @param length Number of bytes available for the returned bitmask.
+ **/
+HV_Errno hv_inquire_tiles(HV_InqTileSet set, HV_VirtAddr cpumask, int length);
+
+
+/** An identifier for a memory controller. Multiple memory controllers
+ * may be connected to one chip, and this uniquely identifies each one.
+ */
+typedef int HV_MemoryController;
+
+/** A range of physical memory. */
+typedef struct
+{
+  HV_PhysAddr start;   /**< Starting address. */
+  __hv64 size;         /**< Size in bytes. */
+  HV_MemoryController controller;  /**< Which memory controller owns this. */
+} HV_PhysAddrRange;
+
+/** Returns information about a range of physical memory.
+ *
+ * hv_inquire_physical() returns one of the ranges of client
+ * physical addresses which are available to this client.
+ *
+ * The first range is retrieved by specifying an idx of 0, and
+ * successive ranges are returned with subsequent idx values.  Ranges
+ * are ordered by increasing start address (i.e., as idx increases,
+ * so does start), do not overlap, and do not touch (i.e., the
+ * available memory is described with the fewest possible ranges).
+ *
+ * If an out-of-range idx value is specified, the returned size will be zero.
+ * A client can count the number of ranges by increasing idx until the
+ * returned size is zero. There will always be at least one valid range.
+ *
+ * Some clients might not be prepared to deal with more than one
+ * physical address range; they still ought to call this routine and
+ * issue a warning message if they're given more than one range, on the
+ * theory that whoever configured the hypervisor to provide that memory
+ * should know that it's being wasted.
+ */
+HV_PhysAddrRange hv_inquire_physical(int idx);
+
+/** Possible DIMM types. */
+typedef enum
+{
+  NO_DIMM                    = 0,  /**< No DIMM */
+  DDR2                       = 1,  /**< DDR2 */
+  DDR3                       = 2   /**< DDR3 */
+} HV_DIMM_Type;
+
+#ifdef __tilegx__
+
+/** Log2 of minimum DIMM bytes supported by the memory controller. */
+#define HV_MSH_MIN_DIMM_SIZE_SHIFT 29
+
+/** Max number of DIMMs contained by one memory controller. */
+#define HV_MSH_MAX_DIMMS 8
+
+#else
+
+/** Log2 of minimum DIMM bytes supported by the memory controller. */
+#define HV_MSH_MIN_DIMM_SIZE_SHIFT 26
+
+/** Max number of DIMMs contained by one memory controller. */
+#define HV_MSH_MAX_DIMMS 2
+
+#endif
+
+/** Number of bits to right-shift to get the DIMM type. */
+#define HV_DIMM_TYPE_SHIFT 0
+
+/** Bits to mask to get the DIMM type. */
+#define HV_DIMM_TYPE_MASK 0xf
+
+/** Number of bits to right-shift to get the DIMM size. */
+#define HV_DIMM_SIZE_SHIFT 4
+
+/** Bits to mask to get the DIMM size. */
+#define HV_DIMM_SIZE_MASK 0xf
+
+/** Memory controller information. */
+typedef struct
+{
+  HV_Coord coord;   /**< Relative tile coordinates of the port used by a
+                         specified tile to communicate with this controller. */
+  __hv64 speed;     /**< Speed of this controller in bytes per second. */
+} HV_MemoryControllerInfo;
+
+/** Returns information about a particular memory controller.
+ *
+ *  hv_inquire_memory_controller(coord,idx) returns information about a
+ *  particular controller.  Two pieces of information are returned:
+ *  - The relative coordinates of the port on the controller that the specified
+ *    tile would use to contact it.  The relative coordinates may lie
+ *    outside the supervisor's rectangle, i.e. the controller may not
+ *    be attached to a node managed by the querying node's supervisor.
+ *    In particular note that x or y may be negative.
+ *  - The speed of the memory controller.  (This is a not-to-exceed value
+ *    based on the raw hardware data rate, and may not be achievable in
+ *    practice; it is provided to give clients information on the relative
+ *    performance of the available controllers.)
+ *
+ *  Clients should avoid calling this interface with invalid values.
+ *  A client who does may be terminated.
+ * @param coord Tile for which to calculate the relative port position.
+ * @param controller Index of the controller; identical to value returned
+ *        from other routines like hv_inquire_physical.
+ * @return Information about the controller.
+ */
+HV_MemoryControllerInfo hv_inquire_memory_controller(HV_Coord coord,
+                                                     int controller);
+
+
+/** A range of virtual memory. */
+typedef struct
+{
+  HV_VirtAddr start;   /**< Starting address. */
+  __hv64 size;         /**< Size in bytes. */
+} HV_VirtAddrRange;
+
+/** Returns information about a range of virtual memory.
+ *
+ * hv_inquire_virtual() returns one of the ranges of client
+ * virtual addresses which are available to this client.
+ *
+ * The first range is retrieved by specifying an idx of 0, and
+ * successive ranges are returned with subsequent idx values.  Ranges
+ * are ordered by increasing start address (i.e., as idx increases,
+ * so does start), do not overlap, and do not touch (i.e., the
+ * available memory is described with the fewest possible ranges).
+ *
+ * If an out-of-range idx value is specified, the returned size will be zero.
+ * A client can count the number of ranges by increasing idx until the
+ * returned size is zero. There will always be at least one valid range.
+ *
+ * Some clients may well have various virtual addresses hardwired
+ * into themselves; for instance, their instruction stream may
+ * have been compiled expecting to live at a particular address.
+ * Such clients should use this interface to verify they've been
+ * given the virtual address space they expect, and issue a (potentially
+ * fatal) warning message otherwise.
+ *
+ * Note that the returned size is a __hv64, not a __hv32, so it is
+ * possible to express a single range spanning the entire 32-bit
+ * address space.
+ */
+HV_VirtAddrRange hv_inquire_virtual(int idx);
+
+
+/** A range of ASID values. */
+typedef struct
+{
+  HV_ASID start;        /**< First ASID in the range. */
+  unsigned int size;    /**< Number of ASIDs. Zero for an invalid range. */
+} HV_ASIDRange;
+
+/** Returns information about a range of ASIDs.
+ *
+ * hv_inquire_asid() returns one of the ranges of address
+ * space identifiers which are available to this client.
+ *
+ * The first range is retrieved by specifying an idx of 0, and
+ * successive ranges are returned with subsequent idx values.  Ranges
+ * are ordered by increasing start value (i.e., as idx increases,
+ * so does start), do not overlap, and do not touch (i.e., the
+ * available ASIDs are described with the fewest possible ranges).
+ *
+ * If an out-of-range idx value is specified, the returned size will be zero.
+ * A client can count the number of ranges by increasing idx until the
+ * returned size is zero. There will always be at least one valid range.
+ */
+HV_ASIDRange hv_inquire_asid(int idx);
+
+
+/** Waits for at least the specified number of nanoseconds then returns.
+ *
+ * NOTE: this deprecated function currently assumes a 750 MHz clock,
+ * and is thus not generally suitable for use.  New code should call
+ * hv_sysconf(HV_SYSCONF_CPU_SPEED), compute a cycle count to wait for,
+ * and delay by looping while checking the cycle counter SPR.
+ *
+ * @param nanosecs The number of nanoseconds to sleep.
+ */
+void hv_nanosleep(int nanosecs);
+
+
+/** Reads a character from the console without blocking.
+ *
+ * @return A value from 0-255 indicates the value successfully read.
+ * A negative value means no value was ready.
+ */
+int hv_console_read_if_ready(void);
+
+
+/** Writes a character to the console, blocking if the console is busy.
+ *
+ *  This call cannot fail. If the console is broken for some reason,
+ *  output will simply vanish.
+ * @param byte Character to write.
+ */
+void hv_console_putc(int byte);
+
+
+/** Writes a string to the console, blocking if the console is busy.
+ * @param bytes Pointer to characters to write.
+ * @param len Number of characters to write.
+ * @return Number of characters written, or HV_EFAULT if the buffer is invalid.
+ */
+int hv_console_write(HV_VirtAddr bytes, int len);
+
+
+/** Dispatch the next interrupt from the client downcall mechanism.
+ *
+ *  The hypervisor uses downcalls to notify the client of asynchronous
+ *  events.  Some of these events are hypervisor-created (like incoming
+ *  messages).  Some are regular interrupts which initially occur in
+ *  the hypervisor, and are normally handled directly by the client;
+ *  when these occur in a client's interrupt critical section, they must
+ *  be delivered through the downcall mechanism.
+ *
+ *  A downcall is initially delivered to the client as an INTCTRL_CL
+ *  interrupt, where CL is the client's PL.  Upon entry to the INTCTRL_CL
+ *  vector, the client must immediately invoke the hv_downcall_dispatch
+ *  service.  This service will not return; instead it will cause one of
+ *  the client's actual downcall-handling interrupt vectors to be entered.
+ *  The EX_CONTEXT registers in the client will be set so that when the
+ *  client irets, it will return to the code which was interrupted by the
+ *  INTCTRL_CL interrupt.
+ *
+ *  Under some circumstances, the firing of INTCTRL_CL can race with
+ *  the lowering of a device interrupt.  In such a case, the
+ *  hv_downcall_dispatch service may issue an iret instruction instead
+ *  of entering one of the client's actual downcall-handling interrupt
+ *  vectors.  This will return execution to the location that was
+ *  interrupted by INTCTRL_CL.
+ *
+ *  Any saving of registers should be done by the actual handling
+ *  vectors; no registers should be changed by the INTCTRL_CL handler.
+ *  In particular, the client should not use a jal instruction to invoke
+ *  the hv_downcall_dispatch service, as that would overwrite the client's
+ *  lr register.  Note that the hv_downcall_dispatch service may overwrite
+ *  one or more of the client's system save registers.
+ *
+ *  The client must not modify the INTCTRL_CL_STATUS SPR.  The hypervisor
+ *  will set this register to cause a downcall to happen, and will clear
+ *  it when no further downcalls are pending.
+ *
+ *  When a downcall vector is entered, the INTCTRL_CL interrupt will be
+ *  masked.  When the client is done processing a downcall, and is ready
+ *  to accept another, it must unmask this interrupt; if more downcalls
+ *  are pending, this will cause the INTCTRL_CL vector to be reentered.
+ *  Currently the following interrupt vectors can be entered through a
+ *  downcall:
+ *
+ *  INT_MESSAGE_RCV_DWNCL   (hypervisor message available)
+ *  INT_DEV_INTR_DWNCL      (device interrupt)
+ *  INT_DMATLB_MISS_DWNCL   (DMA TLB miss)
+ *  INT_SNITLB_MISS_DWNCL   (SNI TLB miss)
+ *  INT_DMATLB_ACCESS_DWNCL (DMA TLB access violation)
+ */
+void hv_downcall_dispatch(void);
+
+#endif /* !__ASSEMBLER__ */
+
+/** We use actual interrupt vectors which never occur (they're only there
+ *  to allow setting MPLs for related SPRs) for our downcall vectors.
+ */
+/** Message receive downcall interrupt vector */
+#define INT_MESSAGE_RCV_DWNCL    INT_BOOT_ACCESS
+/** DMA TLB miss downcall interrupt vector */
+#define INT_DMATLB_MISS_DWNCL    INT_DMA_ASID
+/** Static nework processor instruction TLB miss interrupt vector */
+#define INT_SNITLB_MISS_DWNCL    INT_SNI_ASID
+/** DMA TLB access violation downcall interrupt vector */
+#define INT_DMATLB_ACCESS_DWNCL  INT_DMA_CPL
+/** Device interrupt downcall interrupt vector */
+#define INT_DEV_INTR_DWNCL       INT_WORLD_ACCESS
+
+#ifndef __ASSEMBLER__
+
+/** Requests the inode for a specific full pathname.
+ *
+ * Performs a lookup in the hypervisor filesystem for a given filename.
+ * Multiple calls with the same filename will always return the same inode.
+ * If there is no such filename, HV_ENOENT is returned.
+ * A bad filename pointer may result in HV_EFAULT instead.
+ *
+ * @param filename Constant pointer to name of requested file
+ * @return Inode of requested file
+ */
+int hv_fs_findfile(HV_VirtAddr filename);
+
+
+/** Data returned from an fstat request.
+ * Note that this structure should be no more than 40 bytes in size so
+ * that it can always be returned completely in registers.
+ */
+typedef struct
+{
+  int size;             /**< Size of file (or HV_Errno on error) */
+  unsigned int flags;   /**< Flags (see HV_FS_FSTAT_FLAGS) */
+} HV_FS_StatInfo;
+
+/** Bitmask flags for fstat request */
+typedef enum
+{
+  HV_FS_ISDIR    = 0x0001   /**< Is the entry a directory? */
+} HV_FS_FSTAT_FLAGS;
+
+/** Get stat information on a given file inode.
+ *
+ * Return information on the file with the given inode.
+ *
+ * IF the HV_FS_ISDIR bit is set, the "file" is a directory.  Reading
+ * it will return NUL-separated filenames (no directory part) relative
+ * to the path to the inode of the directory "file".  These can be
+ * appended to the path to the directory "file" after a forward slash
+ * to create additional filenames.  Note that it is not required
+ * that all valid paths be decomposable into valid parent directories;
+ * a filesystem may validly have just a few files, none of which have
+ * HV_FS_ISDIR set.  However, if clients may wish to enumerate the
+ * files in the filesystem, it is recommended to include all the
+ * appropriate parent directory "files" to give a consistent view.
+ *
+ * An invalid file inode will cause an HV_EBADF error to be returned.
+ *
+ * @param inode The inode number of the query
+ * @return An HV_FS_StatInfo structure
+ */
+HV_FS_StatInfo hv_fs_fstat(int inode);
+
+
+/** Read data from a specific hypervisor file.
+ * On error, may return HV_EBADF for a bad inode or HV_EFAULT for a bad buf.
+ * Reads near the end of the file will return fewer bytes than requested.
+ * Reads at or beyond the end of a file will return zero.
+ *
+ * @param inode the hypervisor file to read
+ * @param buf the buffer to read data into
+ * @param length the number of bytes of data to read
+ * @param offset the offset into the file to read the data from
+ * @return number of bytes successfully read, or an HV_Errno code
+ */
+int hv_fs_pread(int inode, HV_VirtAddr buf, int length, int offset);
+
+
+/** Read a 64-bit word from the specified physical address.
+ * The address must be 8-byte aligned.
+ * Specifying an invalid physical address will lead to client termination.
+ * @param addr The physical address to read
+ * @param access The PTE describing how to read the memory
+ * @return The 64-bit value read from the given address
+ */
+unsigned long long hv_physaddr_read64(HV_PhysAddr addr, HV_PTE access);
+
+
+/** Write a 64-bit word to the specified physical address.
+ * The address must be 8-byte aligned.
+ * Specifying an invalid physical address will lead to client termination.
+ * @param addr The physical address to write
+ * @param access The PTE that says how to write the memory
+ * @param val The 64-bit value to write to the given address
+ */
+void hv_physaddr_write64(HV_PhysAddr addr, HV_PTE access,
+                         unsigned long long val);
+
+
+/** Get the value of the command-line for the supervisor, if any.
+ * This will not include the filename of the booted supervisor, but may
+ * include configured-in boot arguments or the hv_restart() arguments.
+ * If the buffer is not long enough the hypervisor will NUL the first
+ * character of the buffer but not write any other data.
+ * @param buf The virtual address to write the command-line string to.
+ * @param length The length of buf, in characters.
+ * @return The actual length of the command line, including the trailing NUL
+ *         (may be larger than "length").
+ */
+int hv_get_command_line(HV_VirtAddr buf, int length);
+
+
+/** Set a new value for the command-line for the supervisor, which will
+ *  be returned from subsequent invocations of hv_get_command_line() on
+ *  this tile.
+ * @param buf The virtual address to read the command-line string from.
+ * @param length The length of buf, in characters; must be no more than
+ *        HV_COMMAND_LINE_LEN.
+ * @return Zero if successful, or a hypervisor error code.
+ */
+HV_Errno hv_set_command_line(HV_VirtAddr buf, int length);
+
+/** Maximum size of a command line passed to hv_set_command_line(); note
+ *  that a line returned from hv_get_command_line() could be larger than
+ *  this.*/
+#define HV_COMMAND_LINE_LEN  256
+
+/** Tell the hypervisor how to cache non-priority pages
+ * (its own as well as pages explicitly represented in page tables).
+ * Normally these will be represented as red/black pages, but
+ * when the supervisor starts to allocate "priority" pages in the PTE
+ * the hypervisor will need to start marking those pages as (e.g.) "red"
+ * and non-priority pages as either "black" (if they cache-alias
+ * with the existing priority pages) or "red/black" (if they don't).
+ * The bitmask provides information on which parts of the cache
+ * have been used for pinned pages so far on this tile; if (1 << N)
+ * appears in the bitmask, that indicates that a 4KB region of the
+ * cache starting at (N * 4KB) is in use by a "priority" page.
+ * The portion of cache used by a particular page can be computed
+ * by taking the page's PA, modulo CHIP_L2_CACHE_SIZE(), and setting
+ * all the "4KB" bits corresponding to the actual page size.
+ * @param bitmask A bitmap of priority page set values
+ */
+void hv_set_caching(unsigned long bitmask);
+
+
+/** Zero out a specified number of pages.
+ * The va and size must both be multiples of 4096.
+ * Caches are bypassed and memory is directly set to zero.
+ * This API is implemented only in the magic hypervisor and is intended
+ * to provide a performance boost to the minimal supervisor by
+ * giving it a fast way to zero memory pages when allocating them.
+ * @param va Virtual address where the page has been mapped
+ * @param size Number of bytes (must be a page size multiple)
+ */
+void hv_bzero_page(HV_VirtAddr va, unsigned int size);
+
+
+/** State object for the hypervisor messaging subsystem. */
+typedef struct
+{
+#if CHIP_VA_WIDTH() > 32
+  __hv64 opaque[2]; /**< No user-serviceable parts inside */
+#else
+  __hv32 opaque[2]; /**< No user-serviceable parts inside */
+#endif
+}
+HV_MsgState;
+
+/** Register to receive incoming messages.
+ *
+ *  This routine configures the current tile so that it can receive
+ *  incoming messages.  It must be called before the client can receive
+ *  messages with the hv_receive_message routine, and must be called on
+ *  each tile which will receive messages.
+ *
+ *  msgstate is the virtual address of a state object of type HV_MsgState.
+ *  Once the state is registered, the client must not read or write the
+ *  state object; doing so will cause undefined results.
+ *
+ *  If this routine is called with msgstate set to 0, the client's message
+ *  state will be freed and it will no longer be able to receive messages.
+ *  Note that this may cause the loss of any as-yet-undelivered messages
+ *  for the client.
+ *
+ *  If another client attempts to send a message to a client which has
+ *  not yet called hv_register_message_state, or which has freed its
+ *  message state, the message will not be delivered, as if the client
+ *  had insufficient buffering.
+ *
+ *  This routine returns HV_OK if the registration was successful, and
+ *  HV_EINVAL if the supplied state object is unsuitable.  Note that some
+ *  errors may not be detected during this routine, but might be detected
+ *  during a subsequent message delivery.
+ * @param msgstate State object.
+ **/
+HV_Errno hv_register_message_state(HV_MsgState* msgstate);
+
+/** Possible message recipient states. */
+typedef enum
+{
+  HV_TO_BE_SENT,    /**< Not sent (not attempted, or recipient not ready) */
+  HV_SENT,          /**< Successfully sent */
+  HV_BAD_RECIP      /**< Bad recipient coordinates (permanent error) */
+} HV_Recip_State;
+
+/** Message recipient. */
+typedef struct
+{
+  /** X coordinate, relative to supervisor's top-left coordinate */
+  unsigned int x:11;
+
+  /** Y coordinate, relative to supervisor's top-left coordinate */
+  unsigned int y:11;
+
+  /** Status of this recipient */
+  HV_Recip_State state:10;
+} HV_Recipient;
+
+/** Send a message to a set of recipients.
+ *
+ *  This routine sends a message to a set of recipients.
+ *
+ *  recips is an array of HV_Recipient structures.  Each specifies a tile,
+ *  and a message state; initially, it is expected that the state will
+ *  be set to HV_TO_BE_SENT.  nrecip specifies the number of recipients
+ *  in the recips array.
+ *
+ *  For each recipient whose state is HV_TO_BE_SENT, the hypervisor attempts
+ *  to send that tile the specified message.  In order to successfully
+ *  receive the message, the receiver must be a valid tile to which the
+ *  sender has access, must not be the sending tile itself, and must have
+ *  sufficient free buffer space.  (The hypervisor guarantees that each
+ *  tile which has called hv_register_message_state() will be able to
+ *  buffer one message from every other tile which can legally send to it;
+ *  more space may be provided but is not guaranteed.)  If an invalid tile
+ *  is specified, the recipient's state is set to HV_BAD_RECIP; this is a
+ *  permanent delivery error.  If the message is successfully delivered
+ *  to the recipient's buffer, the recipient's state is set to HV_SENT.
+ *  Otherwise, the recipient's state is unchanged.  Message delivery is
+ *  synchronous; all attempts to send messages are completed before this
+ *  routine returns.
+ *
+ *  If no permanent delivery errors were encountered, the routine returns
+ *  the number of messages successfully sent: that is, the number of
+ *  recipients whose states changed from HV_TO_BE_SENT to HV_SENT during
+ *  this operation.  If any permanent delivery errors were encountered,
+ *  the routine returns HV_ERECIP.  In the event of permanent delivery
+ *  errors, it may be the case that delivery was not attempted to all
+ *  recipients; if any messages were successfully delivered, however,
+ *  recipients' state values will be updated appropriately.
+ *
+ *  It is explicitly legal to specify a recipient structure whose state
+ *  is not HV_TO_BE_SENT; such a recipient is ignored.  One suggested way
+ *  of using hv_send_message to send a message to multiple tiles is to set
+ *  up a list of recipients, and then call the routine repeatedly with the
+ *  same list, each time accumulating the number of messages successfully
+ *  sent, until all messages are sent, a permanent error is encountered,
+ *  or the desired number of attempts have been made.  When used in this
+ *  way, the routine will deliver each message no more than once to each
+ *  recipient.
+ *
+ *  Note that a message being successfully delivered to the recipient's
+ *  buffer space does not guarantee that it is received by the recipient,
+ *  either immediately or at any time in the future; the recipient might
+ *  never call hv_receive_message, or could register a different state
+ *  buffer, losing the message.
+ *
+ *  Specifying the same recipient more than once in the recipient list
+ *  is an error, which will not result in an error return but which may
+ *  or may not result in more than one message being delivered to the
+ *  recipient tile.
+ *
+ *  buf and buflen specify the message to be sent.  buf is a virtual address
+ *  which must be currently mapped in the client's page table; if not, the
+ *  routine returns HV_EFAULT.  buflen must be greater than zero and less
+ *  than or equal to HV_MAX_MESSAGE_SIZE, and nrecip must be less than the
+ *  number of tiles to which the sender has access; if not, the routine
+ *  returns HV_EINVAL.
+ * @param recips List of recipients.
+ * @param nrecip Number of recipients.
+ * @param buf Address of message data.
+ * @param buflen Length of message data.
+ **/
+int hv_send_message(HV_Recipient *recips, int nrecip,
+                    HV_VirtAddr buf, int buflen);
+
+/** Maximum hypervisor message size, in bytes */
+#define HV_MAX_MESSAGE_SIZE 28
+
+
+/** Return value from hv_receive_message() */
+typedef struct
+{
+  int msglen;     /**< Message length in bytes, or an error code */
+  __hv32 source;  /**< Code identifying message sender (HV_MSG_xxx) */
+} HV_RcvMsgInfo;
+
+#define HV_MSG_TILE 0x0         /**< Message source is another tile */
+#define HV_MSG_INTR 0x1         /**< Message source is a driver interrupt */
+
+/** Receive a message.
+ *
+ * This routine retrieves a message from the client's incoming message
+ * buffer.
+ *
+ * Multiple messages sent from a particular sending tile to a particular
+ * receiving tile are received in the order that they were sent; however,
+ * no ordering is guaranteed between messages sent by different tiles.
+ *
+ * Whenever the a client's message buffer is empty, the first message
+ * subsequently received will cause the client's MESSAGE_RCV_DWNCL
+ * interrupt vector to be invoked through the interrupt downcall mechanism
+ * (see the description of the hv_downcall_dispatch() routine for details
+ * on downcalls).
+ *
+ * Another message-available downcall will not occur until a call to
+ * this routine is made when the message buffer is empty, and a message
+ * subsequently arrives.  Note that such a downcall could occur while
+ * this routine is executing.  If the calling code does not wish this
+ * to happen, it is recommended that this routine be called with the
+ * INTCTRL_1 interrupt masked, or inside an interrupt critical section.
+ *
+ * msgstate is the value previously passed to hv_register_message_state().
+ * buf is the virtual address of the buffer into which the message will
+ * be written; buflen is the length of the buffer.
+ *
+ * This routine returns an HV_RcvMsgInfo structure.  The msglen member
+ * of that structure is the length of the message received, zero if no
+ * message is available, or HV_E2BIG if the message is too large for the
+ * specified buffer.  If the message is too large, it is not consumed,
+ * and may be retrieved by a subsequent call to this routine specifying
+ * a sufficiently large buffer.  A buffer which is HV_MAX_MESSAGE_SIZE
+ * bytes long is guaranteed to be able to receive any possible message.
+ *
+ * The source member of the HV_RcvMsgInfo structure describes the sender
+ * of the message.  For messages sent by another client tile via an
+ * hv_send_message() call, this value is HV_MSG_TILE; for messages sent
+ * as a result of a device interrupt, this value is HV_MSG_INTR.
+ */
+
+HV_RcvMsgInfo hv_receive_message(HV_MsgState msgstate, HV_VirtAddr buf,
+                                 int buflen);
+
+
+/** Start remaining tiles owned by this supervisor.  Initially, only one tile
+ *  executes the client program; after it calls this service, the other tiles
+ *  are started.  This allows the initial tile to do one-time configuration
+ *  of shared data structures without having to lock them against simultaneous
+ *  access.
+ */
+void hv_start_all_tiles(void);
+
+
+/** Open a hypervisor device.
+ *
+ *  This service initializes an I/O device and its hypervisor driver software,
+ *  and makes it available for use.  The open operation is per-device per-chip;
+ *  once it has been performed, the device handle returned may be used in other
+ *  device services calls made by any tile.
+ *
+ * @param name Name of the device.  A base device name is just a text string
+ *        (say, "pcie").  If there is more than one instance of a device, the
+ *        base name is followed by a slash and a device number (say, "pcie/0").
+ *        Some devices may support further structure beneath those components;
+ *        most notably, devices which require control operations do so by
+ *        supporting reads and/or writes to a control device whose name
+ *        includes a trailing "/ctl" (say, "pcie/0/ctl").
+ * @param flags Flags (HV_DEV_xxx).
+ * @return A positive integer device handle, or a negative error code.
+ */
+int hv_dev_open(HV_VirtAddr name, __hv32 flags);
+
+
+/** Close a hypervisor device.
+ *
+ *  This service uninitializes an I/O device and its hypervisor driver
+ *  software, and makes it unavailable for use.  The close operation is
+ *  per-device per-chip; once it has been performed, the device is no longer
+ *  available.  Normally there is no need to ever call the close service.
+ *
+ * @param devhdl Device handle of the device to be closed.
+ * @return Zero if the close is successful, otherwise, a negative error code.
+ */
+int hv_dev_close(int devhdl);
+
+
+/** Read data from a hypervisor device synchronously.
+ *
+ *  This service transfers data from a hypervisor device to a memory buffer.
+ *  When the service returns, the data has been written from the memory buffer,
+ *  and the buffer will not be further modified by the driver.
+ *
+ *  No ordering is guaranteed between requests issued from different tiles.
+ *
+ *  Devices may choose to support both the synchronous and asynchronous read
+ *  operations, only one of them, or neither of them.
+ *
+ * @param devhdl Device handle of the device to be read from.
+ * @param flags Flags (HV_DEV_xxx).
+ * @param va Virtual address of the target data buffer.  This buffer must
+ *        be mapped in the currently installed page table; if not, HV_EFAULT
+ *        may be returned.
+ * @param len Number of bytes to be transferred.
+ * @param offset Driver-dependent offset.  For a random-access device, this is
+ *        often a byte offset from the beginning of the device; in other cases,
+ *        like on a control device, it may have a different meaning.
+ * @return A non-negative value if the read was at least partially successful;
+ *         otherwise, a negative error code.  The precise interpretation of
+ *         the return value is driver-dependent, but many drivers will return
+ *         the number of bytes successfully transferred.
+ */
+int hv_dev_pread(int devhdl, __hv32 flags, HV_VirtAddr va, __hv32 len,
+                 __hv64 offset);
+
+#define HV_DEV_NB_EMPTY     0x1   /**< Don't block when no bytes of data can
+                                       be transferred. */
+#define HV_DEV_NB_PARTIAL   0x2   /**< Don't block when some bytes, but not all
+                                       of the requested bytes, can be
+                                       transferred. */
+#define HV_DEV_NOCACHE      0x4   /**< The caller warrants that none of the
+                                       cache lines which might contain data
+                                       from the requested buffer are valid.
+                                       Useful with asynchronous operations
+                                       only. */
+
+#define HV_DEV_ALLFLAGS     (HV_DEV_NB_EMPTY | HV_DEV_NB_PARTIAL | \
+                             HV_DEV_NOCACHE)   /**< All HV_DEV_xxx flags */
+
+/** Write data to a hypervisor device synchronously.
+ *
+ *  This service transfers data from a memory buffer to a hypervisor device.
+ *  When the service returns, the data has been read from the memory buffer,
+ *  and the buffer may be overwritten by the client; the data may not
+ *  necessarily have been conveyed to the actual hardware I/O interface.
+ *
+ *  No ordering is guaranteed between requests issued from different tiles.
+ *
+ *  Devices may choose to support both the synchronous and asynchronous write
+ *  operations, only one of them, or neither of them.
+ *
+ * @param devhdl Device handle of the device to be written to.
+ * @param flags Flags (HV_DEV_xxx).
+ * @param va Virtual address of the source data buffer.  This buffer must
+ *        be mapped in the currently installed page table; if not, HV_EFAULT
+ *        may be returned.
+ * @param len Number of bytes to be transferred.
+ * @param offset Driver-dependent offset.  For a random-access device, this is
+ *        often a byte offset from the beginning of the device; in other cases,
+ *        like on a control device, it may have a different meaning.
+ * @return A non-negative value if the write was at least partially successful;
+ *         otherwise, a negative error code.  The precise interpretation of
+ *         the return value is driver-dependent, but many drivers will return
+ *         the number of bytes successfully transferred.
+ */
+int hv_dev_pwrite(int devhdl, __hv32 flags, HV_VirtAddr va, __hv32 len,
+                  __hv64 offset);
+
+
+/** Interrupt arguments, used in the asynchronous I/O interfaces. */
+#if CHIP_VA_WIDTH() > 32
+typedef __hv64 HV_IntArg;
+#else
+typedef __hv32 HV_IntArg;
+#endif
+
+/** Interrupt messages are delivered via the mechanism as normal messages,
+ *  but have a message source of HV_DEV_INTR.  The message is formatted
+ *  as an HV_IntrMsg structure.
+ */
+
+typedef struct
+{
+  HV_IntArg intarg;  /**< Interrupt argument, passed to the poll/preada/pwritea
+                          services */
+  HV_IntArg intdata; /**< Interrupt-specific interrupt data */
+} HV_IntrMsg;
+
+/** Request an interrupt message when a device condition is satisfied.
+ *
+ *  This service requests that an interrupt message be delivered to the
+ *  requesting tile when a device becomes readable or writable, or when any
+ *  data queued to the device via previous write operations from this tile
+ *  has been actually sent out on the hardware I/O interface.  Devices may
+ *  choose to support any, all, or none of the available conditions.
+ *
+ *  If multiple conditions are specified, only one message will be
+ *  delivered.  If the event mask delivered to that interrupt handler
+ *  indicates that some of the conditions have not yet occurred, the
+ *  client must issue another poll() call if it wishes to wait for those
+ *  conditions.
+ *
+ *  Only one poll may be outstanding per device handle per tile.  If more than
+ *  one tile is polling on the same device and condition, they will all be
+ *  notified when it happens.  Because of this, clients may not assume that
+ *  the condition signaled is necessarily still true when they request a
+ *  subsequent service; for instance, the readable data which caused the
+ *  poll call to interrupt may have been read by another tile in the interim.
+ *
+ *  The notification interrupt message could come directly, or via the
+ *  downcall (intctrl1) method, depending on what the tile is doing
+ *  when the condition is satisfied.  Note that it is possible for the
+ *  requested interrupt to be delivered after this service is called but
+ *  before it returns.
+ *
+ * @param devhdl Device handle of the device to be polled.
+ * @param events Flags denoting the events which will cause the interrupt to
+ *        be delivered (HV_DEVPOLL_xxx).
+ * @param intarg Value which will be delivered as the intarg member of the
+ *        eventual interrupt message; the intdata member will be set to a
+ *        mask of HV_DEVPOLL_xxx values indicating which conditions have been
+ *        satisifed.
+ * @return Zero if the interrupt was successfully scheduled; otherwise, a
+ *         negative error code.
+ */
+int hv_dev_poll(int devhdl, __hv32 events, HV_IntArg intarg);
+
+#define HV_DEVPOLL_READ     0x1   /**< Test device for readability */
+#define HV_DEVPOLL_WRITE    0x2   /**< Test device for writability */
+#define HV_DEVPOLL_FLUSH    0x4   /**< Test device for output drained */
+
+
+/** Cancel a request for an interrupt when a device event occurs.
+ *
+ *  This service requests that no interrupt be delivered when the events
+ *  noted in the last-issued poll() call happen.  Once this service returns,
+ *  the interrupt has been canceled; however, it is possible for the interrupt
+ *  to be delivered after this service is called but before it returns.
+ *
+ * @param devhdl Device handle of the device on which to cancel polling.
+ * @return Zero if the poll was successfully canceled; otherwise, a negative
+ *         error code.
+ */
+int hv_dev_poll_cancel(int devhdl);
+
+
+/** Scatter-gather list for preada/pwritea calls. */
+typedef struct
+#if CHIP_VA_WIDTH() <= 32
+__attribute__ ((packed, aligned(4)))
+#endif
+{
+  HV_PhysAddr pa;  /**< Client physical address of the buffer segment. */
+  HV_PTE pte;      /**< Page table entry describing the caching and location
+                        override characteristics of the buffer segment.  Some
+                        drivers ignore this element and will require that
+                        the NOCACHE flag be set on their requests. */
+  __hv32 len;      /**< Length of the buffer segment. */
+} HV_SGL;
+
+#define HV_SGL_MAXLEN 16  /**< Maximum number of entries in a scatter-gather
+                               list */
+
+/** Read data from a hypervisor device asynchronously.
+ *
+ *  This service transfers data from a hypervisor device to a memory buffer.
+ *  When the service returns, the read has been scheduled.  When the read
+ *  completes, an interrupt message will be delivered, and the buffer will
+ *  not be further modified by the driver.
+ *
+ *  The number of possible outstanding asynchronous requests is defined by
+ *  each driver, but it is recommended that it be at least two requests
+ *  per tile per device.
+ *
+ *  No ordering is guaranteed between synchronous and asynchronous requests,
+ *  even those issued on the same tile.
+ *
+ *  The completion interrupt message could come directly, or via the downcall
+ *  (intctrl1) method, depending on what the tile is doing when the read
+ *  completes.  Interrupts do not coalesce; one is delivered for each
+ *  asynchronous I/O request.  Note that it is possible for the requested
+ *  interrupt to be delivered after this service is called but before it
+ *  returns.
+ *
+ *  Devices may choose to support both the synchronous and asynchronous read
+ *  operations, only one of them, or neither of them.
+ *
+ * @param devhdl Device handle of the device to be read from.
+ * @param flags Flags (HV_DEV_xxx).
+ * @param sgl_len Number of elements in the scatter-gather list.
+ * @param sgl Scatter-gather list describing the memory to which data will be
+ *        written.
+ * @param offset Driver-dependent offset.  For a random-access device, this is
+ *        often a byte offset from the beginning of the device; in other cases,
+ *        like on a control device, it may have a different meaning.
+ * @param intarg Value which will be delivered as the intarg member of the
+ *        eventual interrupt message; the intdata member will be set to the
+ *        normal return value from the read request.
+ * @return Zero if the read was successfully scheduled; otherwise, a negative
+ *         error code.  Note that some drivers may choose to pre-validate
+ *         their arguments, and may thus detect certain device error
+ *         conditions at this time rather than when the completion notification
+ *         occurs, but this is not required.
+ */
+int hv_dev_preada(int devhdl, __hv32 flags, __hv32 sgl_len,
+                  HV_SGL sgl[/* sgl_len */], __hv64 offset, HV_IntArg intarg);
+
+
+/** Write data to a hypervisor device asynchronously.
+ *
+ *  This service transfers data from a memory buffer to a hypervisor
+ *  device.  When the service returns, the write has been scheduled.
+ *  When the write completes, an interrupt message will be delivered,
+ *  and the buffer may be overwritten by the client; the data may not
+ *  necessarily have been conveyed to the actual hardware I/O interface.
+ *
+ *  The number of possible outstanding asynchronous requests is defined by
+ *  each driver, but it is recommended that it be at least two requests
+ *  per tile per device.
+ *
+ *  No ordering is guaranteed between synchronous and asynchronous requests,
+ *  even those issued on the same tile.
+ *
+ *  The completion interrupt message could come directly, or via the downcall
+ *  (intctrl1) method, depending on what the tile is doing when the read
+ *  completes.  Interrupts do not coalesce; one is delivered for each
+ *  asynchronous I/O request.  Note that it is possible for the requested
+ *  interrupt to be delivered after this service is called but before it
+ *  returns.
+ *
+ *  Devices may choose to support both the synchronous and asynchronous write
+ *  operations, only one of them, or neither of them.
+ *
+ * @param devhdl Device handle of the device to be read from.
+ * @param flags Flags (HV_DEV_xxx).
+ * @param sgl_len Number of elements in the scatter-gather list.
+ * @param sgl Scatter-gather list describing the memory from which data will be
+ *        read.
+ * @param offset Driver-dependent offset.  For a random-access device, this is
+ *        often a byte offset from the beginning of the device; in other cases,
+ *        like on a control device, it may have a different meaning.
+ * @param intarg Value which will be delivered as the intarg member of the
+ *        eventual interrupt message; the intdata member will be set to the
+ *        normal return value from the write request.
+ * @return Zero if the write was successfully scheduled; otherwise, a negative
+ *         error code.  Note that some drivers may choose to pre-validate
+ *         their arguments, and may thus detect certain device error
+ *         conditions at this time rather than when the completion notification
+ *         occurs, but this is not required.
+ */
+int hv_dev_pwritea(int devhdl, __hv32 flags, __hv32 sgl_len,
+                   HV_SGL sgl[/* sgl_len */], __hv64 offset, HV_IntArg intarg);
+
+
+/** Define a pair of tile and ASID to identify a user process context. */
+typedef struct
+{
+  /** X coordinate, relative to supervisor's top-left coordinate */
+  unsigned int x:11;
+
+  /** Y coordinate, relative to supervisor's top-left coordinate */
+  unsigned int y:11;
+
+  /** ASID of the process on this x,y tile */
+  HV_ASID asid:10;
+} HV_Remote_ASID;
+
+/** Flush cache and/or TLB state on remote tiles.
+ *
+ * @param cache_pa Client physical address to flush from cache (ignored if
+ *        the length encoded in cache_control is zero, or if
+ *        HV_FLUSH_EVICT_L2 is set, or if cache_cpumask is NULL).
+ * @param cache_control This argument allows you to specify a length of
+ *        physical address space to flush (maximum HV_FLUSH_MAX_CACHE_LEN).
+ *        You can "or" in HV_FLUSH_EVICT_L2 to flush the whole L2 cache.
+ *        You can "or" in HV_FLUSH_EVICT_L1I to flush the whole L1I cache.
+ *        HV_FLUSH_ALL flushes all caches.
+ * @param cache_cpumask Bitmask (in row-major order, supervisor-relative) of
+ *        tile indices to perform cache flush on.  The low bit of the first
+ *        word corresponds to the tile at the upper left-hand corner of the
+ *        supervisor's rectangle.  If passed as a NULL pointer, equivalent
+ *        to an empty bitmask.  On chips which support hash-for-home caching,
+ *        if passed as -1, equivalent to a mask containing tiles which could
+ *        be doing hash-for-home caching.
+ * @param tlb_va Virtual address to flush from TLB (ignored if
+ *        tlb_length is zero or tlb_cpumask is NULL).
+ * @param tlb_length Number of bytes of data to flush from the TLB.
+ * @param tlb_pgsize Page size to use for TLB flushes.
+ *        tlb_va and tlb_length need not be aligned to this size.
+ * @param tlb_cpumask Bitmask for tlb flush, like cache_cpumask.
+ *        If passed as a NULL pointer, equivalent to an empty bitmask.
+ * @param asids Pointer to an HV_Remote_ASID array of tile/ASID pairs to flush.
+ * @param asidcount Number of HV_Remote_ASID entries in asids[].
+ * @return Zero for success, or else HV_EINVAL or HV_EFAULT for errors that
+ *        are detected while parsing the arguments.
+ */
+int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
+                    unsigned long* cache_cpumask,
+                    HV_VirtAddr tlb_va, unsigned long tlb_length,
+                    unsigned long tlb_pgsize, unsigned long* tlb_cpumask,
+                    HV_Remote_ASID* asids, int asidcount);
+
+/** Include in cache_control to ensure a flush of the entire L2. */
+#define HV_FLUSH_EVICT_L2 (1UL << 31)
+
+/** Include in cache_control to ensure a flush of the entire L1I. */
+#define HV_FLUSH_EVICT_L1I (1UL << 30)
+
+/** Maximum legal size to use for the "length" component of cache_control. */
+#define HV_FLUSH_MAX_CACHE_LEN ((1UL << 30) - 1)
+
+/** Use for cache_control to ensure a flush of all caches. */
+#define HV_FLUSH_ALL -1UL
+
+#else   /* __ASSEMBLER__ */
+
+/** Include in cache_control to ensure a flush of the entire L2. */
+#define HV_FLUSH_EVICT_L2 (1 << 31)
+
+/** Include in cache_control to ensure a flush of the entire L1I. */
+#define HV_FLUSH_EVICT_L1I (1 << 30)
+
+/** Maximum legal size to use for the "length" component of cache_control. */
+#define HV_FLUSH_MAX_CACHE_LEN ((1 << 30) - 1)
+
+/** Use for cache_control to ensure a flush of all caches. */
+#define HV_FLUSH_ALL -1
+
+#endif  /* __ASSEMBLER__ */
+
+#ifndef __ASSEMBLER__
+
+/** Return a 64-bit value corresponding to the PTE if needed */
+#define hv_pte_val(pte) ((pte).val)
+
+/** Cast a 64-bit value to an HV_PTE */
+#define hv_pte(val) ((HV_PTE) { val })
+
+#endif  /* !__ASSEMBLER__ */
+
+
+/** Bits in the size of an HV_PTE */
+#define HV_LOG2_PTE_SIZE 3
+
+/** Size of an HV_PTE */
+#define HV_PTE_SIZE (1 << HV_LOG2_PTE_SIZE)
+
+
+/* Bits in HV_PTE's low word. */
+#define HV_PTE_INDEX_PRESENT          0  /**< PTE is valid */
+#define HV_PTE_INDEX_MIGRATING        1  /**< Page is migrating */
+#define HV_PTE_INDEX_CLIENT0          2  /**< Page client state 0 */
+#define HV_PTE_INDEX_CLIENT1          3  /**< Page client state 1 */
+#define HV_PTE_INDEX_NC               4  /**< L1$/L2$ incoherent with L3$ */
+#define HV_PTE_INDEX_NO_ALLOC_L1      5  /**< Page is uncached in local L1$ */
+#define HV_PTE_INDEX_NO_ALLOC_L2      6  /**< Page is uncached in local L2$ */
+#define HV_PTE_INDEX_CACHED_PRIORITY  7  /**< Page is priority cached */
+#define HV_PTE_INDEX_PAGE             8  /**< PTE describes a page */
+#define HV_PTE_INDEX_GLOBAL           9  /**< Page is global */
+#define HV_PTE_INDEX_USER            10  /**< Page is user-accessible */
+#define HV_PTE_INDEX_ACCESSED        11  /**< Page has been accessed */
+#define HV_PTE_INDEX_DIRTY           12  /**< Page has been written */
+                                         /*   Bits 13-14 are reserved for
+                                              future use. */
+#define HV_PTE_INDEX_SUPER           15  /**< Pages ganged together for TLB */
+#define HV_PTE_INDEX_MODE            16  /**< Page mode; see HV_PTE_MODE_xxx */
+#define HV_PTE_MODE_BITS              3  /**< Number of bits in mode */
+#define HV_PTE_INDEX_CLIENT2         19  /**< Page client state 2 */
+#define HV_PTE_INDEX_LOTAR           20  /**< Page's LOTAR; must be high bits
+                                              of word */
+#define HV_PTE_LOTAR_BITS            12  /**< Number of bits in a LOTAR */
+
+/* Bits in HV_PTE's high word. */
+#define HV_PTE_INDEX_READABLE        32  /**< Page is readable */
+#define HV_PTE_INDEX_WRITABLE        33  /**< Page is writable */
+#define HV_PTE_INDEX_EXECUTABLE      34  /**< Page is executable */
+#define HV_PTE_INDEX_PTFN            35  /**< Page's PTFN; must be high bits
+                                              of word */
+#define HV_PTE_PTFN_BITS             29  /**< Number of bits in a PTFN */
+
+/*
+ * Legal values for the PTE's mode field
+ */
+/** Data is not resident in any caches; loads and stores access memory
+ *  directly.
+ */
+#define HV_PTE_MODE_UNCACHED          1
+
+/** Data is resident in the tile's local L1 and/or L2 caches; if a load
+ *  or store misses there, it goes to memory.
+ *
+ *  The copy in the local L1$/L2$ is not invalidated when the copy in
+ *  memory is changed.
+ */
+#define HV_PTE_MODE_CACHE_NO_L3       2
+
+/** Data is resident in the tile's local L1 and/or L2 caches.  If a load
+ *  or store misses there, it goes to an L3 cache in a designated tile;
+ *  if it misses there, it goes to memory.
+ *
+ *  If the NC bit is not set, the copy in the local L1$/L2$ is invalidated
+ *  when the copy in the remote L3$ is changed.  Otherwise, such
+ *  invalidation will not occur.
+ *
+ *  Chips for which CHIP_HAS_COHERENT_LOCAL_CACHE() is 0 do not support
+ *  invalidation from an L3$ to another tile's L1$/L2$.  If the NC bit is
+ *  clear on such a chip, no copy is kept in the local L1$/L2$ in this mode.
+ */
+#define HV_PTE_MODE_CACHE_TILE_L3     3
+
+/** Data is resident in the tile's local L1 and/or L2 caches.  If a load
+ *  or store misses there, it goes to an L3 cache in one of a set of
+ *  designated tiles; if it misses there, it goes to memory.  Which tile
+ *  is chosen from the set depends upon a hash function applied to the
+ *  physical address.  This mode is not supported on chips for which
+ *  CHIP_HAS_CBOX_HOME_MAP() is 0.
+ *
+ *  If the NC bit is not set, the copy in the local L1$/L2$ is invalidated
+ *  when the copy in the remote L3$ is changed.  Otherwise, such
+ *  invalidation will not occur.
+ *
+ *  Chips for which CHIP_HAS_COHERENT_LOCAL_CACHE() is 0 do not support
+ *  invalidation from an L3$ to another tile's L1$/L2$.  If the NC bit is
+ *  clear on such a chip, no copy is kept in the local L1$/L2$ in this mode.
+ */
+#define HV_PTE_MODE_CACHE_HASH_L3     4
+
+/** Data is not resident in memory; accesses are instead made to an I/O
+ *  device, whose tile coordinates are given by the PTE's LOTAR field.
+ *  This mode is only supported on chips for which CHIP_HAS_MMIO() is 1.
+ *  The EXECUTABLE bit may not be set in an MMIO PTE.
+ */
+#define HV_PTE_MODE_MMIO              5
+
+
+/* C wants 1ULL so it is typed as __hv64, but the assembler needs just numbers.
+ * The assembler can't handle shifts greater than 31, but treats them
+ * as shifts mod 32, so assembler code must be aware of which word
+ * the bit belongs in when using these macros.
+ */
+#ifdef __ASSEMBLER__
+#define __HV_PTE_ONE 1        /**< One, for assembler */
+#else
+#define __HV_PTE_ONE 1ULL     /**< One, for C */
+#endif
+
+/** Is this PTE present?
+ *
+ * If this bit is set, this PTE represents a valid translation or level-2
+ * page table pointer.  Otherwise, the page table does not contain a
+ * translation for the subject virtual pages.
+ *
+ * If this bit is not set, the other bits in the PTE are not
+ * interpreted by the hypervisor, and may contain any value.
+ */
+#define HV_PTE_PRESENT               (__HV_PTE_ONE << HV_PTE_INDEX_PRESENT)
+
+/** Does this PTE map a page?
+ *
+ * If this bit is set in a level-0 page table, the entry should be
+ * interpreted as a level-2 page table entry mapping a jumbo page.
+ *
+ * If this bit is set in a level-1 page table, the entry should be
+ * interpreted as a level-2 page table entry mapping a large page.
+ *
+ * This bit should not be modified by the client while PRESENT is set, as
+ * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
+ *
+ * In a level-2 page table, this bit is ignored and must be zero.
+ */
+#define HV_PTE_PAGE                  (__HV_PTE_ONE << HV_PTE_INDEX_PAGE)
+
+/** Does this PTE implicitly reference multiple pages?
+ *
+ * If this bit is set in the page table (either in the level-2 page table,
+ * or in a higher level page table in conjunction with the PAGE bit)
+ * then the PTE specifies a range of contiguous pages, not a single page.
+ * The hv_set_pte_super_shift() allows you to specify the count for
+ * each level of the page table.
+ *
+ * Note: this bit is not supported on TILEPro systems.
+ */
+#define HV_PTE_SUPER                 (__HV_PTE_ONE << HV_PTE_INDEX_SUPER)
+
+/** Is this a global (non-ASID) mapping?
+ *
+ * If this bit is set, the translations established by this PTE will
+ * not be flushed from the TLB by the hv_flush_asid() service; they
+ * will be flushed by the hv_flush_page() or hv_flush_pages() services.
+ *
+ * Setting this bit for translations which are identical in all page
+ * tables (for instance, code and data belonging to a client OS) can
+ * be very beneficial, as it will reduce the number of TLB misses.
+ * Note that, while it is not an error which will be detected by the
+ * hypervisor, it is an extremely bad idea to set this bit for
+ * translations which are _not_ identical in all page tables.
+ *
+ * This bit should not be modified by the client while PRESENT is set, as
+ * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_GLOBAL                (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)
+
+/** Is this mapping accessible to users?
+ *
+ * If this bit is set, code running at any PL will be permitted to
+ * access the virtual addresses mapped by this PTE.  Otherwise, only
+ * code running at PL 1 or above will be allowed to do so.
+ *
+ * This bit should not be modified by the client while PRESENT is set, as
+ * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_USER                  (__HV_PTE_ONE << HV_PTE_INDEX_USER)
+
+/** Has this mapping been accessed?
+ *
+ * This bit is set by the hypervisor when the memory described by the
+ * translation is accessed for the first time.  It is never cleared by
+ * the hypervisor, but may be cleared by the client.  After the bit
+ * has been cleared, subsequent references are not guaranteed to set
+ * it again until the translation has been flushed from the TLB.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_ACCESSED              (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)
+
+/** Is this mapping dirty?
+ *
+ * This bit is set by the hypervisor when the memory described by the
+ * translation is written for the first time.  It is never cleared by
+ * the hypervisor, but may be cleared by the client.  After the bit
+ * has been cleared, subsequent references are not guaranteed to set
+ * it again until the translation has been flushed from the TLB.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_DIRTY                 (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)
+
+/** Migrating bit in PTE.
+ *
+ * This bit is guaranteed not to be inspected or modified by the
+ * hypervisor.  The name is indicative of the suggested use by the client
+ * to tag pages whose L3 cache is being migrated from one cpu to another.
+ */
+#define HV_PTE_MIGRATING             (__HV_PTE_ONE << HV_PTE_INDEX_MIGRATING)
+
+/** Client-private bit in PTE.
+ *
+ * This bit is guaranteed not to be inspected or modified by the
+ * hypervisor.
+ */
+#define HV_PTE_CLIENT0               (__HV_PTE_ONE << HV_PTE_INDEX_CLIENT0)
+
+/** Client-private bit in PTE.
+ *
+ * This bit is guaranteed not to be inspected or modified by the
+ * hypervisor.
+ */
+#define HV_PTE_CLIENT1               (__HV_PTE_ONE << HV_PTE_INDEX_CLIENT1)
+
+/** Client-private bit in PTE.
+ *
+ * This bit is guaranteed not to be inspected or modified by the
+ * hypervisor.
+ */
+#define HV_PTE_CLIENT2               (__HV_PTE_ONE << HV_PTE_INDEX_CLIENT2)
+
+/** Non-coherent (NC) bit in PTE.
+ *
+ * If this bit is set, the mapping that is set up will be non-coherent
+ * (also known as non-inclusive).  This means that changes to the L3
+ * cache will not cause a local copy to be invalidated.  It is generally
+ * recommended only for read-only mappings.
+ *
+ * In level-1 PTEs, if the Page bit is clear, this bit determines how the
+ * level-2 page table is accessed.
+ */
+#define HV_PTE_NC                    (__HV_PTE_ONE << HV_PTE_INDEX_NC)
+
+/** Is this page prevented from filling the L1$?
+ *
+ * If this bit is set, the page described by the PTE will not be cached
+ * the local cpu's L1 cache.
+ *
+ * If CHIP_HAS_NC_AND_NOALLOC_BITS() is not true in <chip.h> for this chip,
+ * it is illegal to use this attribute, and may cause client termination.
+ *
+ * In level-1 PTEs, if the Page bit is clear, this bit
+ * determines how the level-2 page table is accessed.
+ */
+#define HV_PTE_NO_ALLOC_L1           (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)
+
+/** Is this page prevented from filling the L2$?
+ *
+ * If this bit is set, the page described by the PTE will not be cached
+ * the local cpu's L2 cache.
+ *
+ * If CHIP_HAS_NC_AND_NOALLOC_BITS() is not true in <chip.h> for this chip,
+ * it is illegal to use this attribute, and may cause client termination.
+ *
+ * In level-1 PTEs, if the Page bit is clear, this bit determines how the
+ * level-2 page table is accessed.
+ */
+#define HV_PTE_NO_ALLOC_L2           (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)
+
+/** Is this a priority page?
+ *
+ * If this bit is set, the page described by the PTE will be given
+ * priority in the cache.  Normally this translates into allowing the
+ * page to use only the "red" half of the cache.  The client may wish to
+ * then use the hv_set_caching service to specify that other pages which
+ * alias this page will use only the "black" half of the cache.
+ *
+ * If the Cached Priority bit is clear, the hypervisor uses the
+ * current hv_set_caching() value to choose how to cache the page.
+ *
+ * It is illegal to set the Cached Priority bit if the Non-Cached bit
+ * is set and the Cached Remotely bit is clear, i.e. if requests to
+ * the page map directly to memory.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_CACHED_PRIORITY       (__HV_PTE_ONE << \
+                                      HV_PTE_INDEX_CACHED_PRIORITY)
+
+/** Is this a readable mapping?
+ *
+ * If this bit is set, code will be permitted to read from (e.g.,
+ * issue load instructions against) the virtual addresses mapped by
+ * this PTE.
+ *
+ * It is illegal for this bit to be clear if the Writable bit is set.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_READABLE              (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)
+
+/** Is this a writable mapping?
+ *
+ * If this bit is set, code will be permitted to write to (e.g., issue
+ * store instructions against) the virtual addresses mapped by this
+ * PTE.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_WRITABLE              (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)
+
+/** Is this an executable mapping?
+ *
+ * If this bit is set, code will be permitted to execute from
+ * (e.g., jump to) the virtual addresses mapped by this PTE.
+ *
+ * This bit applies to any processor on the tile, if there are more
+ * than one.
+ *
+ * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ */
+#define HV_PTE_EXECUTABLE            (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)
+
+/** The width of a LOTAR's x or y bitfield. */
+#define HV_LOTAR_WIDTH 11
+
+/** Converts an x,y pair to a LOTAR value. */
+#define HV_XY_TO_LOTAR(x, y) ((HV_LOTAR)(((x) << HV_LOTAR_WIDTH) | (y)))
+
+/** Extracts the X component of a lotar. */
+#define HV_LOTAR_X(lotar) ((lotar) >> HV_LOTAR_WIDTH)
+
+/** Extracts the Y component of a lotar. */
+#define HV_LOTAR_Y(lotar) ((lotar) & ((1 << HV_LOTAR_WIDTH) - 1))
+
+#ifndef __ASSEMBLER__
+
+/** Define accessor functions for a PTE bit. */
+#define _HV_BIT(name, bit)                                      \
+static __inline int                                             \
+hv_pte_get_##name(HV_PTE pte)                                   \
+{                                                               \
+  return (pte.val >> HV_PTE_INDEX_##bit) & 1;                   \
+}                                                               \
+                                                                \
+static __inline HV_PTE                                          \
+hv_pte_set_##name(HV_PTE pte)                                   \
+{                                                               \
+  pte.val |= 1ULL << HV_PTE_INDEX_##bit;                        \
+  return pte;                                                   \
+}                                                               \
+                                                                \
+static __inline HV_PTE                                          \
+hv_pte_clear_##name(HV_PTE pte)                                 \
+{                                                               \
+  pte.val &= ~(1ULL << HV_PTE_INDEX_##bit);                     \
+  return pte;                                                   \
+}
+
+/* Generate accessors to get, set, and clear various PTE flags.
+ */
+_HV_BIT(present,         PRESENT)
+_HV_BIT(page,            PAGE)
+_HV_BIT(super,           SUPER)
+_HV_BIT(client0,         CLIENT0)
+_HV_BIT(client1,         CLIENT1)
+_HV_BIT(client2,         CLIENT2)
+_HV_BIT(migrating,       MIGRATING)
+_HV_BIT(nc,              NC)
+_HV_BIT(readable,        READABLE)
+_HV_BIT(writable,        WRITABLE)
+_HV_BIT(executable,      EXECUTABLE)
+_HV_BIT(accessed,        ACCESSED)
+_HV_BIT(dirty,           DIRTY)
+_HV_BIT(no_alloc_l1,     NO_ALLOC_L1)
+_HV_BIT(no_alloc_l2,     NO_ALLOC_L2)
+_HV_BIT(cached_priority, CACHED_PRIORITY)
+_HV_BIT(global,          GLOBAL)
+_HV_BIT(user,            USER)
+
+#undef _HV_BIT
+
+/** Get the page mode from the PTE.
+ *
+ * This field generally determines whether and how accesses to the page
+ * are cached; the HV_PTE_MODE_xxx symbols define the legal values for the
+ * page mode.  The NC, NO_ALLOC_L1, and NO_ALLOC_L2 bits modify this
+ * general policy.
+ */
+static __inline unsigned int
+hv_pte_get_mode(const HV_PTE pte)
+{
+  return (((__hv32) pte.val) >> HV_PTE_INDEX_MODE) &
+         ((1 << HV_PTE_MODE_BITS) - 1);
+}
+
+/** Set the page mode into a PTE.  See hv_pte_get_mode. */
+static __inline HV_PTE
+hv_pte_set_mode(HV_PTE pte, unsigned int val)
+{
+  pte.val &= ~(((1ULL << HV_PTE_MODE_BITS) - 1) << HV_PTE_INDEX_MODE);
+  pte.val |= val << HV_PTE_INDEX_MODE;
+  return pte;
+}
+
+/** Get the page frame number from the PTE.
+ *
+ * This field contains the upper bits of the CPA (client physical
+ * address) of the target page; the complete CPA is this field with
+ * HV_LOG2_PAGE_TABLE_ALIGN zero bits appended to it.
+ *
+ * For all PTEs in the lowest-level page table, and for all PTEs with
+ * the Page bit set in all page tables, the CPA must be aligned modulo
+ * the relevant page size.
+ */
+static __inline unsigned long
+hv_pte_get_ptfn(const HV_PTE pte)
+{
+  return pte.val >> HV_PTE_INDEX_PTFN;
+}
+
+/** Set the page table frame number into a PTE.  See hv_pte_get_ptfn. */
+static __inline HV_PTE
+hv_pte_set_ptfn(HV_PTE pte, unsigned long val)
+{
+  pte.val &= ~(((1ULL << HV_PTE_PTFN_BITS)-1) << HV_PTE_INDEX_PTFN);
+  pte.val |= (__hv64) val << HV_PTE_INDEX_PTFN;
+  return pte;
+}
+
+/** Get the client physical address from the PTE.  See hv_pte_set_ptfn. */
+static __inline HV_PhysAddr
+hv_pte_get_pa(const HV_PTE pte)
+{
+  return (__hv64) hv_pte_get_ptfn(pte) << HV_LOG2_PAGE_TABLE_ALIGN;
+}
+
+/** Set the client physical address into a PTE.  See hv_pte_get_ptfn. */
+static __inline HV_PTE
+hv_pte_set_pa(HV_PTE pte, HV_PhysAddr pa)
+{
+  return hv_pte_set_ptfn(pte, pa >> HV_LOG2_PAGE_TABLE_ALIGN);
+}
+
+
+/** Get the remote tile caching this page.
+ *
+ * Specifies the remote tile which is providing the L3 cache for this page.
+ *
+ * This field is ignored unless the page mode is HV_PTE_MODE_CACHE_TILE_L3.
+ *
+ * In level-1 PTEs, if the Page bit is clear, this field determines how the
+ * level-2 page table is accessed.
+ */
+static __inline unsigned int
+hv_pte_get_lotar(const HV_PTE pte)
+{
+  unsigned int lotar = ((__hv32) pte.val) >> HV_PTE_INDEX_LOTAR;
+
+  return HV_XY_TO_LOTAR( (lotar >> (HV_PTE_LOTAR_BITS / 2)),
+                         (lotar & ((1 << (HV_PTE_LOTAR_BITS / 2)) - 1)) );
+}
+
+
+/** Set the remote tile caching a page into a PTE.  See hv_pte_get_lotar. */
+static __inline HV_PTE
+hv_pte_set_lotar(HV_PTE pte, unsigned int val)
+{
+  unsigned int x = HV_LOTAR_X(val);
+  unsigned int y = HV_LOTAR_Y(val);
+
+  pte.val &= ~(((1ULL << HV_PTE_LOTAR_BITS)-1) << HV_PTE_INDEX_LOTAR);
+  pte.val |= (x << (HV_PTE_INDEX_LOTAR + HV_PTE_LOTAR_BITS / 2)) |
+             (y << HV_PTE_INDEX_LOTAR);
+  return pte;
+}
+
+#endif  /* !__ASSEMBLER__ */
+
+/** Converts a client physical address to a ptfn. */
+#define HV_CPA_TO_PTFN(p) ((p) >> HV_LOG2_PAGE_TABLE_ALIGN)
+
+/** Converts a ptfn to a client physical address. */
+#define HV_PTFN_TO_CPA(p) (((HV_PhysAddr)(p)) << HV_LOG2_PAGE_TABLE_ALIGN)
+
+#if CHIP_VA_WIDTH() > 32
+
+/*
+ * Note that we currently do not allow customizing the page size
+ * of the L0 pages, but fix them at 4GB, so we do not use the
+ * "_HV_xxx" nomenclature for the L0 macros.
+ */
+
+/** Log number of HV_PTE entries in L0 page table */
+#define HV_LOG2_L0_ENTRIES (CHIP_VA_WIDTH() - HV_LOG2_L1_SPAN)
+
+/** Number of HV_PTE entries in L0 page table */
+#define HV_L0_ENTRIES (1 << HV_LOG2_L0_ENTRIES)
+
+/** Log size of L0 page table in bytes */
+#define HV_LOG2_L0_SIZE (HV_LOG2_PTE_SIZE + HV_LOG2_L0_ENTRIES)
+
+/** Size of L0 page table in bytes */
+#define HV_L0_SIZE (1 << HV_LOG2_L0_SIZE)
+
+#ifdef __ASSEMBLER__
+
+/** Index in L0 for a specific VA */
+#define HV_L0_INDEX(va) \
+  (((va) >> HV_LOG2_L1_SPAN) & (HV_L0_ENTRIES - 1))
+
+#else
+
+/** Index in L1 for a specific VA */
+#define HV_L0_INDEX(va) \
+  (((HV_VirtAddr)(va) >> HV_LOG2_L1_SPAN) & (HV_L0_ENTRIES - 1))
+
+#endif
+
+#endif /* CHIP_VA_WIDTH() > 32 */
+
+/** Log number of HV_PTE entries in L1 page table */
+#define _HV_LOG2_L1_ENTRIES(log2_page_size_large) \
+  (HV_LOG2_L1_SPAN - log2_page_size_large)
+
+/** Number of HV_PTE entries in L1 page table */
+#define _HV_L1_ENTRIES(log2_page_size_large) \
+  (1 << _HV_LOG2_L1_ENTRIES(log2_page_size_large))
+
+/** Log size of L1 page table in bytes */
+#define _HV_LOG2_L1_SIZE(log2_page_size_large) \
+  (HV_LOG2_PTE_SIZE + _HV_LOG2_L1_ENTRIES(log2_page_size_large))
+
+/** Size of L1 page table in bytes */
+#define _HV_L1_SIZE(log2_page_size_large) \
+  (1 << _HV_LOG2_L1_SIZE(log2_page_size_large))
+
+/** Log number of HV_PTE entries in level-2 page table */
+#define _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small) \
+  (log2_page_size_large - log2_page_size_small)
+
+/** Number of HV_PTE entries in level-2 page table */
+#define _HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) \
+  (1 << _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small))
+
+/** Log size of level-2 page table in bytes */
+#define _HV_LOG2_L2_SIZE(log2_page_size_large, log2_page_size_small) \
+  (HV_LOG2_PTE_SIZE + \
+   _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small))
+
+/** Size of level-2 page table in bytes */
+#define _HV_L2_SIZE(log2_page_size_large, log2_page_size_small) \
+  (1 << _HV_LOG2_L2_SIZE(log2_page_size_large, log2_page_size_small))
+
+#ifdef __ASSEMBLER__
+
+#if CHIP_VA_WIDTH() > 32
+
+/** Index in L1 for a specific VA */
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((va) >> log2_page_size_large) & (_HV_L1_ENTRIES(log2_page_size_large) - 1))
+
+#else /* CHIP_VA_WIDTH() > 32 */
+
+/** Index in L1 for a specific VA */
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((va) >> log2_page_size_large))
+
+#endif /* CHIP_VA_WIDTH() > 32 */
+
+/** Index in level-2 page table for a specific VA */
+#define _HV_L2_INDEX(va, log2_page_size_large, log2_page_size_small) \
+  (((va) >> log2_page_size_small) & \
+   (_HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) - 1))
+
+#else /* __ASSEMBLER __ */
+
+#if CHIP_VA_WIDTH() > 32
+
+/** Index in L1 for a specific VA */
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((HV_VirtAddr)(va) >> log2_page_size_large) & \
+   (_HV_L1_ENTRIES(log2_page_size_large) - 1))
+
+#else /* CHIP_VA_WIDTH() > 32 */
+
+/** Index in L1 for a specific VA */
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((HV_VirtAddr)(va) >> log2_page_size_large))
+
+#endif /* CHIP_VA_WIDTH() > 32 */
+
+/** Index in level-2 page table for a specific VA */
+#define _HV_L2_INDEX(va, log2_page_size_large, log2_page_size_small) \
+  (((HV_VirtAddr)(va) >> log2_page_size_small) & \
+   (_HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) - 1))
+
+#endif /* __ASSEMBLER __ */
+
+/** Position of the PFN field within the PTE (subset of the PTFN). */
+#define _HV_PTE_INDEX_PFN(log2_page_size) \
+  (HV_PTE_INDEX_PTFN + (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Length of the PFN field within the PTE (subset of the PTFN). */
+#define _HV_PTE_INDEX_PFN_BITS(log2_page_size) \
+  (HV_PTE_INDEX_PTFN_BITS - (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Converts a client physical address to a pfn. */
+#define _HV_CPA_TO_PFN(p, log2_page_size) ((p) >> log2_page_size)
+
+/** Converts a pfn to a client physical address. */
+#define _HV_PFN_TO_CPA(p, log2_page_size) \
+  (((HV_PhysAddr)(p)) << log2_page_size)
+
+/** Converts a ptfn to a pfn. */
+#define _HV_PTFN_TO_PFN(p, log2_page_size) \
+  ((p) >> (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Converts a pfn to a ptfn. */
+#define _HV_PFN_TO_PTFN(p, log2_page_size) \
+  ((p) << (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+#endif /* _HV_HV_H */
diff --git a/arch/tile/include/hv/iorpc.h b/arch/tile/include/hv/iorpc.h
new file mode 100644
index 000000000..ddf160448
--- /dev/null
+++ b/arch/tile/include/hv/iorpc.h
@@ -0,0 +1,714 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _HV_IORPC_H_
+#define _HV_IORPC_H_
+
+/**
+ *
+ * Error codes and struct definitions for the IO RPC library.
+ *
+ * The hypervisor's IO RPC component provides a convenient way for
+ * driver authors to proxy system calls between user space, linux, and
+ * the hypervisor driver.  The core of the system is a set of Python
+ * files that take ".idl" files as input and generates the following
+ * source code:
+ *
+ * - _rpc_call() routines for use in userspace IO libraries.  These
+ * routines take an argument list specified in the .idl file, pack the
+ * arguments in to a buffer, and read or write that buffer via the
+ * Linux iorpc driver.
+ *
+ * - dispatch_read() and dispatch_write() routines that hypervisor
+ * drivers can use to implement most of their dev_pread() and
+ * dev_pwrite() methods.  These routines decode the incoming parameter
+ * blob, permission check and translate parameters where appropriate,
+ * and then invoke a callback routine for whichever RPC call has
+ * arrived.  The driver simply implements the set of callback
+ * routines.
+ *
+ * The IO RPC system also includes the Linux 'iorpc' driver, which
+ * proxies calls between the userspace library and the hypervisor
+ * driver.  The Linux driver is almost entirely device agnostic; it
+ * watches for special flags indicating cases where a memory buffer
+ * address might need to be translated, etc.  As a result, driver
+ * writers can avoid many of the problem cases related to registering
+ * hardware resources like memory pages or interrupts.  However, the
+ * drivers must be careful to obey the conventions documented below in
+ * order to work properly with the generic Linux iorpc driver.
+ *
+ * @section iorpc_domains Service Domains
+ *
+ * All iorpc-based drivers must support a notion of service domains.
+ * A service domain is basically an application context - state
+ * indicating resources that are allocated to that particular app
+ * which it may access and (perhaps) other applications may not
+ * access.  Drivers can support any number of service domains they
+ * choose.  In some cases the design is limited by a number of service
+ * domains supported by the IO hardware; in other cases the service
+ * domains are a purely software concept and the driver chooses a
+ * maximum number of domains based on how much state memory it is
+ * willing to preallocate.
+ *
+ * For example, the mPIPE driver only supports as many service domains
+ * as are supported by the mPIPE hardware.  This limitation is
+ * required because the hardware implements its own MMIO protection
+ * scheme to allow large MMIO mappings while still protecting small
+ * register ranges within the page that should only be accessed by the
+ * hypervisor.
+ *
+ * In contrast, drivers with no hardware service domain limitations
+ * (for instance the TRIO shim) can implement an arbitrary number of
+ * service domains.  In these cases, each service domain is limited to
+ * a carefully restricted set of legal MMIO addresses if necessary to
+ * keep one application from corrupting another application's state.
+ *
+ * @section iorpc_conventions System Call Conventions
+ *
+ * The driver's open routine is responsible for allocating a new
+ * service domain for each hv_dev_open() call.  By convention, the
+ * return value from open() should be the service domain number on
+ * success, or GXIO_ERR_NO_SVC_DOM if no more service domains are
+ * available.
+ *
+ * The implementations of hv_dev_pread() and hv_dev_pwrite() are
+ * responsible for validating the devhdl value passed up by the
+ * client.  Since the device handle returned by hv_dev_open() should
+ * embed the positive service domain number, drivers should make sure
+ * that DRV_HDL2BITS(devhdl) is a legal service domain.  If the client
+ * passes an illegal service domain number, the routine should return
+ * GXIO_ERR_INVAL_SVC_DOM.  Once the service domain number has been
+ * validated, the driver can copy to/from the client buffer and call
+ * the dispatch_read() or dispatch_write() methods created by the RPC
+ * generator.
+ *
+ * The hv_dev_close() implementation should reset all service domain
+ * state and put the service domain back on a free list for
+ * reallocation by a future application.  In most cases, this will
+ * require executing a hardware reset or drain flow and denying any
+ * MMIO regions that were created for the service domain.
+ *
+ * @section iorpc_data Special Data Types
+ *
+ * The .idl file syntax allows the creation of syscalls with special
+ * parameters that require permission checks or translations as part
+ * of the system call path.  Because of limitations in the code
+ * generator, APIs are generally limited to just one of these special
+ * parameters per system call, and they are sometimes required to be
+ * the first or last parameter to the call.  Special parameters
+ * include:
+ *
+ * @subsection iorpc_mem_buffer MEM_BUFFER
+ *
+ * The MEM_BUFFER() datatype allows user space to "register" memory
+ * buffers with a device.  Registering memory accomplishes two tasks:
+ * Linux keeps track of all buffers that might be modified by a
+ * hardware device, and the hardware device drivers bind registered
+ * buffers to particular hardware resources like ingress NotifRings.
+ * The MEM_BUFFER() idl syntax can take extra flags like ALIGN_64KB,
+ * ALIGN_SELF_SIZE, and FLAGS indicating that memory buffers must have
+ * certain alignment or that the user should be able to pass a "memory
+ * flags" word specifying attributes like nt_hint or IO cache pinning.
+ * The parser will accept multiple MEM_BUFFER() flags.
+ *
+ * Implementations must obey the following conventions when
+ * registering memory buffers via the iorpc flow.  These rules are a
+ * result of the Linux driver implementation, which needs to keep
+ * track of how many times a particular page has been registered with
+ * the hardware so that it can release the page when all those
+ * registrations are cleared.
+ *
+ * - Memory registrations that refer to a resource which has already
+ * been bound must return GXIO_ERR_ALREADY_INIT.  Thus, it is an
+ * error to register memory twice without resetting (i.e. closing) the
+ * resource in between.  This convention keeps the Linux driver from
+ * having to track which particular devices a page is bound to.
+ *
+ * - At present, a memory registration is only cleared when the
+ * service domain is reset.  In this case, the Linux driver simply
+ * closes the HV device file handle and then decrements the reference
+ * counts of all pages that were previously registered with the
+ * device.
+ *
+ * - In the future, we may add a mechanism for unregistering memory.
+ * One possible implementation would require that the user specify
+ * which buffer is currently registered.  The HV would then verify
+ * that that page was actually the one currently mapped and return
+ * success or failure to Linux, which would then only decrement the
+ * page reference count if the addresses were mapped.  Another scheme
+ * might allow Linux to pass a token to the HV to be returned when the
+ * resource is unmapped.
+ *
+ * @subsection iorpc_interrupt INTERRUPT
+ *
+ * The INTERRUPT .idl datatype allows the client to bind hardware
+ * interrupts to a particular combination of IPI parameters - CPU, IPI
+ * PL, and event bit number.  This data is passed via a special
+ * datatype so that the Linux driver can validate the CPU and PL and
+ * the HV generic iorpc code can translate client CPUs to real CPUs.
+ *
+ * @subsection iorpc_pollfd_setup POLLFD_SETUP
+ *
+ * The POLLFD_SETUP .idl datatype allows the client to set up hardware
+ * interrupt bindings which are received by Linux but which are made
+ * visible to user processes as state transitions on a file descriptor;
+ * this allows user processes to use Linux primitives, such as poll(), to
+ * await particular hardware events.  This data is passed via a special
+ * datatype so that the Linux driver may recognize the pollable file
+ * descriptor and translate it to a set of interrupt target information,
+ * and so that the HV generic iorpc code can translate client CPUs to real
+ * CPUs.
+ *
+ * @subsection iorpc_pollfd POLLFD
+ *
+ * The POLLFD .idl datatype allows manipulation of hardware interrupt
+ * bindings set up via the POLLFD_SETUP datatype; common operations are
+ * resetting the state of the requested interrupt events, and unbinding any
+ * bound interrupts.  This data is passed via a special datatype so that
+ * the Linux driver may recognize the pollable file descriptor and
+ * translate it to an interrupt identifier previously supplied by the
+ * hypervisor as the result of an earlier pollfd_setup operation.
+ *
+ * @subsection iorpc_blob BLOB
+ *
+ * The BLOB .idl datatype allows the client to write an arbitrary
+ * length string of bytes up to the hypervisor driver.  This can be
+ * useful for passing up large, arbitrarily structured data like
+ * classifier programs.  The iorpc stack takes care of validating the
+ * buffer VA and CPA as the data passes up to the hypervisor.  Unlike
+ * MEM_BUFFER(), the buffer is not registered - Linux does not bump
+ * page refcounts and the HV driver should not reuse the buffer once
+ * the system call is complete.
+ *
+ * @section iorpc_translation Translating User Space Calls
+ *
+ * The ::iorpc_offset structure describes the formatting of the offset
+ * that is passed to pread() or pwrite() as part of the generated RPC code.
+ * When the user calls up to Linux, the rpc code fills in all the fields of
+ * the offset, including a 16-bit opcode, a 16 bit format indicator, and 32
+ * bits of user-specified "sub-offset".  The opcode indicates which syscall
+ * is being requested.  The format indicates whether there is a "prefix
+ * struct" at the start of the memory buffer passed to pwrite(), and if so
+ * what data is in that prefix struct.  These prefix structs are used to
+ * implement special datatypes like MEM_BUFFER() and INTERRUPT - we arrange
+ * to put data that needs translation and permission checks at the start of
+ * the buffer so that the Linux driver and generic portions of the HV iorpc
+ * code can easily access the data.  The 32 bits of user-specified
+ * "sub-offset" are most useful for pread() calls where the user needs to
+ * also pass in a few bits indicating which register to read, etc.
+ *
+ * The Linux iorpc driver watches for system calls that contain prefix
+ * structs so that it can translate parameters and bump reference
+ * counts as appropriate.  It does not (currently) have any knowledge
+ * of the per-device opcodes - it doesn't care what operation you're
+ * doing to mPIPE, so long as it can do all the generic book-keeping.
+ * The hv/iorpc.h header file defines all of the generic encoding bits
+ * needed to translate iorpc calls without knowing which particular
+ * opcode is being issued.
+ *
+ * @section iorpc_globals Global iorpc Calls
+ *
+ * Implementing mmap() required adding some special iorpc syscalls
+ * that are only called by the Linux driver, never by userspace.
+ * These include get_mmio_base() and check_mmio_offset().  These
+ * routines are described in globals.idl and must be included in every
+ * iorpc driver.  By providing these routines in every driver, Linux's
+ * mmap implementation can easily get the PTE bits it needs and
+ * validate the PA offset without needing to know the per-device
+ * opcodes to perform those tasks.
+ *
+ * @section iorpc_kernel Supporting gxio APIs in the Kernel
+ *
+ * The iorpc code generator also supports generation of kernel code
+ * implementing the gxio APIs.  This capability is currently used by
+ * the mPIPE network driver, and will likely be used by the TRIO root
+ * complex and endpoint drivers and perhaps an in-kernel crypto
+ * driver.  Each driver that wants to instantiate iorpc calls in the
+ * kernel needs to generate a kernel version of the generate rpc code
+ * and (probably) copy any related gxio source files into the kernel.
+ * The mPIPE driver provides a good example of this pattern.
+ */
+
+#ifdef __KERNEL__
+#include <linux/stddef.h>
+#else
+#include <stddef.h>
+#endif
+
+#if defined(__HV__)
+#include <hv/hypervisor.h>
+#elif defined(__KERNEL__)
+#include <hv/hypervisor.h>
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+
+/** Code indicating translation services required within the RPC path.
+ * These indicate whether there is a translatable struct at the start
+ * of the RPC buffer and what information that struct contains.
+ */
+enum iorpc_format_e
+{
+  /** No translation required, no prefix struct. */
+  IORPC_FORMAT_NONE,
+
+  /** No translation required, no prefix struct, no access to this
+   *  operation from user space. */
+  IORPC_FORMAT_NONE_NOUSER,
+
+  /** Prefix struct contains user VA and size. */
+  IORPC_FORMAT_USER_MEM,
+
+  /** Prefix struct contains CPA, size, and homing bits. */
+  IORPC_FORMAT_KERNEL_MEM,
+
+  /** Prefix struct contains interrupt. */
+  IORPC_FORMAT_KERNEL_INTERRUPT,
+
+  /** Prefix struct contains user-level interrupt. */
+  IORPC_FORMAT_USER_INTERRUPT,
+
+  /** Prefix struct contains pollfd_setup (interrupt information). */
+  IORPC_FORMAT_KERNEL_POLLFD_SETUP,
+
+  /** Prefix struct contains user-level pollfd_setup (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD_SETUP,
+
+  /** Prefix struct contains pollfd (interrupt cookie). */
+  IORPC_FORMAT_KERNEL_POLLFD,
+
+  /** Prefix struct contains user-level pollfd (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD,
+};
+
+
+/** Generate an opcode given format and code. */
+#define IORPC_OPCODE(FORMAT, CODE) (((FORMAT) << 16) | (CODE))
+
+/** The offset passed through the read() and write() system calls
+    combines an opcode with 32 bits of user-specified offset. */
+union iorpc_offset
+{
+#ifndef __BIG_ENDIAN__
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint16_t code;              /**< RPC code. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint32_t sub_offset;        /**< caller-specified offset. */
+  };
+
+  uint32_t opcode;              /**< Opcode combines code & format. */
+#else
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint32_t sub_offset;        /**< caller-specified offset. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint16_t code;              /**< RPC code. */
+  };
+
+  struct
+  {
+    uint32_t padding;
+    uint32_t opcode;              /**< Opcode combines code & format. */
+  };
+#endif
+};
+
+
+/** Homing and cache hinting bits that can be used by IO devices. */
+struct iorpc_mem_attr
+{
+  unsigned int lotar_x:4;       /**< lotar X bits (or Gx page_mask). */
+  unsigned int lotar_y:4;       /**< lotar Y bits (or Gx page_offset). */
+  unsigned int hfh:1;           /**< Uses hash-for-home. */
+  unsigned int nt_hint:1;       /**< Non-temporal hint. */
+  unsigned int io_pin:1;        /**< Only fill 'IO' cache ways. */
+};
+
+/** Set the nt_hint bit. */
+#define IORPC_MEM_BUFFER_FLAG_NT_HINT (1 << 0)
+
+/** Set the IO pin bit. */
+#define IORPC_MEM_BUFFER_FLAG_IO_PIN (1 << 1)
+
+
+/** A structure used to describe memory registration.  Different
+    protection levels describe memory differently, so this union
+    contains all the different possible descriptions.  As a request
+    moves up the call chain, each layer translates from one
+    description format to the next.  In particular, the Linux iorpc
+    driver translates user VAs into CPAs and homing parameters. */
+union iorpc_mem_buffer
+{
+  struct
+  {
+    uint64_t va;                /**< User virtual address. */
+    uint64_t size;              /**< Buffer size. */
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  user;                         /**< Buffer as described by user apps. */
+
+  struct
+  {
+    unsigned long long cpa;     /**< Client physical address. */
+#if defined(__KERNEL__) || defined(__HV__)
+    size_t size;                /**< Buffer size. */
+    HV_PTE pte;                 /**< PTE describing memory homing. */
+#else
+    uint64_t size;
+    uint64_t pte;
+#endif
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  kernel;                       /**< Buffer as described by kernel. */
+
+  struct
+  {
+    unsigned long long pa;      /**< Physical address. */
+    size_t size;                /**< Buffer size. */
+    struct iorpc_mem_attr attr;      /**< Homing and locality hint bits. */
+  }
+  hv;                           /**< Buffer parameters for HV driver. */
+};
+
+
+/** A structure used to describe interrupts.  The format differs slightly
+ *  for user and kernel interrupts.  As with the mem_buffer_t, translation
+ *  between the formats is done at each level. */
+union iorpc_interrupt
+{
+  struct
+  {
+    int cpu;   /**< CPU. */
+    int event; /**< evt_num */
+  }
+  user;        /**< Interrupt as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< evt_num */
+  }
+  kernel;      /**< Interrupt as described by the kernel. */
+
+};
+
+
+/** A structure used to describe interrupts used with poll().  The format
+ *  differs significantly for requests from user to kernel, and kernel to
+ *  hypervisor.  As with the mem_buffer_t, translation between the formats
+ *  is done at each level. */
+union iorpc_pollfd_setup
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd_setup as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< evt_num */
+  }
+  kernel;      /**< pollfd_setup as described by the kernel. */
+
+};
+
+
+/** A structure used to describe previously set up interrupts used with
+ *  poll().  The format differs significantly for requests from user to
+ *  kernel, and kernel to hypervisor.  As with the mem_buffer_t, translation
+ *  between the formats is done at each level. */
+union iorpc_pollfd
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd as described by user applications. */
+
+  struct
+  {
+    int cookie; /**< hv cookie returned by the pollfd_setup operation. */
+  }
+  kernel;      /**< pollfd as described by the kernel. */
+
+};
+
+
+/** The various iorpc devices use error codes from -1100 to -1299.
+ *
+ * This range is distinct from netio (-700 to -799), the hypervisor
+ * (-800 to -899), tilepci (-900 to -999), ilib (-1000 to -1099),
+ * gxcr (-1300 to -1399) and gxpci (-1400 to -1499).
+ */
+enum gxio_err_e {
+
+  /** Largest iorpc error number. */
+  GXIO_ERR_MAX = -1101,
+
+
+  /********************************************************/
+  /*                   Generic Error Codes                */
+  /********************************************************/
+
+  /** Bad RPC opcode - possible version incompatibility. */
+  GXIO_ERR_OPCODE = -1101,
+
+  /** Invalid parameter. */
+  GXIO_ERR_INVAL = -1102,
+
+  /** Memory buffer did not meet alignment requirements. */
+  GXIO_ERR_ALIGNMENT = -1103,
+
+  /** Memory buffers must be coherent and cacheable. */
+  GXIO_ERR_COHERENCE = -1104,
+
+  /** Resource already initialized. */
+  GXIO_ERR_ALREADY_INIT = -1105,
+
+  /** No service domains available. */
+  GXIO_ERR_NO_SVC_DOM = -1106,
+
+  /** Illegal service domain number. */
+  GXIO_ERR_INVAL_SVC_DOM = -1107,
+
+  /** Illegal MMIO address. */
+  GXIO_ERR_MMIO_ADDRESS = -1108,
+
+  /** Illegal interrupt binding. */
+  GXIO_ERR_INTERRUPT = -1109,
+
+  /** Unreasonable client memory. */
+  GXIO_ERR_CLIENT_MEMORY = -1110,
+
+  /** No more IOTLB entries. */
+  GXIO_ERR_IOTLB_ENTRY = -1111,
+
+  /** Invalid memory size. */
+  GXIO_ERR_INVAL_MEMORY_SIZE = -1112,
+
+  /** Unsupported operation. */
+  GXIO_ERR_UNSUPPORTED_OP = -1113,
+
+  /** Insufficient DMA credits. */
+  GXIO_ERR_DMA_CREDITS = -1114,
+
+  /** Operation timed out. */
+  GXIO_ERR_TIMEOUT = -1115,
+
+  /** No such device or object. */
+  GXIO_ERR_NO_DEVICE = -1116,
+
+  /** Device or resource busy. */
+  GXIO_ERR_BUSY = -1117,
+
+  /** I/O error. */
+  GXIO_ERR_IO = -1118,
+
+  /** Permissions error. */
+  GXIO_ERR_PERM = -1119,
+
+
+
+  /********************************************************/
+  /*                 Test Device Error Codes              */
+  /********************************************************/
+
+  /** Illegal register number. */
+  GXIO_TEST_ERR_REG_NUMBER = -1120,
+
+  /** Illegal buffer slot. */
+  GXIO_TEST_ERR_BUFFER_SLOT = -1121,
+
+
+  /********************************************************/
+  /*                    MPIPE Error Codes                 */
+  /********************************************************/
+
+
+  /** Invalid buffer size. */
+  GXIO_MPIPE_ERR_INVAL_BUFFER_SIZE = -1131,
+
+  /** Cannot allocate buffer stack. */
+  GXIO_MPIPE_ERR_NO_BUFFER_STACK = -1140,
+
+  /** Invalid buffer stack number. */
+  GXIO_MPIPE_ERR_BAD_BUFFER_STACK = -1141,
+
+  /** Cannot allocate NotifRing. */
+  GXIO_MPIPE_ERR_NO_NOTIF_RING = -1142,
+
+  /** Invalid NotifRing number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_RING = -1143,
+
+  /** Cannot allocate NotifGroup. */
+  GXIO_MPIPE_ERR_NO_NOTIF_GROUP = -1144,
+
+  /** Invalid NotifGroup number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_GROUP = -1145,
+
+  /** Cannot allocate bucket. */
+  GXIO_MPIPE_ERR_NO_BUCKET = -1146,
+
+  /** Invalid bucket number. */
+  GXIO_MPIPE_ERR_BAD_BUCKET = -1147,
+
+  /** Cannot allocate eDMA ring. */
+  GXIO_MPIPE_ERR_NO_EDMA_RING = -1148,
+
+  /** Invalid eDMA ring number. */
+  GXIO_MPIPE_ERR_BAD_EDMA_RING = -1149,
+
+  /** Invalid channel number. */
+  GXIO_MPIPE_ERR_BAD_CHANNEL = -1150,
+
+  /** Bad configuration. */
+  GXIO_MPIPE_ERR_BAD_CONFIG = -1151,
+
+  /** Empty iqueue. */
+  GXIO_MPIPE_ERR_IQUEUE_EMPTY = -1152,
+
+  /** Empty rules. */
+  GXIO_MPIPE_ERR_RULES_EMPTY = -1160,
+
+  /** Full rules. */
+  GXIO_MPIPE_ERR_RULES_FULL = -1161,
+
+  /** Corrupt rules. */
+  GXIO_MPIPE_ERR_RULES_CORRUPT = -1162,
+
+  /** Invalid rules. */
+  GXIO_MPIPE_ERR_RULES_INVALID = -1163,
+
+  /** Classifier is too big. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_BIG = -1170,
+
+  /** Classifier is too complex. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_COMPLEX = -1171,
+
+  /** Classifier has bad header. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_HEADER = -1172,
+
+  /** Classifier has bad contents. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_CONTENTS = -1173,
+
+  /** Classifier encountered invalid symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_SYMBOL = -1174,
+
+  /** Classifier encountered invalid bounds. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_BOUNDS = -1175,
+
+  /** Classifier encountered invalid relocation. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_RELOCATION = -1176,
+
+  /** Classifier encountered undefined symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_UNDEF_SYMBOL = -1177,
+
+
+  /********************************************************/
+  /*                    TRIO  Error Codes                 */
+  /********************************************************/
+
+  /** Cannot allocate memory map region. */
+  GXIO_TRIO_ERR_NO_MEMORY_MAP = -1180,
+
+  /** Invalid memory map region number. */
+  GXIO_TRIO_ERR_BAD_MEMORY_MAP = -1181,
+
+  /** Cannot allocate scatter queue. */
+  GXIO_TRIO_ERR_NO_SCATTER_QUEUE = -1182,
+
+  /** Invalid scatter queue number. */
+  GXIO_TRIO_ERR_BAD_SCATTER_QUEUE = -1183,
+
+  /** Cannot allocate push DMA ring. */
+  GXIO_TRIO_ERR_NO_PUSH_DMA_RING = -1184,
+
+  /** Invalid push DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PUSH_DMA_RING = -1185,
+
+  /** Cannot allocate pull DMA ring. */
+  GXIO_TRIO_ERR_NO_PULL_DMA_RING = -1186,
+
+  /** Invalid pull DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PULL_DMA_RING = -1187,
+
+  /** Cannot allocate PIO region. */
+  GXIO_TRIO_ERR_NO_PIO = -1188,
+
+  /** Invalid PIO region index. */
+  GXIO_TRIO_ERR_BAD_PIO = -1189,
+
+  /** Cannot allocate ASID. */
+  GXIO_TRIO_ERR_NO_ASID = -1190,
+
+  /** Invalid ASID. */
+  GXIO_TRIO_ERR_BAD_ASID = -1191,
+
+
+  /********************************************************/
+  /*                    MICA Error Codes                  */
+  /********************************************************/
+
+  /** No such accelerator type. */
+  GXIO_MICA_ERR_BAD_ACCEL_TYPE = -1220,
+
+  /** Cannot allocate context. */
+  GXIO_MICA_ERR_NO_CONTEXT = -1221,
+
+  /** PKA command queue is full, can't add another command. */
+  GXIO_MICA_ERR_PKA_CMD_QUEUE_FULL = -1222,
+
+  /** PKA result queue is empty, can't get a result from the queue. */
+  GXIO_MICA_ERR_PKA_RESULT_QUEUE_EMPTY = -1223,
+
+  /********************************************************/
+  /*                    GPIO Error Codes                  */
+  /********************************************************/
+
+  /** Pin not available.  Either the physical pin does not exist, or
+   *  it is reserved by the hypervisor for system usage. */
+  GXIO_GPIO_ERR_PIN_UNAVAILABLE = -1240,
+
+  /** Pin busy.  The pin exists, and is available for use via GXIO, but
+   *  it has been attached by some other process or driver. */
+  GXIO_GPIO_ERR_PIN_BUSY = -1241,
+
+  /** Cannot access unattached pin.  One or more of the pins being
+   *  manipulated by this call are not attached to the requesting
+   *  context. */
+  GXIO_GPIO_ERR_PIN_UNATTACHED = -1242,
+
+  /** Invalid I/O mode for pin.  The wiring of the pin in the system
+   *  is such that the I/O mode or electrical control parameters
+   *  requested could cause damage. */
+  GXIO_GPIO_ERR_PIN_INVALID_MODE = -1243,
+
+  /** Smallest iorpc error number. */
+  GXIO_ERR_MIN = -1299
+};
+
+
+#endif /* !_HV_IORPC_H_ */
diff --git a/arch/tile/include/hv/netio_errors.h b/arch/tile/include/hv/netio_errors.h
new file mode 100644
index 000000000..e1591bff6
--- /dev/null
+++ b/arch/tile/include/hv/netio_errors.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Error codes returned from NetIO routines.
+ */
+
+#ifndef __NETIO_ERRORS_H__
+#define __NETIO_ERRORS_H__
+
+/**
+ * @addtogroup error
+ *
+ * @brief The error codes returned by NetIO functions.
+ *
+ * NetIO functions return 0 (defined as ::NETIO_NO_ERROR) on success, and
+ * a negative value if an error occurs.
+ *
+ * In cases where a NetIO function failed due to a error reported by
+ * system libraries, the error code will be the negation of the
+ * system errno at the time of failure.  The @ref netio_strerror()
+ * function will deliver error strings for both NetIO and system error
+ * codes.
+ *
+ * @{
+ */
+
+/** The set of all NetIO errors. */
+typedef enum
+{
+  /** Operation successfully completed. */
+  NETIO_NO_ERROR        = 0,
+
+  /** A packet was successfully retrieved from an input queue. */
+  NETIO_PKT             = 0,
+
+  /** Largest NetIO error number. */
+  NETIO_ERR_MAX         = -701,
+
+  /** The tile is not registered with the IPP. */
+  NETIO_NOT_REGISTERED  = -701,
+
+  /** No packet was available to retrieve from the input queue. */
+  NETIO_NOPKT           = -702,
+
+  /** The requested function is not implemented. */
+  NETIO_NOT_IMPLEMENTED = -703,
+
+  /** On a registration operation, the target queue already has the maximum
+   *  number of tiles registered for it, and no more may be added.  On a
+   *  packet send operation, the output queue is full and nothing more can
+   *  be queued until some of the queued packets are actually transmitted. */
+  NETIO_QUEUE_FULL      = -704,
+
+  /** The calling process or thread is not bound to exactly one CPU. */
+  NETIO_BAD_AFFINITY    = -705,
+
+  /** Cannot allocate memory on requested controllers. */
+  NETIO_CANNOT_HOME     = -706,
+
+  /** On a registration operation, the IPP specified is not configured
+   *  to support the options requested; for instance, the application
+   *  wants a specific type of tagged headers which the configured IPP
+   *  doesn't support.  Or, the supplied configuration information is
+   *  not self-consistent, or is out of range; for instance, specifying
+   *  both NETIO_RECV and NETIO_NO_RECV, or asking for more than
+   *  NETIO_MAX_SEND_BUFFERS to be preallocated.  On a VLAN or bucket
+   *  configure operation, the number of items, or the base item, was
+   *  out of range.
+   */
+  NETIO_BAD_CONFIG      = -707,
+
+  /** Too many tiles have registered to transmit packets. */
+  NETIO_TOOMANY_XMIT    = -708,
+
+  /** Packet transmission was attempted on a queue which was registered
+      with transmit disabled. */
+  NETIO_UNREG_XMIT      = -709,
+
+  /** This tile is already registered with the IPP. */
+  NETIO_ALREADY_REGISTERED = -710,
+
+  /** The Ethernet link is down. The application should try again later. */
+  NETIO_LINK_DOWN       = -711,
+
+  /** An invalid memory buffer has been specified.  This may be an unmapped
+   * virtual address, or one which does not meet alignment requirements.
+   * For netio_input_register(), this error may be returned when multiple
+   * processes specify different memory regions to be used for NetIO
+   * buffers.  That can happen if these processes specify explicit memory
+   * regions with the ::NETIO_FIXED_BUFFER_VA flag, or if tmc_cmem_init()
+   * has not been called by a common ancestor of the processes.
+   */
+  NETIO_FAULT           = -712,
+
+  /** Cannot combine user-managed shared memory and cache coherence. */
+  NETIO_BAD_CACHE_CONFIG = -713,
+
+  /** Smallest NetIO error number. */
+  NETIO_ERR_MIN         = -713,
+
+#ifndef __DOXYGEN__
+  /** Used internally to mean that no response is needed; never returned to
+   *  an application. */
+  NETIO_NO_RESPONSE     = 1
+#endif
+} netio_error_t;
+
+/** @} */
+
+#endif /* __NETIO_ERRORS_H__ */
diff --git a/arch/tile/include/hv/netio_intf.h b/arch/tile/include/hv/netio_intf.h
new file mode 100644
index 000000000..8d20972ab
--- /dev/null
+++ b/arch/tile/include/hv/netio_intf.h
@@ -0,0 +1,2975 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * NetIO interface structures and macros.
+ */
+
+#ifndef __NETIO_INTF_H__
+#define __NETIO_INTF_H__
+
+#include <hv/netio_errors.h>
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+#if !defined(__HV__) && !defined(__BOGUX__) && !defined(__KERNEL__)
+#include <assert.h>
+#define netio_assert assert  /**< Enable assertions from macros */
+#else
+#define netio_assert(...) ((void)(0))  /**< Disable assertions from macros */
+#endif
+
+/*
+ * If none of these symbols are defined, we're building libnetio in an
+ * environment where we have pthreads, so we'll enable locking.
+ */
+#if !defined(__HV__) && !defined(__BOGUX__) && !defined(__KERNEL__) && \
+    !defined(__NEWLIB__)
+#define _NETIO_PTHREAD       /**< Include a mutex in netio_queue_t below */
+
+/*
+ * If NETIO_UNLOCKED is defined, we don't do use per-cpu locks on
+ * per-packet NetIO operations.  We still do pthread locking on things
+ * like netio_input_register, though.  This is used for building
+ * libnetio_unlocked.
+ */
+#ifndef NETIO_UNLOCKED
+
+/* Avoid PLT overhead by using our own inlined per-cpu lock. */
+#include <sched.h>
+typedef int _netio_percpu_mutex_t;
+
+static __inline int
+_netio_percpu_mutex_init(_netio_percpu_mutex_t* lock)
+{
+  *lock = 0;
+  return 0;
+}
+
+static __inline int
+_netio_percpu_mutex_lock(_netio_percpu_mutex_t* lock)
+{
+  while (__builtin_expect(__insn_tns(lock), 0))
+    sched_yield();
+  return 0;
+}
+
+static __inline int
+_netio_percpu_mutex_unlock(_netio_percpu_mutex_t* lock)
+{
+  *lock = 0;
+  return 0;
+}
+
+#else /* NETIO_UNLOCKED */
+
+/* Don't do any locking for per-packet NetIO operations. */
+typedef int _netio_percpu_mutex_t;
+#define _netio_percpu_mutex_init(L)
+#define _netio_percpu_mutex_lock(L)
+#define _netio_percpu_mutex_unlock(L)
+
+#endif /* NETIO_UNLOCKED */
+#endif /* !__HV__, !__BOGUX, !__KERNEL__, !__NEWLIB__ */
+
+/** How many tiles can register for a given queue.
+ *  @ingroup setup */
+#define NETIO_MAX_TILES_PER_QUEUE  64
+
+
+/** Largest permissible queue identifier.
+ *  @ingroup setup  */
+#define NETIO_MAX_QUEUE_ID        255
+
+
+#ifndef __DOXYGEN__
+
+/* Metadata packet checksum/ethertype flags. */
+
+/** The L4 checksum has not been calculated. */
+#define _NETIO_PKT_NO_L4_CSUM_SHIFT           0
+#define _NETIO_PKT_NO_L4_CSUM_RMASK           1
+#define _NETIO_PKT_NO_L4_CSUM_MASK \
+         (_NETIO_PKT_NO_L4_CSUM_RMASK << _NETIO_PKT_NO_L4_CSUM_SHIFT)
+
+/** The L3 checksum has not been calculated. */
+#define _NETIO_PKT_NO_L3_CSUM_SHIFT           1
+#define _NETIO_PKT_NO_L3_CSUM_RMASK           1
+#define _NETIO_PKT_NO_L3_CSUM_MASK \
+         (_NETIO_PKT_NO_L3_CSUM_RMASK << _NETIO_PKT_NO_L3_CSUM_SHIFT)
+
+/** The L3 checksum is incorrect (or perhaps has not been calculated). */
+#define _NETIO_PKT_BAD_L3_CSUM_SHIFT          2
+#define _NETIO_PKT_BAD_L3_CSUM_RMASK          1
+#define _NETIO_PKT_BAD_L3_CSUM_MASK \
+         (_NETIO_PKT_BAD_L3_CSUM_RMASK << _NETIO_PKT_BAD_L3_CSUM_SHIFT)
+
+/** The Ethernet packet type is unrecognized. */
+#define _NETIO_PKT_TYPE_UNRECOGNIZED_SHIFT    3
+#define _NETIO_PKT_TYPE_UNRECOGNIZED_RMASK    1
+#define _NETIO_PKT_TYPE_UNRECOGNIZED_MASK \
+         (_NETIO_PKT_TYPE_UNRECOGNIZED_RMASK << \
+          _NETIO_PKT_TYPE_UNRECOGNIZED_SHIFT)
+
+/* Metadata packet type flags. */
+
+/** Where the packet type bits are; this field is the index into
+ *  _netio_pkt_info. */
+#define _NETIO_PKT_TYPE_SHIFT        4
+#define _NETIO_PKT_TYPE_RMASK        0x3F
+
+/** How many VLAN tags the packet has, and, if we have two, which one we
+ *  actually grouped on.  A VLAN within a proprietary (Marvell or Broadcom)
+ *  tag is counted here. */
+#define _NETIO_PKT_VLAN_SHIFT        4
+#define _NETIO_PKT_VLAN_RMASK        0x3
+#define _NETIO_PKT_VLAN_MASK \
+         (_NETIO_PKT_VLAN_RMASK << _NETIO_PKT_VLAN_SHIFT)
+#define _NETIO_PKT_VLAN_NONE         0   /* No VLAN tag. */
+#define _NETIO_PKT_VLAN_ONE          1   /* One VLAN tag. */
+#define _NETIO_PKT_VLAN_TWO_OUTER    2   /* Two VLAN tags, outer one used. */
+#define _NETIO_PKT_VLAN_TWO_INNER    3   /* Two VLAN tags, inner one used. */
+
+/** Which proprietary tags the packet has. */
+#define _NETIO_PKT_TAG_SHIFT         6
+#define _NETIO_PKT_TAG_RMASK         0x3
+#define _NETIO_PKT_TAG_MASK \
+          (_NETIO_PKT_TAG_RMASK << _NETIO_PKT_TAG_SHIFT)
+#define _NETIO_PKT_TAG_NONE          0   /* No proprietary tags. */
+#define _NETIO_PKT_TAG_MRVL          1   /* Marvell HyperG.Stack tags. */
+#define _NETIO_PKT_TAG_MRVL_EXT      2   /* HyperG.Stack extended tags. */
+#define _NETIO_PKT_TAG_BRCM          3   /* Broadcom HiGig tags. */
+
+/** Whether a packet has an LLC + SNAP header. */
+#define _NETIO_PKT_SNAP_SHIFT        8
+#define _NETIO_PKT_SNAP_RMASK        0x1
+#define _NETIO_PKT_SNAP_MASK \
+          (_NETIO_PKT_SNAP_RMASK << _NETIO_PKT_SNAP_SHIFT)
+
+/* NOTE: Bits 9 and 10 are unused. */
+
+/** Length of any custom data before the L2 header, in words. */
+#define _NETIO_PKT_CUSTOM_LEN_SHIFT  11
+#define _NETIO_PKT_CUSTOM_LEN_RMASK  0x1F
+#define _NETIO_PKT_CUSTOM_LEN_MASK \
+          (_NETIO_PKT_CUSTOM_LEN_RMASK << _NETIO_PKT_CUSTOM_LEN_SHIFT)
+
+/** The L4 checksum is incorrect (or perhaps has not been calculated). */
+#define _NETIO_PKT_BAD_L4_CSUM_SHIFT 16
+#define _NETIO_PKT_BAD_L4_CSUM_RMASK 0x1
+#define _NETIO_PKT_BAD_L4_CSUM_MASK \
+          (_NETIO_PKT_BAD_L4_CSUM_RMASK << _NETIO_PKT_BAD_L4_CSUM_SHIFT)
+
+/** Length of the L2 header, in words. */
+#define _NETIO_PKT_L2_LEN_SHIFT  17
+#define _NETIO_PKT_L2_LEN_RMASK  0x1F
+#define _NETIO_PKT_L2_LEN_MASK \
+          (_NETIO_PKT_L2_LEN_RMASK << _NETIO_PKT_L2_LEN_SHIFT)
+
+
+/* Flags in minimal packet metadata. */
+
+/** We need an eDMA checksum on this packet. */
+#define _NETIO_PKT_NEED_EDMA_CSUM_SHIFT            0
+#define _NETIO_PKT_NEED_EDMA_CSUM_RMASK            1
+#define _NETIO_PKT_NEED_EDMA_CSUM_MASK \
+         (_NETIO_PKT_NEED_EDMA_CSUM_RMASK << _NETIO_PKT_NEED_EDMA_CSUM_SHIFT)
+
+/* Data within the packet information table. */
+
+/* Note that, for efficiency, code which uses these fields assumes that none
+ * of the shift values below are zero.  See uses below for an explanation. */
+
+/** Offset within the L2 header of the innermost ethertype (in halfwords). */
+#define _NETIO_PKT_INFO_ETYPE_SHIFT       6
+#define _NETIO_PKT_INFO_ETYPE_RMASK    0x1F
+
+/** Offset within the L2 header of the VLAN tag (in halfwords). */
+#define _NETIO_PKT_INFO_VLAN_SHIFT       11
+#define _NETIO_PKT_INFO_VLAN_RMASK     0x1F
+
+#endif
+
+
+/** The size of a memory buffer representing a small packet.
+ *  @ingroup egress */
+#define SMALL_PACKET_SIZE 256
+
+/** The size of a memory buffer representing a large packet.
+ *  @ingroup egress */
+#define LARGE_PACKET_SIZE 2048
+
+/** The size of a memory buffer representing a jumbo packet.
+ *  @ingroup egress */
+#define JUMBO_PACKET_SIZE (12 * 1024)
+
+
+/* Common ethertypes.
+ * @ingroup ingress */
+/** @{ */
+/** The ethertype of IPv4. */
+#define ETHERTYPE_IPv4 (0x0800)
+/** The ethertype of ARP. */
+#define ETHERTYPE_ARP (0x0806)
+/** The ethertype of VLANs. */
+#define ETHERTYPE_VLAN (0x8100)
+/** The ethertype of a Q-in-Q header. */
+#define ETHERTYPE_Q_IN_Q (0x9100)
+/** The ethertype of IPv6. */
+#define ETHERTYPE_IPv6 (0x86DD)
+/** The ethertype of MPLS. */
+#define ETHERTYPE_MPLS (0x8847)
+/** @} */
+
+
+/** The possible return values of NETIO_PKT_STATUS.
+ * @ingroup ingress
+ */
+typedef enum
+{
+  /** No problems were detected with this packet. */
+  NETIO_PKT_STATUS_OK,
+  /** The packet is undersized; this is expected behavior if the packet's
+    * ethertype is unrecognized, but otherwise the packet is likely corrupt. */
+  NETIO_PKT_STATUS_UNDERSIZE,
+  /** The packet is oversized and some trailing bytes have been discarded.
+      This is expected behavior for short packets, since it's impossible to
+      precisely determine the amount of padding which may have been added to
+      them to make them meet the minimum Ethernet packet size. */
+  NETIO_PKT_STATUS_OVERSIZE,
+  /** The packet was judged to be corrupt by hardware (for instance, it had
+      a bad CRC, or part of it was discarded due to lack of buffer space in
+      the I/O shim) and should be discarded. */
+  NETIO_PKT_STATUS_BAD
+} netio_pkt_status_t;
+
+
+/** Log2 of how many buckets we have. */
+#define NETIO_LOG2_NUM_BUCKETS (10)
+
+/** How many buckets we have.
+ * @ingroup ingress */
+#define NETIO_NUM_BUCKETS (1 << NETIO_LOG2_NUM_BUCKETS)
+
+
+/**
+ * @brief A group-to-bucket identifier.
+ *
+ * @ingroup setup
+ *
+ * This tells us what to do with a given group.
+ */
+typedef union {
+  /** The header broken down into bits. */
+  struct {
+    /** Whether we should balance on L4, if available */
+    unsigned int __balance_on_l4:1;
+    /** Whether we should balance on L3, if available */
+    unsigned int __balance_on_l3:1;
+    /** Whether we should balance on L2, if available */
+    unsigned int __balance_on_l2:1;
+    /** Reserved for future use */
+    unsigned int __reserved:1;
+    /** The base bucket to use to send traffic */
+    unsigned int __bucket_base:NETIO_LOG2_NUM_BUCKETS;
+    /** The mask to apply to the balancing value. This must be one less
+     * than a power of two, e.g. 0x3 or 0xFF.
+     */
+    unsigned int __bucket_mask:NETIO_LOG2_NUM_BUCKETS;
+    /** Pad to 32 bits */
+    unsigned int __padding:(32 - 4 - 2 * NETIO_LOG2_NUM_BUCKETS);
+  } bits;
+  /** To send out the IDN. */
+  unsigned int word;
+}
+netio_group_t;
+
+
+/**
+ * @brief A VLAN-to-bucket identifier.
+ *
+ * @ingroup setup
+ *
+ * This tells us what to do with a given VLAN.
+ */
+typedef netio_group_t netio_vlan_t;
+
+
+/**
+ * A bucket-to-queue mapping.
+ * @ingroup setup
+ */
+typedef unsigned char netio_bucket_t;
+
+
+/**
+ * A packet size can always fit in a netio_size_t.
+ * @ingroup setup
+ */
+typedef unsigned int netio_size_t;
+
+
+/**
+ * @brief Ethernet standard (ingress) packet metadata.
+ *
+ * @ingroup ingress
+ *
+ * This is additional data associated with each packet.
+ * This structure is opaque and accessed through the @ref ingress.
+ *
+ * Also, the buffer population operation currently assumes that standard
+ * metadata is at least as large as minimal metadata, and will need to be
+ * modified if that is no longer the case.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  /** This structure is opaque. */
+  unsigned char opaque[24];
+#else
+  /** The overall ordinal of the packet */
+  unsigned int __packet_ordinal;
+  /** The ordinal of the packet within the group */
+  unsigned int __group_ordinal;
+  /** The best flow hash IPP could compute. */
+  unsigned int __flow_hash;
+  /** Flags pertaining to checksum calculation, packet type, etc. */
+  unsigned int __flags;
+  /** The first word of "user data". */
+  unsigned int __user_data_0;
+  /** The second word of "user data". */
+  unsigned int __user_data_1;
+#endif
+}
+netio_pkt_metadata_t;
+
+
+/** To ensure that the L3 header is aligned mod 4, the L2 header should be
+ * aligned mod 4 plus 2, since every supported L2 header is 4n + 2 bytes
+ * long.  The standard way to do this is to simply add 2 bytes of padding
+ * before the L2 header.
+ */
+#define NETIO_PACKET_PADDING 2
+
+
+
+/**
+ * @brief Ethernet minimal (egress) packet metadata.
+ *
+ * @ingroup egress
+ *
+ * This structure represents information about packets which have
+ * been processed by @ref netio_populate_buffer() or
+ * @ref netio_populate_prepend_buffer().  This structure is opaque
+ * and accessed through the @ref egress.
+ *
+ * @internal This structure is actually copied into the memory used by
+ * standard metadata, which is assumed to be large enough.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  /** This structure is opaque. */
+  unsigned char opaque[14];
+#else
+  /** The offset of the L2 header from the start of the packet data. */
+  unsigned short l2_offset;
+  /** The offset of the L3 header from the start of the packet data. */
+  unsigned short l3_offset;
+  /** Where to write the checksum. */
+  unsigned char csum_location;
+  /** Where to start checksumming from. */
+  unsigned char csum_start;
+  /** Flags pertaining to checksum calculation etc. */
+  unsigned short flags;
+  /** The L2 length of the packet. */
+  unsigned short l2_length;
+  /** The checksum with which to seed the checksum generator. */
+  unsigned short csum_seed;
+  /** How much to checksum. */
+  unsigned short csum_length;
+#endif
+}
+netio_pkt_minimal_metadata_t;
+
+
+#ifndef __DOXYGEN__
+
+/**
+ * @brief An I/O notification header.
+ *
+ * This is the first word of data received from an I/O shim in a notification
+ * packet. It contains framing and status information.
+ */
+typedef union
+{
+  unsigned int word; /**< The whole word. */
+  /** The various fields. */
+  struct
+  {
+    unsigned int __channel:7;    /**< Resource channel. */
+    unsigned int __type:4;       /**< Type. */
+    unsigned int __ack:1;        /**< Whether an acknowledgement is needed. */
+    unsigned int __reserved:1;   /**< Reserved. */
+    unsigned int __protocol:1;   /**< A protocol-specific word is added. */
+    unsigned int __status:2;     /**< Status of the transfer. */
+    unsigned int __framing:2;    /**< Framing of the transfer. */
+    unsigned int __transfer_size:14; /**< Transfer size in bytes (total). */
+  } bits;
+}
+__netio_pkt_notif_t;
+
+
+/**
+ * Returns the base address of the packet.
+ */
+#define _NETIO_PKT_HANDLE_BASE(p) \
+  ((unsigned char*)((p).word & 0xFFFFFFC0))
+
+/**
+ * Returns the base address of the packet.
+ */
+#define _NETIO_PKT_BASE(p) \
+  _NETIO_PKT_HANDLE_BASE(p->__packet)
+
+/**
+ * @brief An I/O notification packet (second word)
+ *
+ * This is the second word of data received from an I/O shim in a notification
+ * packet.  This is the virtual address of the packet buffer, plus some flag
+ * bits.  (The virtual address of the packet is always 256-byte aligned so we
+ * have room for 8 bits' worth of flags in the low 8 bits.)
+ *
+ * @internal
+ * NOTE: The low two bits must contain "__queue", so the "packet size"
+ * (SIZE_SMALL, SIZE_LARGE, or SIZE_JUMBO) can be determined quickly.
+ *
+ * If __addr or __offset are moved, _NETIO_PKT_BASE
+ * (defined right below this) must be changed.
+ */
+typedef union
+{
+  unsigned int word; /**< The whole word. */
+  /** The various fields. */
+  struct
+  {
+    /** Which queue the packet will be returned to once it is sent back to
+        the IPP.  This is one of the SIZE_xxx values. */
+    unsigned int __queue:2;
+
+    /** The IPP handle of the sending IPP. */
+    unsigned int __ipp_handle:2;
+
+    /** Reserved for future use. */
+    unsigned int __reserved:1;
+
+    /** If 1, this packet has minimal (egress) metadata; otherwise, it
+        has standard (ingress) metadata. */
+    unsigned int __minimal:1;
+
+    /** Offset of the metadata within the packet.  This value is multiplied
+     *  by 64 and added to the base packet address to get the metadata
+     *  address.  Note that this field is aligned within the word such that
+     *  you can easily extract the metadata address with a 26-bit mask. */
+    unsigned int __offset:2;
+
+    /** The top 24 bits of the packet's virtual address. */
+    unsigned int __addr:24;
+  } bits;
+}
+__netio_pkt_handle_t;
+
+#endif /* !__DOXYGEN__ */
+
+
+/**
+ * @brief A handle for an I/O packet's storage.
+ * @ingroup ingress
+ *
+ * netio_pkt_handle_t encodes the concept of a ::netio_pkt_t with its
+ * packet metadata removed.  It is a much smaller type that exists to
+ * facilitate applications where the full ::netio_pkt_t type is too
+ * large, such as those that cache enormous numbers of packets or wish
+ * to transmit packet descriptors over the UDN.
+ *
+ * Because there is no metadata, most ::netio_pkt_t operations cannot be
+ * performed on a netio_pkt_handle_t.  It supports only
+ * netio_free_handle() (to free the buffer) and
+ * NETIO_PKT_CUSTOM_DATA_H() (to access a pointer to its contents).
+ * The application must acquire any additional metadata it wants from the
+ * original ::netio_pkt_t and record it separately.
+ *
+ * A netio_pkt_handle_t can be extracted from a ::netio_pkt_t by calling
+ * NETIO_PKT_HANDLE().  An invalid handle (analogous to NULL) can be
+ * created by assigning the value ::NETIO_PKT_HANDLE_NONE. A handle can
+ * be tested for validity with NETIO_PKT_HANDLE_IS_VALID().
+ */
+typedef struct
+{
+  unsigned int word; /**< Opaque bits. */
+} netio_pkt_handle_t;
+
+/**
+ * @brief A packet descriptor.
+ *
+ * @ingroup ingress
+ * @ingroup egress
+ *
+ * This data structure represents a packet.  The structure is manipulated
+ * through the @ref ingress and the @ref egress.
+ *
+ * While the contents of a netio_pkt_t are opaque, the structure itself is
+ * portable.  This means that it may be shared between all tiles which have
+ * done a netio_input_register() call for the interface on which the pkt_t
+ * was initially received (via netio_get_packet()) or retrieved (via
+ * netio_get_buffer()).  The contents of a netio_pkt_t can be transmitted to
+ * another tile via shared memory, or via a UDN message, or by other means.
+ * The destination tile may then use the pkt_t as if it had originally been
+ * received locally; it may read or write the packet's data, read its
+ * metadata, free the packet, send the packet, transfer the netio_pkt_t to
+ * yet another tile, and so forth.
+ *
+ * Once a netio_pkt_t has been transferred to a second tile, the first tile
+ * should not reference the original copy; in particular, if more than one
+ * tile frees or sends the same netio_pkt_t, the IPP's packet free lists will
+ * become corrupted.  Note also that each tile which reads or modifies
+ * packet data must obey the memory coherency rules outlined in @ref input.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  /** This structure is opaque. */
+  unsigned char opaque[32];
+#else
+  /** For an ingress packet (one with standard metadata), this is the
+   *  notification header we got from the I/O shim.  For an egress packet
+   *  (one with minimal metadata), this word is zero if the packet has not
+   *  been populated, and nonzero if it has. */
+  __netio_pkt_notif_t __notif_header;
+
+  /** Virtual address of the packet buffer, plus state flags. */
+  __netio_pkt_handle_t __packet;
+
+  /** Metadata associated with the packet. */
+  netio_pkt_metadata_t __metadata;
+#endif
+}
+netio_pkt_t;
+
+
+#ifndef __DOXYGEN__
+
+#define __NETIO_PKT_NOTIF_HEADER(pkt) ((pkt)->__notif_header)
+#define __NETIO_PKT_IPP_HANDLE(pkt) ((pkt)->__packet.bits.__ipp_handle)
+#define __NETIO_PKT_QUEUE(pkt) ((pkt)->__packet.bits.__queue)
+#define __NETIO_PKT_NOTIF_HEADER_M(mda, pkt) ((pkt)->__notif_header)
+#define __NETIO_PKT_IPP_HANDLE_M(mda, pkt) ((pkt)->__packet.bits.__ipp_handle)
+#define __NETIO_PKT_MINIMAL(pkt) ((pkt)->__packet.bits.__minimal)
+#define __NETIO_PKT_QUEUE_M(mda, pkt) ((pkt)->__packet.bits.__queue)
+#define __NETIO_PKT_FLAGS_M(mda, pkt) ((mda)->__flags)
+
+/* Packet information table, used by the attribute access functions below. */
+extern const uint16_t _netio_pkt_info[];
+
+#endif /* __DOXYGEN__ */
+
+
+#ifndef __DOXYGEN__
+/* These macros are deprecated and will disappear in a future MDE release. */
+#define NETIO_PKT_GOOD_CHECKSUM(pkt) \
+  NETIO_PKT_L4_CSUM_CORRECT(pkt)
+#define NETIO_PKT_GOOD_CHECKSUM_M(mda, pkt) \
+  NETIO_PKT_L4_CSUM_CORRECT_M(mda, pkt)
+#endif /* __DOXYGEN__ */
+
+
+/* Packet attribute access functions. */
+
+/** Return a pointer to the metadata for a packet.
+ * @ingroup ingress
+ *
+ * Calling this function once and passing the result to other retrieval
+ * functions with a "_M" suffix usually improves performance.  This
+ * function must be called on an 'ingress' packet (i.e. one retrieved
+ * by @ref netio_get_packet(), on which @ref netio_populate_buffer() or
+ * @ref netio_populate_prepend_buffer have not been called). Use of this
+ * function on an 'egress' packet will cause an assertion failure.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's standard metadata.
+ */
+static __inline netio_pkt_metadata_t*
+NETIO_PKT_METADATA(netio_pkt_t* pkt)
+{
+  netio_assert(!pkt->__packet.bits.__minimal);
+  return &pkt->__metadata;
+}
+
+
+/** Return a pointer to the minimal metadata for a packet.
+ * @ingroup egress
+ *
+ * Calling this function once and passing the result to other retrieval
+ * functions with a "_MM" suffix usually improves performance.  This
+ * function must be called on an 'egress' packet (i.e. one on which
+ * @ref netio_populate_buffer() or @ref netio_populate_prepend_buffer()
+ * have been called, or one retrieved by @ref netio_get_buffer()). Use of
+ * this function on an 'ingress' packet will cause an assertion failure.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's standard metadata.
+ */
+static __inline netio_pkt_minimal_metadata_t*
+NETIO_PKT_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+  netio_assert(pkt->__packet.bits.__minimal);
+  return (netio_pkt_minimal_metadata_t*) &pkt->__metadata;
+}
+
+
+/** Determine whether a packet has 'minimal' metadata.
+ * @ingroup pktfuncs
+ *
+ * This function will return nonzero if the packet is an 'egress'
+ * packet (i.e. one on which @ref netio_populate_buffer() or
+ * @ref netio_populate_prepend_buffer() have been called, or one
+ * retrieved by @ref netio_get_buffer()), and zero if the packet
+ * is an 'ingress' packet (i.e. one retrieved by @ref netio_get_packet(),
+ * which has not been converted into an 'egress' packet).
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the packet has minimal metadata.
+ */
+static __inline unsigned int
+NETIO_PKT_IS_MINIMAL(netio_pkt_t* pkt)
+{
+  return pkt->__packet.bits.__minimal;
+}
+
+
+/** Return a handle for a packet's storage.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A handle for the packet's storage.
+ */
+static __inline netio_pkt_handle_t
+NETIO_PKT_HANDLE(netio_pkt_t* pkt)
+{
+  netio_pkt_handle_t h;
+  h.word = pkt->__packet.word;
+  return h;
+}
+
+
+/** A special reserved value indicating the absence of a packet handle.
+ *
+ * @ingroup pktfuncs
+ */
+#define NETIO_PKT_HANDLE_NONE ((netio_pkt_handle_t) { 0 })
+
+
+/** Test whether a packet handle is valid.
+ *
+ * Applications may wish to use the reserved value NETIO_PKT_HANDLE_NONE
+ * to indicate no packet at all.  This function tests to see if a packet
+ * handle is a real handle, not this special reserved value.
+ *
+ * @ingroup pktfuncs
+ *
+ * @param[in] handle Handle on which to operate.
+ * @return One if the packet handle is valid, else zero.
+ */
+static __inline unsigned int
+NETIO_PKT_HANDLE_IS_VALID(netio_pkt_handle_t handle)
+{
+  return handle.word != 0;
+}
+
+
+
+/** Return a pointer to the start of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup ingress
+ *
+ * @param[in] handle Handle on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_CUSTOM_DATA_H(netio_pkt_handle_t handle)
+{
+  return _NETIO_PKT_HANDLE_BASE(handle) + NETIO_PACKET_PADDING;
+}
+
+
+/** Return the length of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's custom header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_HEADER_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  /*
+   * Note that we effectively need to extract a quantity from the flags word
+   * which is measured in words, and then turn it into bytes by shifting
+   * it left by 2.  We do this all at once by just shifting right two less
+   * bits, and shifting the mask up two bits.
+   */
+  return ((mda->__flags >> (_NETIO_PKT_CUSTOM_LEN_SHIFT - 2)) &
+          (_NETIO_PKT_CUSTOM_LEN_RMASK << 2));
+}
+
+
+/** Return the length of the packet, starting with the custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (__NETIO_PKT_NOTIF_HEADER(pkt).bits.__transfer_size -
+          NETIO_PACKET_PADDING);
+}
+
+
+/** Return a pointer to the start of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_CUSTOM_DATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return NETIO_PKT_CUSTOM_DATA_H(NETIO_PKT_HANDLE(pkt));
+}
+
+
+/** Return the length of the packet's L2 (Ethernet plus VLAN or SNAP) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's L2 header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_HEADER_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  /*
+   * Note that we effectively need to extract a quantity from the flags word
+   * which is measured in words, and then turn it into bytes by shifting
+   * it left by 2.  We do this all at once by just shifting right two less
+   * bits, and shifting the mask up two bits.  We then add two bytes.
+   */
+  return ((mda->__flags >> (_NETIO_PKT_L2_LEN_SHIFT - 2)) &
+          (_NETIO_PKT_L2_LEN_RMASK << 2)) + 2;
+}
+
+
+/** Return the length of the packet, starting with the L2 (Ethernet) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_CUSTOM_LENGTH_M(mda, pkt) -
+          NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda,pkt));
+}
+
+
+/** Return a pointer to the start of the packet's L2 (Ethernet) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_L2_DATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_CUSTOM_DATA_M(mda, pkt) +
+          NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda, pkt));
+}
+
+
+/** Retrieve the length of the packet, starting with the L3 (generally,
+ *  the IP) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Length of the packet's L3 header and data, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L3_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_L2_LENGTH_M(mda, pkt) -
+          NETIO_PKT_L2_HEADER_LENGTH_M(mda,pkt));
+}
+
+
+/** Return a pointer to the packet's L3 (generally, the IP) header.
+ * @ingroup ingress
+ *
+ * Note that we guarantee word alignment of the L3 header.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's L3 header.
+ */
+static __inline unsigned char*
+NETIO_PKT_L3_DATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_L2_DATA_M(mda, pkt) +
+          NETIO_PKT_L2_HEADER_LENGTH_M(mda, pkt));
+}
+
+
+/** Return the ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given an ordinal number when it is delivered by the IPP.
+ * In the medium term, the ordinal is unique and monotonically increasing,
+ * being incremented by 1 for each packet; the ordinal of the first packet
+ * delivered after the IPP starts is zero.  (Since the ordinal is of finite
+ * size, given enough input packets, it will eventually wrap around to zero;
+ * in the long term, therefore, ordinals are not unique.)  The ordinals
+ * handed out by different IPPs are not disjoint, so two packets from
+ * different IPPs may have identical ordinals.  Packets dropped by the
+ * IPP or by the I/O shim are not assigned ordinals.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP packet ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_ORDINAL_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__packet_ordinal;
+}
+
+
+/** Return the per-group ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given a per-group ordinal number when it is
+ * delivered by the IPP. By default, the group is the packet's VLAN,
+ * although IPP can be recompiled to use different values.  In
+ * the medium term, the ordinal is unique and monotonically
+ * increasing, being incremented by 1 for each packet; the ordinal of
+ * the first packet distributed to a particular group is zero.
+ * (Since the ordinal is of finite size, given enough input packets,
+ * it will eventually wrap around to zero; in the long term,
+ * therefore, ordinals are not unique.)  The ordinals handed out by
+ * different IPPs are not disjoint, so two packets from different IPPs
+ * may have identical ordinals; similarly, packets distributed to
+ * different groups may have identical ordinals.  Packets dropped by
+ * the IPP or by the I/O shim are not assigned ordinals.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP, per-group ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_GROUP_ORDINAL_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__group_ordinal;
+}
+
+
+/** Return the VLAN ID assigned to the packet.
+ * @ingroup ingress
+ *
+ * This value is usually contained within the packet header.
+ *
+ * This value will be zero if the packet does not have a VLAN tag, or if
+ * this value was not extracted from the packet.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's VLAN ID.
+ */
+static __inline unsigned short
+NETIO_PKT_VLAN_ID_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  int vl = (mda->__flags >> _NETIO_PKT_VLAN_SHIFT) & _NETIO_PKT_VLAN_RMASK;
+  unsigned short* pkt_p;
+  int index;
+  unsigned short val;
+
+  if (vl == _NETIO_PKT_VLAN_NONE)
+    return 0;
+
+  pkt_p = (unsigned short*) NETIO_PKT_L2_DATA_M(mda, pkt);
+  index = (mda->__flags >> _NETIO_PKT_TYPE_SHIFT) & _NETIO_PKT_TYPE_RMASK;
+
+  val = pkt_p[(_netio_pkt_info[index] >> _NETIO_PKT_INFO_VLAN_SHIFT) &
+              _NETIO_PKT_INFO_VLAN_RMASK];
+
+#ifdef __TILECC__
+  return (__insn_bytex(val) >> 16) & 0xFFF;
+#else
+  return (__builtin_bswap32(val) >> 16) & 0xFFF;
+#endif
+}
+
+
+/** Return the ethertype of the packet.
+ * @ingroup ingress
+ *
+ * This value is usually contained within the packet header.
+ *
+ * This value is reliable if @ref NETIO_PKT_ETHERTYPE_RECOGNIZED_M()
+ * returns true, and otherwise, may not be well defined.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's ethertype.
+ */
+static __inline unsigned short
+NETIO_PKT_ETHERTYPE_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  unsigned short* pkt_p = (unsigned short*) NETIO_PKT_L2_DATA_M(mda, pkt);
+  int index = (mda->__flags >> _NETIO_PKT_TYPE_SHIFT) & _NETIO_PKT_TYPE_RMASK;
+
+  unsigned short val =
+    pkt_p[(_netio_pkt_info[index] >> _NETIO_PKT_INFO_ETYPE_SHIFT) &
+          _NETIO_PKT_INFO_ETYPE_RMASK];
+
+  return __builtin_bswap32(val) >> 16;
+}
+
+
+/** Return the flow hash computed on the packet.
+ * @ingroup ingress
+ *
+ * For TCP and UDP packets, this hash is calculated by hashing together
+ * the "5-tuple" values, specifically the source IP address, destination
+ * IP address, protocol type, source port and destination port.
+ * The hash value is intended to be helpful for millions of distinct
+ * flows.
+ *
+ * For IPv4 or IPv6 packets which are neither TCP nor UDP, the flow hash is
+ * derived by hashing together the source and destination IP addresses.
+ *
+ * For MPLS-encapsulated packets, the flow hash is derived by hashing
+ * the first MPLS label.
+ *
+ * For all other packets the flow hash is computed from the source
+ * and destination Ethernet addresses.
+ *
+ * The hash is symmetric, meaning it produces the same value if the
+ * source and destination are swapped. The only exceptions are
+ * tunneling protocols 0x04 (IP in IP Encapsulation), 0x29 (Simple
+ * Internet Protocol), 0x2F (General Routing Encapsulation) and 0x32
+ * (Encap Security Payload), which use only the destination address
+ * since the source address is not meaningful.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's 32-bit flow hash.
+ */
+static __inline unsigned int
+NETIO_PKT_FLOW_HASH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__flow_hash;
+}
+
+
+/** Return the first word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the first
+ * word of user data contains the least significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_low()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's first word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_0_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__user_data_0;
+}
+
+
+/** Return the second word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the second
+ * word of user data contains the most significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_high()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's second word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_1_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__user_data_1;
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L4 checksum was calculated.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CALCULATED_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags & _NETIO_PKT_NO_L4_CSUM_MASK);
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated and found to
+ *  be correct.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CORRECT_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags &
+           (_NETIO_PKT_BAD_L4_CSUM_MASK | _NETIO_PKT_NO_L4_CSUM_MASK));
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L3 (IP) checksum was calculated.
+*/
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CALCULATED_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags & _NETIO_PKT_NO_L3_CSUM_MASK);
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated and found to be
+ *  correct.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CORRECT_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags &
+           (_NETIO_PKT_BAD_L3_CSUM_MASK | _NETIO_PKT_NO_L3_CSUM_MASK));
+}
+
+
+/** Determine whether the ethertype was recognized and L3 packet data was
+ *  processed.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the ethertype was recognized and L3 packet data was
+ *   processed.
+ */
+static __inline unsigned int
+NETIO_PKT_ETHERTYPE_RECOGNIZED_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags & _NETIO_PKT_TYPE_UNRECOGNIZED_MASK);
+}
+
+
+/** Retrieve the status of a packet and any errors that may have occurred
+ * during ingress processing (length mismatches, CRC errors, etc.).
+ * @ingroup ingress
+ *
+ * Note that packets for which @ref NETIO_PKT_ETHERTYPE_RECOGNIZED()
+ * returns zero are always reported as underlength, as there is no a priori
+ * means to determine their length.  Normally, applications should use
+ * @ref NETIO_PKT_BAD_M() instead of explicitly checking status with this
+ * function.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's status.
+ */
+static __inline netio_pkt_status_t
+NETIO_PKT_STATUS_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (netio_pkt_status_t) __NETIO_PKT_NOTIF_HEADER(pkt).bits.__status;
+}
+
+
+/** Report whether a packet is bad (i.e., was shorter than expected based on
+ *  its headers, or had a bad CRC).
+ * @ingroup ingress
+ *
+ * Note that this function does not verify L3 or L4 checksums.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the packet is bad and should be discarded.
+ */
+static __inline unsigned int
+NETIO_PKT_BAD_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return ((NETIO_PKT_STATUS_M(mda, pkt) & 1) &&
+          (NETIO_PKT_ETHERTYPE_RECOGNIZED_M(mda, pkt) ||
+           NETIO_PKT_STATUS_M(mda, pkt) == NETIO_PKT_STATUS_BAD));
+}
+
+
+/** Return the length of the packet, starting with the L2 (Ethernet) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return mmd->l2_length;
+}
+
+
+/** Return the length of the L2 (Ethernet) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's L2 header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_HEADER_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd,
+                              netio_pkt_t* pkt)
+{
+  return mmd->l3_offset - mmd->l2_offset;
+}
+
+
+/** Return the length of the packet, starting with the L3 (IP) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Length of the packet's L3 header and data, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L3_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_L2_LENGTH_MM(mmd, pkt) -
+          NETIO_PKT_L2_HEADER_LENGTH_MM(mmd, pkt));
+}
+
+
+/** Return a pointer to the packet's L3 (generally, the IP) header.
+ * @ingroup egress
+ *
+ * Note that we guarantee word alignment of the L3 header.
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's L3 header.
+ */
+static __inline unsigned char*
+NETIO_PKT_L3_DATA_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return _NETIO_PKT_BASE(pkt) + mmd->l3_offset;
+}
+
+
+/** Return a pointer to the packet's L2 (Ethernet) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_L2_DATA_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return _NETIO_PKT_BASE(pkt) + mmd->l2_offset;
+}
+
+
+/** Retrieve the status of a packet and any errors that may have occurred
+ * during ingress processing (length mismatches, CRC errors, etc.).
+ * @ingroup ingress
+ *
+ * Note that packets for which @ref NETIO_PKT_ETHERTYPE_RECOGNIZED()
+ * returns zero are always reported as underlength, as there is no a priori
+ * means to determine their length.  Normally, applications should use
+ * @ref NETIO_PKT_BAD() instead of explicitly checking status with this
+ * function.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's status.
+ */
+static __inline netio_pkt_status_t
+NETIO_PKT_STATUS(netio_pkt_t* pkt)
+{
+  netio_assert(!pkt->__packet.bits.__minimal);
+
+  return (netio_pkt_status_t) __NETIO_PKT_NOTIF_HEADER(pkt).bits.__status;
+}
+
+
+/** Report whether a packet is bad (i.e., was shorter than expected based on
+ *  its headers, or had a bad CRC).
+ * @ingroup ingress
+ *
+ * Note that this function does not verify L3 or L4 checksums.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the packet is bad and should be discarded.
+ */
+static __inline unsigned int
+NETIO_PKT_BAD(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_BAD_M(mda, pkt);
+}
+
+
+/** Return the length of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's custom header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_HEADER_LENGTH(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda, pkt);
+}
+
+
+/** Return the length of the packet, starting with the custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return  The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_LENGTH(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_CUSTOM_LENGTH_M(mda, pkt);
+}
+
+
+/** Return a pointer to the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_CUSTOM_DATA(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_CUSTOM_DATA_M(mda, pkt);
+}
+
+
+/** Return the length of the packet's L2 (Ethernet plus VLAN or SNAP) header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's L2 header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_HEADER_LENGTH(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L2_HEADER_LENGTH_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L2_HEADER_LENGTH_M(mda, pkt);
+  }
+}
+
+
+/** Return the length of the packet, starting with the L2 (Ethernet) header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return  The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_LENGTH(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L2_LENGTH_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L2_LENGTH_M(mda, pkt);
+  }
+}
+
+
+/** Return a pointer to the packet's L2 (Ethernet) header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_L2_DATA(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L2_DATA_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L2_DATA_M(mda, pkt);
+  }
+}
+
+
+/** Retrieve the length of the packet, starting with the L3 (generally, the IP)
+ * header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Length of the packet's L3 header and data, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L3_LENGTH(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L3_LENGTH_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L3_LENGTH_M(mda, pkt);
+  }
+}
+
+
+/** Return a pointer to the packet's L3 (generally, the IP) header.
+ * @ingroup pktfuncs
+ *
+ * Note that we guarantee word alignment of the L3 header.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's L3 header.
+ */
+static __inline unsigned char*
+NETIO_PKT_L3_DATA(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L3_DATA_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L3_DATA_M(mda, pkt);
+  }
+}
+
+
+/** Return the ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given an ordinal number when it is delivered by the IPP.
+ * In the medium term, the ordinal is unique and monotonically increasing,
+ * being incremented by 1 for each packet; the ordinal of the first packet
+ * delivered after the IPP starts is zero.  (Since the ordinal is of finite
+ * size, given enough input packets, it will eventually wrap around to zero;
+ * in the long term, therefore, ordinals are not unique.)  The ordinals
+ * handed out by different IPPs are not disjoint, so two packets from
+ * different IPPs may have identical ordinals.  Packets dropped by the
+ * IPP or by the I/O shim are not assigned ordinals.
+ *
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP packet ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_ORDINAL(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_ORDINAL_M(mda, pkt);
+}
+
+
+/** Return the per-group ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given a per-group ordinal number when it is
+ * delivered by the IPP. By default, the group is the packet's VLAN,
+ * although IPP can be recompiled to use different values.  In
+ * the medium term, the ordinal is unique and monotonically
+ * increasing, being incremented by 1 for each packet; the ordinal of
+ * the first packet distributed to a particular group is zero.
+ * (Since the ordinal is of finite size, given enough input packets,
+ * it will eventually wrap around to zero; in the long term,
+ * therefore, ordinals are not unique.)  The ordinals handed out by
+ * different IPPs are not disjoint, so two packets from different IPPs
+ * may have identical ordinals; similarly, packets distributed to
+ * different groups may have identical ordinals.  Packets dropped by
+ * the IPP or by the I/O shim are not assigned ordinals.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP, per-group ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_GROUP_ORDINAL(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_GROUP_ORDINAL_M(mda, pkt);
+}
+
+
+/** Return the VLAN ID assigned to the packet.
+ * @ingroup ingress
+ *
+ * This is usually also contained within the packet header.  If the packet
+ * does not have a VLAN tag, the VLAN ID returned by this function is zero.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's VLAN ID.
+ */
+static __inline unsigned short
+NETIO_PKT_VLAN_ID(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_VLAN_ID_M(mda, pkt);
+}
+
+
+/** Return the ethertype of the packet.
+ * @ingroup ingress
+ *
+ * This value is reliable if @ref NETIO_PKT_ETHERTYPE_RECOGNIZED()
+ * returns true, and otherwise, may not be well defined.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's ethertype.
+ */
+static __inline unsigned short
+NETIO_PKT_ETHERTYPE(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_ETHERTYPE_M(mda, pkt);
+}
+
+
+/** Return the flow hash computed on the packet.
+ * @ingroup ingress
+ *
+ * For TCP and UDP packets, this hash is calculated by hashing together
+ * the "5-tuple" values, specifically the source IP address, destination
+ * IP address, protocol type, source port and destination port.
+ * The hash value is intended to be helpful for millions of distinct
+ * flows.
+ *
+ * For IPv4 or IPv6 packets which are neither TCP nor UDP, the flow hash is
+ * derived by hashing together the source and destination IP addresses.
+ *
+ * For MPLS-encapsulated packets, the flow hash is derived by hashing
+ * the first MPLS label.
+ *
+ * For all other packets the flow hash is computed from the source
+ * and destination Ethernet addresses.
+ *
+ * The hash is symmetric, meaning it produces the same value if the
+ * source and destination are swapped. The only exceptions are
+ * tunneling protocols 0x04 (IP in IP Encapsulation), 0x29 (Simple
+ * Internet Protocol), 0x2F (General Routing Encapsulation) and 0x32
+ * (Encap Security Payload), which use only the destination address
+ * since the source address is not meaningful.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's 32-bit flow hash.
+ */
+static __inline unsigned int
+NETIO_PKT_FLOW_HASH(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_FLOW_HASH_M(mda, pkt);
+}
+
+
+/** Return the first word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the first
+ * word of user data contains the least significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_low()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's first word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_0(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_USER_DATA_0_M(mda, pkt);
+}
+
+
+/** Return the second word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the second
+ * word of user data contains the most significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_high()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's second word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_1(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_USER_DATA_1_M(mda, pkt);
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L4 checksum was calculated.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CALCULATED(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L4_CSUM_CALCULATED_M(mda, pkt);
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated and found to
+ *  be correct.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CORRECT(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L4_CSUM_CORRECT_M(mda, pkt);
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L3 (IP) checksum was calculated.
+*/
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CALCULATED(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L3_CSUM_CALCULATED_M(mda, pkt);
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated and found to be
+ *  correct.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CORRECT(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L3_CSUM_CORRECT_M(mda, pkt);
+}
+
+
+/** Determine whether the Ethertype was recognized and L3 packet data was
+ *  processed.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the Ethertype was recognized and L3 packet data was
+ *   processed.
+ */
+static __inline unsigned int
+NETIO_PKT_ETHERTYPE_RECOGNIZED(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_ETHERTYPE_RECOGNIZED_M(mda, pkt);
+}
+
+
+/** Set an egress packet's L2 length, using a metadata pointer to speed the
+ * computation.
+ * @ingroup egress
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @param[in] len Packet L2 length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt,
+                           int len)
+{
+  mmd->l2_length = len;
+}
+
+
+/** Set an egress packet's L2 length.
+ * @ingroup egress
+ *
+ * @param[in,out] pkt Packet on which to operate.
+ * @param[in] len Packet L2 length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_LENGTH(netio_pkt_t* pkt, int len)
+{
+  netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+  NETIO_PKT_SET_L2_LENGTH_MM(mmd, pkt, len);
+}
+
+
+/** Set an egress packet's L2 header length, using a metadata pointer to
+ *  speed the computation.
+ * @ingroup egress
+ *
+ * It is not normally necessary to call this routine; only the L2 length,
+ * not the header length, is needed to transmit a packet.  It may be useful if
+ * the egress packet will later be processed by code which expects to use
+ * functions like @ref NETIO_PKT_L3_DATA() to get a pointer to the L3 payload.
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @param[in] len Packet L2 header length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_HEADER_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd,
+                                  netio_pkt_t* pkt, int len)
+{
+  mmd->l3_offset = mmd->l2_offset + len;
+}
+
+
+/** Set an egress packet's L2 header length.
+ * @ingroup egress
+ *
+ * It is not normally necessary to call this routine; only the L2 length,
+ * not the header length, is needed to transmit a packet.  It may be useful if
+ * the egress packet will later be processed by code which expects to use
+ * functions like @ref NETIO_PKT_L3_DATA() to get a pointer to the L3 payload.
+ *
+ * @param[in,out] pkt Packet on which to operate.
+ * @param[in] len Packet L2 header length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_HEADER_LENGTH(netio_pkt_t* pkt, int len)
+{
+  netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+  NETIO_PKT_SET_L2_HEADER_LENGTH_MM(mmd, pkt, len);
+}
+
+
+/** Set up an egress packet for hardware checksum computation, using a
+ *  metadata pointer to speed the operation.
+ * @ingroup egress
+ *
+ *  NetIO provides the ability to automatically calculate a standard
+ *  16-bit Internet checksum on transmitted packets.  The application
+ *  may specify the point in the packet where the checksum starts, the
+ *  number of bytes to be checksummed, and the two bytes in the packet
+ *  which will be replaced with the completed checksum.  (If the range
+ *  of bytes to be checksummed includes the bytes to be replaced, the
+ *  initial values of those bytes will be included in the checksum.)
+ *
+ *  For some protocols, the packet checksum covers data which is not present
+ *  in the packet, or is at least not contiguous to the main data payload.
+ *  For instance, the TCP checksum includes a "pseudo-header" which includes
+ *  the source and destination IP addresses of the packet.  To accommodate
+ *  this, the checksum engine may be "seeded" with an initial value, which
+ *  the application would need to compute based on the specific protocol's
+ *  requirements.  Note that the seed is given in host byte order (little-
+ *  endian), not network byte order (big-endian); code written to compute a
+ *  pseudo-header checksum in network byte order will need to byte-swap it
+ *  before use as the seed.
+ *
+ *  Note that the checksum is computed as part of the transmission process,
+ *  so it will not be present in the packet upon completion of this routine.
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @param[in] start Offset within L2 packet of the first byte to include in
+ *   the checksum.
+ * @param[in] length Number of bytes to include in the checksum.
+ *   the checksum.
+ * @param[in] location Offset within L2 packet of the first of the two bytes
+ *   to be replaced with the calculated checksum.
+ * @param[in] seed Initial value of the running checksum before any of the
+ *   packet data is added.
+ */
+static __inline void
+NETIO_PKT_DO_EGRESS_CSUM_MM(netio_pkt_minimal_metadata_t* mmd,
+                            netio_pkt_t* pkt, int start, int length,
+                            int location, uint16_t seed)
+{
+  mmd->csum_start = start;
+  mmd->csum_length = length;
+  mmd->csum_location = location;
+  mmd->csum_seed = seed;
+  mmd->flags |= _NETIO_PKT_NEED_EDMA_CSUM_MASK;
+}
+
+
+/** Set up an egress packet for hardware checksum computation.
+ * @ingroup egress
+ *
+ *  NetIO provides the ability to automatically calculate a standard
+ *  16-bit Internet checksum on transmitted packets.  The application
+ *  may specify the point in the packet where the checksum starts, the
+ *  number of bytes to be checksummed, and the two bytes in the packet
+ *  which will be replaced with the completed checksum.  (If the range
+ *  of bytes to be checksummed includes the bytes to be replaced, the
+ *  initial values of those bytes will be included in the checksum.)
+ *
+ *  For some protocols, the packet checksum covers data which is not present
+ *  in the packet, or is at least not contiguous to the main data payload.
+ *  For instance, the TCP checksum includes a "pseudo-header" which includes
+ *  the source and destination IP addresses of the packet.  To accommodate
+ *  this, the checksum engine may be "seeded" with an initial value, which
+ *  the application would need to compute based on the specific protocol's
+ *  requirements.  Note that the seed is given in host byte order (little-
+ *  endian), not network byte order (big-endian); code written to compute a
+ *  pseudo-header checksum in network byte order will need to byte-swap it
+ *  before use as the seed.
+ *
+ *  Note that the checksum is computed as part of the transmission process,
+ *  so it will not be present in the packet upon completion of this routine.
+ *
+ * @param[in,out] pkt Packet on which to operate.
+ * @param[in] start Offset within L2 packet of the first byte to include in
+ *   the checksum.
+ * @param[in] length Number of bytes to include in the checksum.
+ *   the checksum.
+ * @param[in] location Offset within L2 packet of the first of the two bytes
+ *   to be replaced with the calculated checksum.
+ * @param[in] seed Initial value of the running checksum before any of the
+ *   packet data is added.
+ */
+static __inline void
+NETIO_PKT_DO_EGRESS_CSUM(netio_pkt_t* pkt, int start, int length,
+                         int location, uint16_t seed)
+{
+  netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+  NETIO_PKT_DO_EGRESS_CSUM_MM(mmd, pkt, start, length, location, seed);
+}
+
+
+/** Return the number of bytes which could be prepended to a packet, using a
+ *  metadata pointer to speed the operation.
+ *  See @ref netio_populate_prepend_buffer() to get a full description of
+ *  prepending.
+ *
+ * @param[in,out] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline int
+NETIO_PKT_PREPEND_AVAIL_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (pkt->__packet.bits.__offset << 6) +
+         NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda, pkt);
+}
+
+
+/** Return the number of bytes which could be prepended to a packet, using a
+ *  metadata pointer to speed the operation.
+ *  See @ref netio_populate_prepend_buffer() to get a full description of
+ *  prepending.
+ * @ingroup egress
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline int
+NETIO_PKT_PREPEND_AVAIL_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return (pkt->__packet.bits.__offset << 6) + mmd->l2_offset;
+}
+
+
+/** Return the number of bytes which could be prepended to a packet.
+ *  See @ref netio_populate_prepend_buffer() to get a full description of
+ *  prepending.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline int
+NETIO_PKT_PREPEND_AVAIL(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_PREPEND_AVAIL_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_PREPEND_AVAIL_M(mda, pkt);
+  }
+}
+
+
+/** Flush a packet's minimal metadata from the cache, using a metadata pointer
+ *  to speed the operation.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_MINIMAL_METADATA_MM(netio_pkt_minimal_metadata_t* mmd,
+                                    netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's minimal metadata from the cache, using a metadata
+ *  pointer to speed the operation.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_MINIMAL_METADATA_MM(netio_pkt_minimal_metadata_t* mmd,
+                                  netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's minimal metadata from the cache,
+ *  using a metadata pointer to speed the operation.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_MINIMAL_METADATA_MM(netio_pkt_minimal_metadata_t* mmd,
+                                        netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush a packet's metadata from the cache, using a metadata pointer
+ *  to speed the operation.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_METADATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's metadata from the cache, using a metadata
+ *  pointer to speed the operation.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_METADATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's metadata from the cache,
+ *  using a metadata pointer to speed the operation.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_METADATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush a packet's minimal metadata from the cache.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's minimal metadata from the cache.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's minimal metadata from the cache.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush a packet's metadata from the cache.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's metadata from the cache.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's metadata from the cache.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_METADATA(netio_pkt_t* pkt)
+{
+}
+
+/** Number of NUMA nodes we can distribute buffers to.
+ * @ingroup setup */
+#define NETIO_NUM_NODE_WEIGHTS  16
+
+/**
+ * @brief An object for specifying the characteristics of NetIO communication
+ * endpoint.
+ *
+ * @ingroup setup
+ *
+ * The @ref netio_input_register() function uses this structure to define
+ * how an application tile will communicate with an IPP.
+ *
+ *
+ * Future updates to NetIO may add new members to this structure,
+ * which can affect the success of the registration operation.  Thus,
+ * if dynamically initializing the structure, applications are urged to
+ * zero it out first, for example:
+ *
+ * @code
+ * netio_input_config_t config;
+ * memset(&config, 0, sizeof (config));
+ * config.flags = NETIO_RECV | NETIO_XMIT_CSUM | NETIO_TAG_NONE;
+ * config.num_receive_packets = NETIO_MAX_RECEIVE_PKTS;
+ * config.queue_id = 0;
+ *     .
+ *     .
+ *     .
+ * @endcode
+ *
+ * since that guarantees that any unused structure members, including
+ * members which did not exist when the application was first developed,
+ * will not have unexpected values.
+ *
+ * If statically initializing the structure, we strongly recommend use of
+ * C99-style named initializers, for example:
+ *
+ * @code
+ * netio_input_config_t config = {
+ *    .flags = NETIO_RECV | NETIO_XMIT_CSUM | NETIO_TAG_NONE,
+ *    .num_receive_packets = NETIO_MAX_RECEIVE_PKTS,
+ *    .queue_id = 0,
+ * },
+ * @endcode
+ *
+ * instead of the old-style structure initialization:
+ *
+ * @code
+ * // Bad example! Currently equivalent to the above, but don't do this.
+ * netio_input_config_t config = {
+ *    NETIO_RECV | NETIO_XMIT_CSUM | NETIO_TAG_NONE, NETIO_MAX_RECEIVE_PKTS, 0
+ * },
+ * @endcode
+ *
+ * since the C99 style requires no changes to the code if elements of the
+ * config structure are rearranged.  (It also makes the initialization much
+ * easier to understand.)
+ *
+ * Except for items which address a particular tile's transmit or receive
+ * characteristics, such as the ::NETIO_RECV flag, applications are advised
+ * to specify the same set of configuration data on all registrations.
+ * This prevents differing results if multiple tiles happen to do their
+ * registration operations in a different order on different invocations of
+ * the application.  This is particularly important for things like link
+ * management flags, and buffer size and homing specifications.
+ *
+ * Unless the ::NETIO_FIXED_BUFFER_VA flag is specified in flags, the NetIO
+ * buffer pool is automatically created and mapped into the application's
+ * virtual address space at an address chosen by the operating system,
+ * using the common memory (cmem) facility in the Tilera Multicore
+ * Components library.  The cmem facility allows multiple processes to gain
+ * access to shared memory which is mapped into each process at an
+ * identical virtual address.  In order for this to work, the processes
+ * must have a common ancestor, which must create the common memory using
+ * tmc_cmem_init().
+ *
+ * In programs using the iLib process creation API, or in programs which use
+ * only one process (which include programs using the pthreads library),
+ * tmc_cmem_init() is called automatically.  All other applications
+ * must call it explicitly, before any child processes which might call
+ * netio_input_register() are created.
+ */
+typedef struct
+{
+  /** Registration characteristics.
+
+      This value determines several characteristics of the registration;
+      flags for different types of behavior are ORed together to make the
+      final flag value.  Generally applications should specify exactly
+      one flag from each of the following categories:
+
+      - Whether the application will be receiving packets on this queue
+        (::NETIO_RECV or ::NETIO_NO_RECV).
+
+      - Whether the application will be transmitting packets on this queue,
+        and if so, whether it will request egress checksum calculation
+        (::NETIO_XMIT, ::NETIO_XMIT_CSUM, or ::NETIO_NO_XMIT).  It is
+        legal to call netio_get_buffer() without one of the XMIT flags,
+        as long as ::NETIO_RECV is specified; in this case, the retrieved
+        buffers must be passed to another tile for transmission.
+
+      - Whether the application expects any vendor-specific tags in
+        its packets' L2 headers (::NETIO_TAG_NONE, ::NETIO_TAG_BRCM,
+        or ::NETIO_TAG_MRVL).  This must match the configuration of the
+        target IPP.
+
+      To accommodate applications written to previous versions of the NetIO
+      interface, none of the flags above are currently required; if omitted,
+      NetIO behaves more or less as if ::NETIO_RECV | ::NETIO_XMIT_CSUM |
+      ::NETIO_TAG_NONE were used.  However, explicit specification of
+      the relevant flags allows NetIO to do a better job of resource
+      allocation, allows earlier detection of certain configuration errors,
+      and may enable advanced features or higher performance in the future,
+      so their use is strongly recommended.
+
+      Note that specifying ::NETIO_NO_RECV along with ::NETIO_NO_XMIT
+      is a special case, intended primarily for use by programs which
+      retrieve network statistics or do link management operations.
+      When these flags are both specified, the resulting queue may not
+      be used with NetIO routines other than netio_get(), netio_set(),
+      and netio_input_unregister().  See @ref link for more information
+      on link management.
+
+      Other flags are optional; their use is described below.
+  */
+  int flags;
+
+  /** Interface name.  This is a string which identifies the specific
+      Ethernet controller hardware to be used.  The format of the string
+      is a device type and a device index, separated by a slash; so,
+      the first 10 Gigabit Ethernet controller is named "xgbe/0", while
+      the second 10/100/1000 Megabit Ethernet controller is named "gbe/1".
+   */
+  const char* interface;
+
+  /** Receive packet queue size.  This specifies the maximum number
+      of ingress packets that can be received on this queue without
+      being retrieved by @ref netio_get_packet().  If the IPP's distribution
+      algorithm calls for a packet to be sent to this queue, and this
+      number of packets are already pending there, the new packet
+      will either be discarded, or sent to another tile registered
+      for the same queue_id (see @ref drops).  This value must
+      be at least ::NETIO_MIN_RECEIVE_PKTS, can always be at least
+      ::NETIO_MAX_RECEIVE_PKTS, and may be larger than that on certain
+      interfaces.
+   */
+  int num_receive_packets;
+
+  /** The queue ID being requested.  Legal values for this range from 0
+      to ::NETIO_MAX_QUEUE_ID, inclusive.  ::NETIO_MAX_QUEUE_ID is always
+      greater than or equal to the number of tiles; this allows one queue
+      for each tile, plus at least one additional queue.  Some applications
+      may wish to use the additional queue as a destination for unwanted
+      packets, since packets delivered to queues for which no tiles have
+      registered are discarded.
+   */
+  unsigned int queue_id;
+
+  /** Maximum number of small send buffers to be held in the local empty
+      buffer cache.  This specifies the size of the area which holds
+      empty small egress buffers requested from the IPP but not yet
+      retrieved via @ref netio_get_buffer().  This value must be greater
+      than zero if the application will ever use @ref netio_get_buffer()
+      to allocate empty small egress buffers; it may be no larger than
+      ::NETIO_MAX_SEND_BUFFERS.  See @ref epp for more details on empty
+      buffer caching.
+   */
+  int num_send_buffers_small_total;
+
+  /** Number of small send buffers to be preallocated at registration.
+      If this value is nonzero, the specified number of empty small egress
+      buffers will be requested from the IPP during the netio_input_register
+      operation; this may speed the execution of @ref netio_get_buffer().
+      This may be no larger than @ref num_send_buffers_small_total.  See @ref
+      epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_small_prealloc;
+
+  /** Maximum number of large send buffers to be held in the local empty
+      buffer cache.  This specifies the size of the area which holds empty
+      large egress buffers requested from the IPP but not yet retrieved via
+      @ref netio_get_buffer().  This value must be greater than zero if the
+      application will ever use @ref netio_get_buffer() to allocate empty
+      large egress buffers; it may be no larger than ::NETIO_MAX_SEND_BUFFERS.
+      See @ref epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_large_total;
+
+  /** Number of large send buffers to be preallocated at registration.
+      If this value is nonzero, the specified number of empty large egress
+      buffers will be requested from the IPP during the netio_input_register
+      operation; this may speed the execution of @ref netio_get_buffer().
+      This may be no larger than @ref num_send_buffers_large_total.  See @ref
+      epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_large_prealloc;
+
+  /** Maximum number of jumbo send buffers to be held in the local empty
+      buffer cache.  This specifies the size of the area which holds empty
+      jumbo egress buffers requested from the IPP but not yet retrieved via
+      @ref netio_get_buffer().  This value must be greater than zero if the
+      application will ever use @ref netio_get_buffer() to allocate empty
+      jumbo egress buffers; it may be no larger than ::NETIO_MAX_SEND_BUFFERS.
+      See @ref epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_jumbo_total;
+
+  /** Number of jumbo send buffers to be preallocated at registration.
+      If this value is nonzero, the specified number of empty jumbo egress
+      buffers will be requested from the IPP during the netio_input_register
+      operation; this may speed the execution of @ref netio_get_buffer().
+      This may be no larger than @ref num_send_buffers_jumbo_total.  See @ref
+      epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_jumbo_prealloc;
+
+  /** Total packet buffer size.  This determines the total size, in bytes,
+      of the NetIO buffer pool.  Note that the maximum number of available
+      buffers of each size is determined during hypervisor configuration
+      (see the <em>System Programmer's Guide</em> for details); this just
+      influences how much host memory is allocated for those buffers.
+
+      The buffer pool is allocated from common memory, which will be
+      automatically initialized if needed.  If your buffer pool is larger
+      than 240 MB, you might need to explicitly call @c tmc_cmem_init(),
+      as described in the Application Libraries Reference Manual (UG227).
+
+      Packet buffers are currently allocated in chunks of 16 MB; this
+      value will be rounded up to the next larger multiple of 16 MB.
+      If this value is zero, a default of 32 MB will be used; this was
+      the value used by previous versions of NetIO.  Note that taking this
+      default also affects the placement of buffers on Linux NUMA nodes.
+      See @ref buffer_node_weights for an explanation of buffer placement.
+
+      In order to successfully allocate packet buffers, Linux must have
+      available huge pages on the relevant Linux NUMA nodes.  See the
+      <em>System Programmer's Guide</em> for information on configuring
+      huge page support in Linux.
+   */
+  uint64_t total_buffer_size;
+
+  /** Buffer placement weighting factors.
+
+      This array specifies the relative amount of buffering to place
+      on each of the available Linux NUMA nodes.  This array is
+      indexed by the NUMA node, and the values in the array are
+      proportional to the amount of buffer space to allocate on that
+      node.
+
+      If memory striping is enabled in the Hypervisor, then there is
+      only one logical NUMA node (node 0). In that case, NetIO will by
+      default ignore the suggested buffer node weights, and buffers
+      will be striped across the physical memory controllers. See
+      UG209 System Programmer's Guide for a description of the
+      hypervisor option that controls memory striping.
+
+      If memory striping is disabled, then there are up to four NUMA
+      nodes, corresponding to the four DDRAM controllers in the TILE
+      processor architecture.  See UG100 Tile Processor Architecture
+      Overview for a diagram showing the location of each of the DDRAM
+      controllers relative to the tile array.
+
+      For instance, if memory striping is disabled, the following
+      configuration strucure:
+
+      @code
+      netio_input_config_t config = {
+            .
+            .
+            .
+        .total_buffer_size = 4 * 16 * 1024 * 1024;
+        .buffer_node_weights = { 1, 0, 1, 0 },
+      },
+      @endcode
+
+      would result in 32 MB of buffers being placed on controller 0, and
+      32 MB on controller 2.  (Since buffers are allocated in units of
+      16 MB, some sets of weights will not be able to be matched exactly.)
+
+      For the weights to be effective, @ref total_buffer_size must be
+      nonzero.  If @ref total_buffer_size is zero, causing the default
+      32 MB of buffer space to be used, then any specified weights will
+      be ignored, and buffers will positioned as they were in previous
+      versions of NetIO:
+
+      - For xgbe/0 and gbe/0, 16 MB of buffers will be placed on controller 1,
+        and the other 16 MB will be placed on controller 2.
+
+      - For xgbe/1 and gbe/1, 16 MB of buffers will be placed on controller 2,
+        and the other 16 MB will be placed on controller 3.
+
+      If @ref total_buffer_size is nonzero, but all weights are zero,
+      then all buffer space will be allocated on Linux NUMA node zero.
+
+      By default, the specified buffer placement is treated as a hint;
+      if sufficient free memory is not available on the specified
+      controllers, the buffers will be allocated elsewhere.  However,
+      if the ::NETIO_STRICT_HOMING flag is specified in @ref flags, then a
+      failure to allocate buffer space exactly as requested will cause the
+      registration operation to fail with an error of ::NETIO_CANNOT_HOME.
+
+      Note that maximal network performance cannot be achieved with
+      only one memory controller.
+   */
+  uint8_t buffer_node_weights[NETIO_NUM_NODE_WEIGHTS];
+
+  /** Fixed virtual address for packet buffers.  Only valid when
+      ::NETIO_FIXED_BUFFER_VA is specified in @ref flags; see the
+      description of that flag for details.
+   */
+  void* fixed_buffer_va;
+
+  /**
+      Maximum number of outstanding send packet requests.  This value is
+      only relevant when an EPP is in use; it determines the number of
+      slots in the EPP's outgoing packet queue which this tile is allowed
+      to consume, and thus the number of packets which may be sent before
+      the sending tile must wait for an acknowledgment from the EPP.
+      Modifying this value is generally only helpful when using @ref
+      netio_send_packet_vector(), where it can help improve performance by
+      allowing a single vector send operation to process more packets.
+      Typically it is not specified, and the default, which divides the
+      outgoing packet slots evenly between all tiles on the chip, is used.
+
+      If a registration asks for more outgoing packet queue slots than are
+      available, ::NETIO_TOOMANY_XMIT will be returned.  The total number
+      of packet queue slots which are available for all tiles for each EPP
+      is subject to change, but is currently ::NETIO_TOTAL_SENDS_OUTSTANDING.
+
+
+      This value is ignored if ::NETIO_XMIT is not specified in flags.
+      If you want to specify a large value here for a specific tile, you are
+      advised to specify NETIO_NO_XMIT on other, non-transmitting tiles so
+      that they do not consume a default number of packet slots.  Any tile
+      transmitting is required to have at least ::NETIO_MIN_SENDS_OUTSTANDING
+      slots allocated to it; values less than that will be silently
+      increased by the NetIO library.
+   */
+  int num_sends_outstanding;
+}
+netio_input_config_t;
+
+
+/** Registration flags; used in the @ref netio_input_config_t structure.
+ * @addtogroup setup
+ */
+/** @{ */
+
+/** Fail a registration request if we can't put packet buffers
+    on the specified memory controllers. */
+#define NETIO_STRICT_HOMING   0x00000002
+
+/** This application expects no tags on its L2 headers. */
+#define NETIO_TAG_NONE        0x00000004
+
+/** This application expects Marvell extended tags on its L2 headers. */
+#define NETIO_TAG_MRVL        0x00000008
+
+/** This application expects Broadcom tags on its L2 headers. */
+#define NETIO_TAG_BRCM        0x00000010
+
+/** This registration may call routines which receive packets. */
+#define NETIO_RECV            0x00000020
+
+/** This registration may not call routines which receive packets. */
+#define NETIO_NO_RECV         0x00000040
+
+/** This registration may call routines which transmit packets. */
+#define NETIO_XMIT            0x00000080
+
+/** This registration may call routines which transmit packets with
+    checksum acceleration. */
+#define NETIO_XMIT_CSUM       0x00000100
+
+/** This registration may not call routines which transmit packets. */
+#define NETIO_NO_XMIT         0x00000200
+
+/** This registration wants NetIO buffers mapped at an application-specified
+    virtual address.
+
+    NetIO buffers are by default created by the TMC common memory facility,
+    which must be configured by a common ancestor of all processes sharing
+    a network interface.  When this flag is specified, NetIO buffers are
+    instead mapped at an address chosen by the application (and specified
+    in @ref netio_input_config_t::fixed_buffer_va).  This allows multiple
+    unrelated but cooperating processes to share a NetIO interface.
+    All processes sharing the same interface must specify this flag,
+    and all must specify the same fixed virtual address.
+
+    @ref netio_input_config_t::fixed_buffer_va must be a
+    multiple of 16 MB, and the packet buffers will occupy @ref
+    netio_input_config_t::total_buffer_size bytes of virtual address
+    space, beginning at that address.  If any of those virtual addresses
+    are currently occupied by other memory objects, like application or
+    shared library code or data, @ref netio_input_register() will return
+    ::NETIO_FAULT.  While it is impossible to provide a fixed_buffer_va
+    which will work for all applications, a good first guess might be to
+    use 0xb0000000 minus @ref netio_input_config_t::total_buffer_size.
+    If that fails, it might be helpful to consult the running application's
+    virtual address description file (/proc/<em>pid</em>/maps) to see
+    which regions of virtual address space are available.
+ */
+#define NETIO_FIXED_BUFFER_VA 0x00000400
+
+/** This registration call will not complete unless the network link
+    is up.  The process will wait several seconds for this to happen (the
+    precise interval is link-dependent), but if the link does not come up,
+    ::NETIO_LINK_DOWN will be returned.  This flag is the default if
+    ::NETIO_NOREQUIRE_LINK_UP is not specified.  Note that this flag by
+    itself does not request that the link be brought up; that can be done
+    with the ::NETIO_AUTO_LINK_UPDN or ::NETIO_AUTO_LINK_UP flags (the
+    latter is the default if no NETIO_AUTO_LINK_xxx flags are specified),
+    or by explicitly setting the link's desired state via netio_set().
+    If the link is not brought up by one of those methods, and this flag
+    is specified, the registration operation will return ::NETIO_LINK_DOWN.
+    This flag is ignored if it is specified along with ::NETIO_NO_XMIT and
+    ::NETIO_NO_RECV.  See @ref link for more information on link
+    management.
+ */
+#define NETIO_REQUIRE_LINK_UP    0x00000800
+
+/** This registration call will complete even if the network link is not up.
+    Whenever the link is not up, packets will not be sent or received:
+    netio_get_packet() will return ::NETIO_NOPKT once all queued packets
+    have been drained, and netio_send_packet() and similar routines will
+    return NETIO_QUEUE_FULL once the outgoing packet queue in the EPP
+    or the I/O shim is full.  See @ref link for more information on link
+    management.
+ */
+#define NETIO_NOREQUIRE_LINK_UP  0x00001000
+
+#ifndef __DOXYGEN__
+/*
+ * These are part of the implementation of the NETIO_AUTO_LINK_xxx flags,
+ * but should not be used directly by applications, and are thus not
+ * documented.
+ */
+#define _NETIO_AUTO_UP        0x00002000
+#define _NETIO_AUTO_DN        0x00004000
+#define _NETIO_AUTO_PRESENT   0x00008000
+#endif
+
+/** Set the desired state of the link to up, allowing any speeds which are
+    supported by the link hardware, as part of this registration operation.
+    Do not take down the link automatically.  This is the default if
+    no other NETIO_AUTO_LINK_xxx flags are specified.  This flag is ignored
+    if it is specified along with ::NETIO_NO_XMIT and ::NETIO_NO_RECV.
+    See @ref link for more information on link management.
+ */
+#define NETIO_AUTO_LINK_UP     (_NETIO_AUTO_PRESENT | _NETIO_AUTO_UP)
+
+/** Set the desired state of the link to up, allowing any speeds which are
+    supported by the link hardware, as part of this registration operation.
+    Set the desired state of the link to down the next time no tiles are
+    registered for packet reception or transmission.  This flag is ignored
+    if it is specified along with ::NETIO_NO_XMIT and ::NETIO_NO_RECV.
+    See @ref link for more information on link management.
+ */
+#define NETIO_AUTO_LINK_UPDN   (_NETIO_AUTO_PRESENT | _NETIO_AUTO_UP | \
+                                _NETIO_AUTO_DN)
+
+/** Set the desired state of the link to down the next time no tiles are
+    registered for packet reception or transmission.  This flag is ignored
+    if it is specified along with ::NETIO_NO_XMIT and ::NETIO_NO_RECV.
+    See @ref link for more information on link management.
+ */
+#define NETIO_AUTO_LINK_DN     (_NETIO_AUTO_PRESENT | _NETIO_AUTO_DN)
+
+/** Do not bring up the link automatically as part of this registration
+    operation.  Do not take down the link automatically.  This flag
+    is ignored if it is specified along with ::NETIO_NO_XMIT and
+    ::NETIO_NO_RECV.  See @ref link for more information on link management.
+  */
+#define NETIO_AUTO_LINK_NONE   _NETIO_AUTO_PRESENT
+
+
+/** Minimum number of receive packets. */
+#define NETIO_MIN_RECEIVE_PKTS            16
+
+/** Lower bound on the maximum number of receive packets; may be higher
+    than this on some interfaces. */
+#define NETIO_MAX_RECEIVE_PKTS           128
+
+/** Maximum number of send buffers, per packet size. */
+#define NETIO_MAX_SEND_BUFFERS            16
+
+/** Number of EPP queue slots, and thus outstanding sends, per EPP. */
+#define NETIO_TOTAL_SENDS_OUTSTANDING   2015
+
+/** Minimum number of EPP queue slots, and thus outstanding sends, per
+ *  transmitting tile. */
+#define NETIO_MIN_SENDS_OUTSTANDING       16
+
+
+/**@}*/
+
+#ifndef __DOXYGEN__
+
+/**
+ * An object for providing Ethernet packets to a process.
+ */
+struct __netio_queue_impl_t;
+
+/**
+ * An object for managing the user end of a NetIO queue.
+ */
+struct __netio_queue_user_impl_t;
+
+#endif /* !__DOXYGEN__ */
+
+
+/** A netio_queue_t describes a NetIO communications endpoint.
+ * @ingroup setup
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  uint8_t opaque[8];                 /**< This is an opaque structure. */
+#else
+  struct __netio_queue_impl_t* __system_part;    /**< The system part. */
+  struct __netio_queue_user_impl_t* __user_part; /**< The user part. */
+#ifdef _NETIO_PTHREAD
+  _netio_percpu_mutex_t lock;                    /**< Queue lock. */
+#endif
+#endif
+}
+netio_queue_t;
+
+
+/**
+ * @brief Packet send context.
+ *
+ * @ingroup egress
+ *
+ * Packet send context for use with netio_send_packet_prepare and _commit.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  uint8_t opaque[44];   /**< This is an opaque structure. */
+#else
+  uint8_t flags;        /**< Defined below */
+  uint8_t datalen;      /**< Number of valid words pointed to by data. */
+  uint32_t request[9];  /**< Request to be sent to the EPP or shim.  Note
+                             that this is smaller than the 11-word maximum
+                             request size, since some constant values are
+                             not saved in the context. */
+  uint32_t *data;       /**< Data to be sent to the EPP or shim via IDN. */
+#endif
+}
+netio_send_pkt_context_t;
+
+
+#ifndef __DOXYGEN__
+#define SEND_PKT_CTX_USE_EPP   1  /**< We're sending to an EPP. */
+#define SEND_PKT_CTX_SEND_CSUM 2  /**< Request includes a checksum. */
+#endif
+
+/**
+ * @brief Packet vector entry.
+ *
+ * @ingroup egress
+ *
+ * This data structure is used with netio_send_packet_vector() to send multiple
+ * packets with one NetIO call.  The structure should be initialized by
+ * calling netio_pkt_vector_set(), rather than by setting the fields
+ * directly.
+ *
+ * This structure is guaranteed to be a power of two in size, no
+ * bigger than one L2 cache line, and to be aligned modulo its size.
+ */
+typedef struct
+#ifndef __DOXYGEN__
+__attribute__((aligned(8)))
+#endif
+{
+  /** Reserved for use by the user application.  When initialized with
+   *  the netio_set_pkt_vector_entry() function, this field is guaranteed
+   *  to be visible to readers only after all other fields are already
+   *  visible.  This way it can be used as a valid flag or generation
+   *  counter. */
+  uint8_t user_data;
+
+  /* Structure members below this point should not be accessed directly by
+   * applications, as they may change in the future. */
+
+  /** Low 8 bits of the packet address to send.  The high bits are
+   *  acquired from the 'handle' field. */
+  uint8_t buffer_address_low;
+
+  /** Number of bytes to transmit. */
+  uint16_t size;
+
+  /** The raw handle from a netio_pkt_t.  If this is NETIO_PKT_HANDLE_NONE,
+   *  this vector entry will be skipped and no packet will be transmitted. */
+  netio_pkt_handle_t handle;
+}
+netio_pkt_vector_entry_t;
+
+
+/**
+ * @brief Initialize fields in a packet vector entry.
+ *
+ * @ingroup egress
+ *
+ * @param[out] v Pointer to the vector entry to be initialized.
+ * @param[in] pkt Packet to be transmitted when the vector entry is passed to
+ *        netio_send_packet_vector().  Note that the packet's attributes
+ *        (e.g., its L2 offset and length) are captured at the time this
+ *        routine is called; subsequent changes in those attributes will not
+ *        be reflected in the packet which is actually transmitted.
+ *        Changes in the packet's contents, however, will be so reflected.
+ *        If this is NULL, no packet will be transmitted.
+ * @param[in] user_data User data to be set in the vector entry.
+ *        This function guarantees that the "user_data" field will become
+ *        visible to a reader only after all other fields have become visible.
+ *        This allows a structure in a ring buffer to be written and read
+ *        by a polling reader without any locks or other synchronization.
+ */
+static __inline void
+netio_pkt_vector_set(volatile netio_pkt_vector_entry_t* v, netio_pkt_t* pkt,
+                     uint8_t user_data)
+{
+  if (pkt)
+  {
+    if (NETIO_PKT_IS_MINIMAL(pkt))
+    {
+      netio_pkt_minimal_metadata_t* mmd =
+        (netio_pkt_minimal_metadata_t*) &pkt->__metadata;
+      v->buffer_address_low = (uintptr_t) NETIO_PKT_L2_DATA_MM(mmd, pkt) & 0xFF;
+      v->size = NETIO_PKT_L2_LENGTH_MM(mmd, pkt);
+    }
+    else
+    {
+      netio_pkt_metadata_t* mda = &pkt->__metadata;
+      v->buffer_address_low = (uintptr_t) NETIO_PKT_L2_DATA_M(mda, pkt) & 0xFF;
+      v->size = NETIO_PKT_L2_LENGTH_M(mda, pkt);
+    }
+    v->handle.word = pkt->__packet.word;
+  }
+  else
+  {
+    v->handle.word = 0;   /* Set handle to NETIO_PKT_HANDLE_NONE. */
+  }
+
+  __asm__("" : : : "memory");
+
+  v->user_data = user_data;
+}
+
+
+/**
+ * Flags and structures for @ref netio_get() and @ref netio_set().
+ * @ingroup config
+ */
+
+/** @{ */
+/** Parameter class; addr is a NETIO_PARAM_xxx value. */
+#define NETIO_PARAM       0
+/** Interface MAC address. This address is only valid with @ref netio_get().
+ *  The value is a 6-byte MAC address.  Depending upon the overall system
+ *  design, a MAC address may or may not be available for each interface. */
+#define NETIO_PARAM_MAC        0
+
+/** Determine whether to suspend output on the receipt of pause frames.
+ *  If the value is nonzero, the I/O shim will suspend output when a pause
+ *  frame is received.  If the value is zero, pause frames will be ignored. */
+#define NETIO_PARAM_PAUSE_IN   1
+
+/** Determine whether to send pause frames if the I/O shim packet FIFOs are
+ *  nearly full.  If the value is zero, pause frames are not sent.  If
+ *  the value is nonzero, it is the delay value which will be sent in any
+ *  pause frames which are output, in units of 512 bit times. */
+#define NETIO_PARAM_PAUSE_OUT  2
+
+/** Jumbo frame support.  The value is a 4-byte integer.  If the value is
+ *  nonzero, the MAC will accept frames of up to 10240 bytes.  If the value
+ *  is zero, the MAC will only accept frames of up to 1544 bytes. */
+#define NETIO_PARAM_JUMBO      3
+
+/** I/O shim's overflow statistics register.  The value is two 16-bit integers.
+ *  The first 16-bit value (or the low 16 bits, if the value is treated as a
+ *  32-bit number) is the count of packets which were completely dropped and
+ *  not delivered by the shim.  The second 16-bit value (or the high 16 bits,
+ *  if the value is treated as a 32-bit number) is the count of packets
+ *  which were truncated and thus only partially delivered by the shim.  This
+ *  register is automatically reset to zero after it has been read.
+ */
+#define NETIO_PARAM_OVERFLOW   4
+
+/** IPP statistics.  This address is only valid with @ref netio_get().  The
+ *  value is a netio_stat_t structure.  Unlike the I/O shim statistics, the
+ *  IPP statistics are not all reset to zero on read; see the description
+ *  of the netio_stat_t for details. */
+#define NETIO_PARAM_STAT 5
+
+/** Possible link state.  The value is a combination of "NETIO_LINK_xxx"
+ *  flags.  With @ref netio_get(), this will indicate which flags are
+ *  actually supported by the hardware.
+ *
+ *  For historical reasons, specifying this value to netio_set() will have
+ *  the same behavior as using ::NETIO_PARAM_LINK_CONFIG, but this usage is
+ *  discouraged.
+ */
+#define NETIO_PARAM_LINK_POSSIBLE_STATE 6
+
+/** Link configuration. The value is a combination of "NETIO_LINK_xxx" flags.
+ *  With @ref netio_set(), this will attempt to immediately bring up the
+ *  link using whichever of the requested flags are supported by the
+ *  hardware, or take down the link if the flags are zero; if this is
+ *  not possible, an error will be returned.  Many programs will want
+ *  to use ::NETIO_PARAM_LINK_DESIRED_STATE instead.
+ *
+ *  For historical reasons, specifying this value to netio_get() will
+ *  have the same behavior as using ::NETIO_PARAM_LINK_POSSIBLE_STATE,
+ *  but this usage is discouraged.
+ */
+#define NETIO_PARAM_LINK_CONFIG NETIO_PARAM_LINK_POSSIBLE_STATE
+
+/** Current link state. This address is only valid with @ref netio_get().
+ *  The value is zero or more of the "NETIO_LINK_xxx" flags, ORed together.
+ *  If the link is down, the value ANDed with NETIO_LINK_SPEED will be
+ *  zero; if the link is up, the value ANDed with NETIO_LINK_SPEED will
+ *  result in exactly one of the NETIO_LINK_xxx values, indicating the
+ *  current speed. */
+#define NETIO_PARAM_LINK_CURRENT_STATE 7
+
+/** Variant symbol for current state, retained for compatibility with
+ *  pre-MDE-2.1 programs. */
+#define NETIO_PARAM_LINK_STATUS NETIO_PARAM_LINK_CURRENT_STATE
+
+/** Packet Coherence protocol. This address is only valid with @ref netio_get().
+ *  The value is nonzero if the interface is configured for cache-coherent DMA.
+ */
+#define NETIO_PARAM_COHERENT 8
+
+/** Desired link state. The value is a conbination of "NETIO_LINK_xxx"
+ *  flags, which specify the desired state for the link.  With @ref
+ *  netio_set(), this will, in the background, attempt to bring up the link
+ *  using whichever of the requested flags are reasonable, or take down the
+ *  link if the flags are zero.  The actual link up or down operation may
+ *  happen after this call completes.  If the link state changes in the
+ *  future, the system will continue to try to get back to the desired link
+ *  state; for instance, if the link is brought up successfully, and then
+ *  the network cable is disconnected, the link will go down.  However, the
+ *  desired state of the link is still up, so if the cable is reconnected,
+ *  the link will be brought up again.
+ *
+ *  With @ref netio_get(), this will indicate the desired state for the
+ *  link, as set with a previous netio_set() call, or implicitly by a
+ *  netio_input_register() or netio_input_unregister() operation.  This may
+ *  not reflect the current state of the link; to get that, use
+ *  ::NETIO_PARAM_LINK_CURRENT_STATE. */
+#define NETIO_PARAM_LINK_DESIRED_STATE 9
+
+/** NetIO statistics structure.  Retrieved using the ::NETIO_PARAM_STAT
+ *  address passed to @ref netio_get(). */
+typedef struct
+{
+  /** Number of packets which have been received by the IPP and forwarded
+   *  to a tile's receive queue for processing.  This value wraps at its
+   *  maximum, and is not cleared upon read. */
+  uint32_t packets_received;
+
+  /** Number of packets which have been dropped by the IPP, because they could
+   *  not be received, or could not be forwarded to a tile.  The former happens
+   *  when the IPP does not have a free packet buffer of suitable size for an
+   *  incoming frame.  The latter happens when all potential destination tiles
+   *  for a packet, as defined by the group, bucket, and queue configuration,
+   *  have full receive queues.   This value wraps at its maximum, and is not
+   *  cleared upon read. */
+  uint32_t packets_dropped;
+
+  /*
+   * Note: the #defines after each of the following four one-byte values
+   * denote their location within the third word of the netio_stat_t.  They
+   * are intended for use only by the IPP implementation and are thus omitted
+   * from the Doxygen output.
+   */
+
+  /** Number of packets dropped because no worker was able to accept a new
+   *  packet.  This value saturates at its maximum, and is cleared upon
+   *  read. */
+  uint8_t drops_no_worker;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_WORKER   0
+#endif
+
+  /** Number of packets dropped because no small buffers were available.
+   *  This value saturates at its maximum, and is cleared upon read. */
+  uint8_t drops_no_smallbuf;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_SMALLBUF 1
+#endif
+
+  /** Number of packets dropped because no large buffers were available.
+   *  This value saturates at its maximum, and is cleared upon read. */
+  uint8_t drops_no_largebuf;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_LARGEBUF 2
+#endif
+
+  /** Number of packets dropped because no jumbo buffers were available.
+   *  This value saturates at its maximum, and is cleared upon read. */
+  uint8_t drops_no_jumbobuf;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_JUMBOBUF 3
+#endif
+}
+netio_stat_t;
+
+
+/** Link can run, should run, or is running at 10 Mbps. */
+#define NETIO_LINK_10M         0x01
+
+/** Link can run, should run, or is running at 100 Mbps. */
+#define NETIO_LINK_100M        0x02
+
+/** Link can run, should run, or is running at 1 Gbps. */
+#define NETIO_LINK_1G          0x04
+
+/** Link can run, should run, or is running at 10 Gbps. */
+#define NETIO_LINK_10G         0x08
+
+/** Link should run at the highest speed supported by the link and by
+ *  the device connected to the link.  Only usable as a value for
+ *  the link's desired state; never returned as a value for the current
+ *  or possible states. */
+#define NETIO_LINK_ANYSPEED    0x10
+
+/** All legal link speeds. */
+#define NETIO_LINK_SPEED  (NETIO_LINK_10M  | \
+                           NETIO_LINK_100M | \
+                           NETIO_LINK_1G   | \
+                           NETIO_LINK_10G  | \
+                           NETIO_LINK_ANYSPEED)
+
+
+/** MAC register class.  Addr is a register offset within the MAC.
+ *  Registers within the XGbE and GbE MACs are documented in the Tile
+ *  Processor I/O Device Guide (UG104). MAC registers start at address
+ *  0x4000, and do not include the MAC_INTERFACE registers. */
+#define NETIO_MAC             1
+
+/** MDIO register class (IEEE 802.3 clause 22 format).  Addr is the "addr"
+ *  member of a netio_mdio_addr_t structure. */
+#define NETIO_MDIO            2
+
+/** MDIO register class (IEEE 802.3 clause 45 format).  Addr is the "addr"
+ *  member of a netio_mdio_addr_t structure. */
+#define NETIO_MDIO_CLAUSE45   3
+
+/** NetIO MDIO address type.  Retrieved or provided using the ::NETIO_MDIO
+ *  address passed to @ref netio_get() or @ref netio_set(). */
+typedef union
+{
+  struct
+  {
+    unsigned int reg:16;  /**< MDIO register offset.  For clause 22 access,
+                               must be less than 32. */
+    unsigned int phy:5;   /**< Which MDIO PHY to access. */
+    unsigned int dev:5;   /**< Which MDIO device to access within that PHY.
+                               Applicable for clause 45 access only; ignored
+                               for clause 22 access. */
+  }
+  bits;                   /**< Container for bitfields. */
+  uint64_t addr;          /**< Value to pass to @ref netio_get() or
+                           *   @ref netio_set(). */
+}
+netio_mdio_addr_t;
+
+/** @} */
+
+#endif /* __NETIO_INTF_H__ */
diff --git a/arch/tile/include/hv/syscall_public.h b/arch/tile/include/hv/syscall_public.h
new file mode 100644
index 000000000..9cc0837e6
--- /dev/null
+++ b/arch/tile/include/hv/syscall_public.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file syscall.h
+ * Indices for the hypervisor system calls that are intended to be called
+ * directly, rather than only through hypervisor-generated "glue" code.
+ */
+
+#ifndef _SYS_HV_INCLUDE_SYSCALL_PUBLIC_H
+#define _SYS_HV_INCLUDE_SYSCALL_PUBLIC_H
+
+/** Fast syscall flag bit location.  When this bit is set, the hypervisor
+ *  handles the syscall specially.
+ */
+#define HV_SYS_FAST_SHIFT                 14
+
+/** Fast syscall flag bit mask. */
+#define HV_SYS_FAST_MASK                  (1 << HV_SYS_FAST_SHIFT)
+
+/** Bit location for flagging fast syscalls that can be called from PL0. */
+#define HV_SYS_FAST_PLO_SHIFT             13
+
+/** Fast syscall allowing PL0 bit mask. */
+#define HV_SYS_FAST_PL0_MASK              (1 << HV_SYS_FAST_PLO_SHIFT)
+
+/** Perform an MF that waits for all victims to reach DRAM. */
+#define HV_SYS_fence_incoherent         (51 | HV_SYS_FAST_MASK \
+                                       | HV_SYS_FAST_PL0_MASK)
+
+#endif /* !_SYS_HV_INCLUDE_SYSCALL_PUBLIC_H */