2
0
mirror of https://github.com/xcat2/xNBA.git synced 2025-01-05 19:15:05 +00:00

Merge branch '3leaf'

This commit is contained in:
Michael Brown 2007-10-29 17:21:58 +00:00
commit 1620b3512c
10 changed files with 8058 additions and 1 deletions

View File

@ -152,6 +152,7 @@ SRCDIRS += drivers/scsi
SRCDIRS += drivers/ata
SRCDIRS += drivers/nvs
SRCDIRS += drivers/bitbash
SRCDIRS += drivers/infiniband
SRCDIRS += interface/pxe
SRCDIRS += tests
SRCDIRS += crypto crypto/axtls crypto/matrixssl

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,461 @@
#ifndef _ARBEL_H
#define _ARBEL_H
/** @file
*
* Mellanox Arbel Infiniband HCA driver
*
*/
#include <stdint.h>
#include <gpxe/uaccess.h>
#include "mlx_bitops.h"
#include "MT25218_PRM.h"
/*
* Hardware constants
*
*/
/* PCI BARs
*
* One BAR is used for the configuration registers and a second one
* for the User Access Region (UAR).
*/
#define ARBEL_PCI_CONFIG_BAR PCI_BASE_ADDRESS_0
#define ARBEL_PCI_CONFIG_BAR_SIZE 0x100000
#define ARBEL_PCI_UAR_BAR PCI_BASE_ADDRESS_2
/* NOTE(review): presumably the index of the UAR page used within the
* UAR BAR - confirm against the driver source.
*/
#define ARBEL_PCI_UAR_IDX 1
#define ARBEL_PCI_UAR_SIZE 0x1000
/* UAR context table (UCE) resource types */
#define ARBEL_UAR_RES_NONE 0x00
#define ARBEL_UAR_RES_CQ_CI 0x01
#define ARBEL_UAR_RES_CQ_ARM 0x02
#define ARBEL_UAR_RES_SQ 0x03
#define ARBEL_UAR_RES_RQ 0x04
#define ARBEL_UAR_RES_GROUP_SEP 0x07
/* Work queue entry and completion queue entry opcodes */
#define ARBEL_OPCODE_SEND 0x0a
#define ARBEL_OPCODE_RECV_ERROR 0xfe
#define ARBEL_OPCODE_SEND_ERROR 0xff
/* HCA command register opcodes
*
* All values fit in 12 bits, matching the opcode field extracted by
* ARBEL_HCR_OPCODE().
*/
#define ARBEL_HCR_QUERY_DEV_LIM 0x0003
#define ARBEL_HCR_QUERY_FW 0x0004
#define ARBEL_HCR_INIT_HCA 0x0007
#define ARBEL_HCR_CLOSE_HCA 0x0008
#define ARBEL_HCR_INIT_IB 0x0009
#define ARBEL_HCR_CLOSE_IB 0x000a
#define ARBEL_HCR_SW2HW_MPT 0x000d
#define ARBEL_HCR_MAP_EQ 0x0012
#define ARBEL_HCR_SW2HW_EQ 0x0013
#define ARBEL_HCR_HW2SW_EQ 0x0014
#define ARBEL_HCR_SW2HW_CQ 0x0016
#define ARBEL_HCR_HW2SW_CQ 0x0017
#define ARBEL_HCR_RST2INIT_QPEE 0x0019
#define ARBEL_HCR_INIT2RTR_QPEE 0x001a
#define ARBEL_HCR_RTR2RTS_QPEE 0x001b
#define ARBEL_HCR_2RST_QPEE 0x0021
#define ARBEL_HCR_MAD_IFC 0x0024
#define ARBEL_HCR_READ_MGM 0x0025
#define ARBEL_HCR_WRITE_MGM 0x0026
#define ARBEL_HCR_MGID_HASH 0x0027
#define ARBEL_HCR_RUN_FW 0x0ff6
#define ARBEL_HCR_DISABLE_LAM 0x0ff7
#define ARBEL_HCR_ENABLE_LAM 0x0ff8
#define ARBEL_HCR_UNMAP_ICM 0x0ff9
#define ARBEL_HCR_MAP_ICM 0x0ffa
#define ARBEL_HCR_UNMAP_ICM_AUX 0x0ffb
#define ARBEL_HCR_MAP_ICM_AUX 0x0ffc
#define ARBEL_HCR_SET_ICM_SIZE 0x0ffd
#define ARBEL_HCR_UNMAP_FA 0x0ffe
#define ARBEL_HCR_MAP_FA 0x0fff
/* Service types */
#define ARBEL_ST_UD 0x03
/* MTUs */
#define ARBEL_MTU_2048 0x04
/* Miscellaneous constants (these are not MTU encodings)
*
* NOTE(review): the exact hardware meaning of each value is not
* visible in this header; names suggest an "no event queue" EQ number,
* an invalid local key, the HCA page size in bytes and the offset of
* the post-send doorbell within the UAR - confirm against the PRM.
*/
#define ARBEL_NO_EQ 64
#define ARBEL_INVALID_LKEY 0x00000100UL
#define ARBEL_PAGE_SIZE 4096
#define ARBEL_DB_POST_SND_OFFSET 0x10
/*
* Datatypes that seem to be missing from the autogenerated documentation
*
*/
/** Pseudo-bit layout of an "mgm_hash" parameter
*
* Each pseudo_bit_t array element stands for one bit, so the array
* lengths are field widths in bits: 32 reserved bits, then a 16-bit
* hash field at bit offset 32, then 16 more reserved bits.
*/
struct arbelprm_mgm_hash_st {
pseudo_bit_t reserved0[0x00020];
/* -------------- */
pseudo_bit_t hash[0x00010];
pseudo_bit_t reserved1[0x00010];
} __attribute__ (( packed ));
/** Pseudo-bit layout of a scalar parameter
*
* 32 reserved bits followed by a 32-bit value field.
*/
struct arbelprm_scalar_parameter_st {
pseudo_bit_t reserved0[0x00020];
/* -------------- */
pseudo_bit_t value[0x00020];
} __attribute__ (( packed ));
/*
* Wrapper structures for hardware datatypes
*
*/
/* Each line below expands (via MLX_DECLARE_STRUCT) to the definition
* of "struct <name>", a correctly-sized wrapper around the pseudo-bit
* description "struct <name>_st". The wrapper is what the rest of
* the driver allocates and passes to the MLX_FILL/MLX_SET/MLX_GET
* macros.
*/
struct MLX_DECLARE_STRUCT ( arbelprm_access_lam );
struct MLX_DECLARE_STRUCT ( arbelprm_completion_queue_context );
struct MLX_DECLARE_STRUCT ( arbelprm_completion_queue_entry );
struct MLX_DECLARE_STRUCT ( arbelprm_completion_with_error );
struct MLX_DECLARE_STRUCT ( arbelprm_cq_arm_db_record );
struct MLX_DECLARE_STRUCT ( arbelprm_cq_ci_db_record );
struct MLX_DECLARE_STRUCT ( arbelprm_eqc );
struct MLX_DECLARE_STRUCT ( arbelprm_hca_command_register );
struct MLX_DECLARE_STRUCT ( arbelprm_init_hca );
struct MLX_DECLARE_STRUCT ( arbelprm_init_ib );
struct MLX_DECLARE_STRUCT ( arbelprm_mad_ifc );
struct MLX_DECLARE_STRUCT ( arbelprm_mgm_entry );
struct MLX_DECLARE_STRUCT ( arbelprm_mgm_hash );
struct MLX_DECLARE_STRUCT ( arbelprm_mpt );
struct MLX_DECLARE_STRUCT ( arbelprm_qp_db_record );
struct MLX_DECLARE_STRUCT ( arbelprm_qp_ee_state_transitions );
struct MLX_DECLARE_STRUCT ( arbelprm_query_dev_lim );
struct MLX_DECLARE_STRUCT ( arbelprm_query_fw );
struct MLX_DECLARE_STRUCT ( arbelprm_queue_pair_ee_context_entry );
struct MLX_DECLARE_STRUCT ( arbelprm_recv_wqe_segment_next );
struct MLX_DECLARE_STRUCT ( arbelprm_scalar_parameter );
struct MLX_DECLARE_STRUCT ( arbelprm_send_doorbell );
struct MLX_DECLARE_STRUCT ( arbelprm_ud_address_vector );
struct MLX_DECLARE_STRUCT ( arbelprm_virtual_physical_mapping );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_ctrl_send );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_data_ptr );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_next );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_ud );
/*
* Composite hardware datatypes
*
*/
/** Number of data pointer segments carried by each send WQE */
#define ARBEL_MAX_GATHER 1
/** Send work queue entry as laid out in memory
*
* Concatenation of the "next", "ctrl" and "ud" segments followed by
* ARBEL_MAX_GATHER data pointer segments.
*/
struct arbelprm_ud_send_wqe {
struct arbelprm_wqe_segment_next next;
struct arbelprm_wqe_segment_ctrl_send ctrl;
struct arbelprm_wqe_segment_ud ud;
struct arbelprm_wqe_segment_data_ptr data[ARBEL_MAX_GATHER];
} __attribute__ (( packed ));
/** Number of data pointer segments carried by each receive WQE */
#define ARBEL_MAX_SCATTER 1
/** Receive work queue entry as laid out in memory */
struct arbelprm_recv_wqe {
/* The autogenerated header is inconsistent between send and
* receive WQEs. The "ctrl" structure for receive WQEs is
* defined to include the "next" structure. Since the "ctrl"
* part of the "ctrl" structure contains only "reserved, must
* be zero" bits, we ignore its definition and provide
* something more usable.
*/
struct arbelprm_recv_wqe_segment_next next;
uint32_t ctrl[2]; /* All "reserved, must be zero" */
struct arbelprm_wqe_segment_data_ptr data[ARBEL_MAX_SCATTER];
} __attribute__ (( packed ));
/** Completion queue entry, viewed either as a normal completion or
* as a completion with error
*/
union arbelprm_completion_entry {
struct arbelprm_completion_queue_entry normal;
struct arbelprm_completion_with_error error;
} __attribute__ (( packed ));
/** Doorbell record, viewed as any of the three record types that
* share the doorbell record table
*/
union arbelprm_doorbell_record {
struct arbelprm_cq_arm_db_record cq_arm;
struct arbelprm_cq_ci_db_record cq_ci;
struct arbelprm_qp_db_record qp;
} __attribute__ (( packed ));
/** Doorbell register contents, accessible either by named field
* (send doorbell) or as two raw dwords
*/
union arbelprm_doorbell_register {
struct arbelprm_send_doorbell send;
uint32_t dword[2];
} __attribute__ (( packed ));
/** Management datagram, viewed either in the hardware MAD_IFC layout
* or as a generic Infiniband MAD
*/
union arbelprm_mad {
struct arbelprm_mad_ifc ifc;
union ib_mad mad;
} __attribute__ (( packed ));
/*
* gPXE-specific definitions
*
*/
/** Arbel device limits
*
* For each resource type this records how many entries are reserved
* and/or the size of one context entry.
*
* NOTE(review): presumably filled in from the QUERY_DEV_LIM command
* output - confirm in the driver source.
*/
struct arbel_dev_limits {
/** Number of reserved QPs */
unsigned int reserved_qps;
/** QP context entry size */
size_t qpc_entry_size;
/** Extended QP context entry size */
size_t eqpc_entry_size;
/** Number of reserved SRQs */
unsigned int reserved_srqs;
/** SRQ context entry size */
size_t srqc_entry_size;
/** Number of reserved EEs */
unsigned int reserved_ees;
/** EE context entry size */
size_t eec_entry_size;
/** Extended EE context entry size */
size_t eeec_entry_size;
/** Number of reserved CQs */
unsigned int reserved_cqs;
/** CQ context entry size */
size_t cqc_entry_size;
/** Number of reserved MTTs */
unsigned int reserved_mtts;
/** MTT entry size */
size_t mtt_entry_size;
/** Number of reserved MRWs */
unsigned int reserved_mrws;
/** MPT entry size */
size_t mpt_entry_size;
/** Number of reserved RDBs */
unsigned int reserved_rdbs;
/** EQ context entry size */
size_t eqc_entry_size;
/** Number of reserved UARs */
unsigned int reserved_uars;
};
/** Alignment of Arbel send work queue entries */
#define ARBEL_SEND_WQE_ALIGN 128
/** An Arbel send work queue entry
*
* The force_align member pads the union so that each entry occupies
* at least ARBEL_SEND_WQE_ALIGN bytes.
*/
union arbel_send_wqe {
struct arbelprm_ud_send_wqe ud;
uint8_t force_align[ARBEL_SEND_WQE_ALIGN];
} __attribute__ (( packed ));
/** An Arbel send work queue */
struct arbel_send_work_queue {
/** Doorbell record number */
unsigned int doorbell_idx;
/** Work queue entries */
union arbel_send_wqe *wqe;
/** Size of work queue
*
* NOTE(review): presumably the size in bytes of the wqe array -
* confirm against the allocation code.
*/
size_t wqe_size;
};
/** Alignment of Arbel receive work queue entries */
#define ARBEL_RECV_WQE_ALIGN 64
/** An Arbel receive work queue entry
*
* The force_align member pads the union so that each entry occupies
* at least ARBEL_RECV_WQE_ALIGN bytes.
*/
union arbel_recv_wqe {
struct arbelprm_recv_wqe recv;
uint8_t force_align[ARBEL_RECV_WQE_ALIGN];
} __attribute__ (( packed ));
/** An Arbel receive work queue */
struct arbel_recv_work_queue {
/** Doorbell record number */
unsigned int doorbell_idx;
/** Work queue entries */
union arbel_recv_wqe *wqe;
/** Size of work queue
*
* NOTE(review): presumably the size in bytes of the wqe array -
* confirm against the allocation code.
*/
size_t wqe_size;
};
/** Maximum number of allocatable queue pairs
*
* This is a policy decision, not a device limit. It also sizes the
* queue pair in-use bitmask and the send/receive doorbell regions.
*/
#define ARBEL_MAX_QPS 8
/** Base queue pair number */
#define ARBEL_QPN_BASE 0x550000
/** An Arbel queue pair
*
* Holds the Arbel-specific state for the send and receive halves of
* one queue pair.
*/
struct arbel_queue_pair {
/** Send work queue */
struct arbel_send_work_queue send;
/** Receive work queue */
struct arbel_recv_work_queue recv;
};
/** Maximum number of allocatable completion queues
*
* This is a policy decision, not a device limit. It also sizes the
* completion queue in-use bitmask and the CQ doorbell regions.
*/
#define ARBEL_MAX_CQS 8
/** An Arbel completion queue */
struct arbel_completion_queue {
/** Consumer counter doorbell record number */
unsigned int ci_doorbell_idx;
/** Arm queue doorbell record number */
unsigned int arm_doorbell_idx;
/** Completion queue entries */
union arbelprm_completion_entry *cqe;
/** Size of completion queue
*
* NOTE(review): presumably the size in bytes of the cqe array -
* confirm against the allocation code.
*/
size_t cqe_size;
};
/** An Arbel resource bitmask
*
* One element of an in-use bitmask array; each element holds
* 8 * sizeof ( arbel_bitmask_t ) = 32 bits.
*/
typedef uint32_t arbel_bitmask_t;
/** Size of an Arbel resource bitmask
*
* Evaluates to the number of arbel_bitmask_t elements needed to hold
* max_entries bits, rounding up to a whole element.
*/
#define ARBEL_BITMASK_SIZE(max_entries) \
( ( (max_entries) + ( 8 * sizeof ( arbel_bitmask_t ) ) - 1 ) / \
( 8 * sizeof ( arbel_bitmask_t ) ) )
/** An Arbel device
*
* Per-HCA driver state: register mappings, command mailboxes,
* firmware/ICM memory, doorbell records, resource allocation bitmasks
* and the device limits.
*/
struct arbel {
/** PCI configuration registers */
void *config;
/** PCI User Access Region */
void *uar;
/** Command input mailbox */
void *mailbox_in;
/** Command output mailbox */
void *mailbox_out;
/** Firmware area in external memory */
userptr_t firmware_area;
/** ICM size */
size_t icm_len;
/** ICM AUX size */
size_t icm_aux_len;
/** ICM area */
userptr_t icm;
/** Doorbell records */
union arbelprm_doorbell_record *db_rec;
/** Reserved LKey
*
* Used to get unrestricted memory access.
*/
unsigned long reserved_lkey;
/** Completion queue in-use bitmask (one bit per allocatable CQ) */
arbel_bitmask_t cq_inuse[ ARBEL_BITMASK_SIZE ( ARBEL_MAX_CQS ) ];
/** Queue pair in-use bitmask (one bit per allocatable QP) */
arbel_bitmask_t qp_inuse[ ARBEL_BITMASK_SIZE ( ARBEL_MAX_QPS ) ];
/** Device limits */
struct arbel_dev_limits limits;
};
/** Global protection domain
*
* A single fixed protection domain number.
* NOTE(review): presumably shared by every resource the driver
* creates - confirm in the driver source.
*/
#define ARBEL_GLOBAL_PD 0x123456
/** Memory key prefix
*
* Occupies the top byte of a 32-bit key value.
*/
#define ARBEL_MKEY_PREFIX 0x77000000UL
/*
* HCA commands
*
*/
/** Offset of the HCA command register block
*
* NOTE(review): presumably relative to the start of the configuration
* BAR - confirm in the driver source.
*/
#define ARBEL_HCR_BASE 0x80680
/** Byte offset of the x'th dword of the HCA command register */
#define ARBEL_HCR_REG(x) ( ARBEL_HCR_BASE + 4 * (x) )
/** Maximum time to wait for a command to complete, in milliseconds */
#define ARBEL_HCR_MAX_WAIT_MS 2000
/** Required alignment of command mailboxes, in bytes */
#define ARBEL_MBOX_ALIGN 4096
/** Size of a command mailbox, in bytes */
#define ARBEL_MBOX_SIZE 512
/* HCA command is split into
*
* bits 11:0 Opcode
* bit 12 Input uses mailbox
* bit 13 Output uses mailbox
* bits 22:14 Input parameter length (in dwords)
* bits 31:23 Output parameter length (in dwords)
*
* Encoding the information in this way allows us to cut out several
* parameters to the arbel_command() call.
*/
#define ARBEL_HCR_IN_MBOX 0x00001000UL
#define ARBEL_HCR_OUT_MBOX 0x00002000UL
/** Extract the 12-bit opcode from an encoded command */
#define ARBEL_HCR_OPCODE( _command ) ( (_command) & 0xfff )
/** Extract the input parameter length in bytes
*
* The dword count lives in bits 22:14; shifting right by 12 (rather
* than 14) leaves it in bits 10:2, i.e. already multiplied by four.
*/
#define ARBEL_HCR_IN_LEN( _command ) ( ( (_command) >> 12 ) & 0x7fc )
/** Extract the output parameter length in bytes
*
* The dword count lives in bits 31:23; shifting right by 21 (rather
* than 23) leaves it in bits 10:2, i.e. already multiplied by four.
*/
#define ARBEL_HCR_OUT_LEN( _command ) ( ( (_command) >> 21 ) & 0x7fc )
/** Build HCR command from component parts
*
* _in_len and _out_len are given in bytes and are stored as dword
* counts (divided by four). _in_mbox and _out_mbox are treated as
* booleans selecting the corresponding mailbox flag.
*/
#define ARBEL_HCR_INOUT_CMD( _opcode, _in_mbox, _in_len, \
_out_mbox, _out_len ) \
( (_opcode) | \
( (_in_mbox) ? ARBEL_HCR_IN_MBOX : 0 ) | \
( ( (_in_len) / 4 ) << 14 ) | \
( (_out_mbox) ? ARBEL_HCR_OUT_MBOX : 0 ) | \
( ( (_out_len) / 4 ) << 23 ) )
/** Build HCR command that has an input parameter only */
#define ARBEL_HCR_IN_CMD( _opcode, _in_mbox, _in_len ) \
ARBEL_HCR_INOUT_CMD ( _opcode, _in_mbox, _in_len, 0, 0 )
/** Build HCR command that has an output parameter only */
#define ARBEL_HCR_OUT_CMD( _opcode, _out_mbox, _out_len ) \
ARBEL_HCR_INOUT_CMD ( _opcode, 0, 0, _out_mbox, _out_len )
/** Build HCR command that has neither input nor output parameter */
#define ARBEL_HCR_VOID_CMD( _opcode ) \
ARBEL_HCR_INOUT_CMD ( _opcode, 0, 0, 0, 0 )
/*
* Doorbell record allocation
*
* The doorbell record map looks like:
*
* ARBEL_MAX_CQS * Arm completion queue doorbell
* ARBEL_MAX_QPS * Send work request doorbell
* Group separator
* ...(empty space)...
* ARBEL_MAX_QPS * Receive work request doorbell
* ARBEL_MAX_CQS * Completion queue consumer counter update doorbell
*
* The first two groups are indexed upwards from record 0; the last
* two groups are indexed downwards from the final record.
*/
/** Total number of doorbell records in the map */
#define ARBEL_MAX_DOORBELL_RECORDS 512
/** Index of the group separator record, immediately after the
* arm-CQ and send doorbell groups
*/
#define ARBEL_GROUP_SEPARATOR_DOORBELL ( ARBEL_MAX_CQS + ARBEL_MAX_QPS )
/**
 * Get arm completion queue doorbell index
 *
 * The arm-CQ doorbells occupy the very start of the doorbell record
 * map, so the index is simply the completion queue number offset.
 *
 * @v cqn_offset	Completion queue number offset
 * @ret doorbell_idx	Doorbell index
 */
static inline unsigned int
arbel_cq_arm_doorbell_idx ( unsigned int cqn_offset ) {
	const unsigned int doorbell_idx = cqn_offset;

	return doorbell_idx;
}
/**
 * Get send work request doorbell index
 *
 * The send doorbells follow directly after the ARBEL_MAX_CQS arm-CQ
 * doorbells at the start of the doorbell record map.
 *
 * @v qpn_offset	Queue pair number offset
 * @ret doorbell_idx	Doorbell index
 */
static inline unsigned int
arbel_send_doorbell_idx ( unsigned int qpn_offset ) {
	const unsigned int first_send_idx = ARBEL_MAX_CQS;

	return ( first_send_idx + qpn_offset );
}
/**
 * Get receive work request doorbell index
 *
 * The receive doorbells are allocated downwards, starting just below
 * the ARBEL_MAX_CQS consumer-counter doorbells that occupy the end of
 * the doorbell record map.
 *
 * @v qpn_offset	Queue pair number offset
 * @ret doorbell_idx	Doorbell index
 */
static inline unsigned int
arbel_recv_doorbell_idx ( unsigned int qpn_offset ) {
	const unsigned int last_recv_idx =
		( ARBEL_MAX_DOORBELL_RECORDS - ARBEL_MAX_CQS - 1 );

	return ( last_recv_idx - qpn_offset );
}
/**
 * Get completion queue consumer counter doorbell index
 *
 * The consumer-counter doorbells are allocated downwards from the
 * final record of the doorbell record map.
 *
 * @v cqn_offset	Completion queue number offset
 * @ret doorbell_idx	Doorbell index
 */
static inline unsigned int
arbel_cq_ci_doorbell_idx ( unsigned int cqn_offset ) {
	const unsigned int last_idx = ( ARBEL_MAX_DOORBELL_RECORDS - 1 );

	return ( last_idx - cqn_offset );
}
#endif /* _ARBEL_H */

View File

@ -0,0 +1,209 @@
#ifndef _MLX_BITOPS_H
#define _MLX_BITOPS_H
/*
* Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* @file
*
* Mellanox bit operations
*
*/
/* Datatype used to represent a bit in the Mellanox autogenerated headers
*
* Because each "bit" is one byte wide, offsetof() and sizeof() applied
* to a pseudo_bit_t structure yield bit offsets and bit widths of the
* real hardware structure.
*/
typedef unsigned char pseudo_bit_t;
/**
* Wrapper structure for pseudo_bit_t structures
*
* This structure provides a wrapper around the autogenerated
* pseudo_bit_t structures. It has the correct size, and also
* encapsulates type information about the underlying pseudo_bit_t
* structure, which allows the MLX_FILL etc. macros to work without
* requiring explicit type information.
*
* sizeof the pseudo_bit_t structure is the hardware structure's size
* in bits, so dividing by 8 and by 32 gives its size in bytes and in
* dwords respectively. The zero-length "dummy" pointer array adds no
* storage; it exists only so that MLX_PSEUDO_STRUCT can recover the
* pseudo_bit_t structure type.
*/
#define MLX_DECLARE_STRUCT( _structure ) \
_structure { \
union { \
uint8_t bytes[ sizeof ( struct _structure ## _st ) / 8 ]; \
uint32_t dwords[ sizeof ( struct _structure ## _st ) / 32 ]; \
struct _structure ## _st *dummy[0]; \
} u; \
}
/** Get pseudo_bit_t structure type from wrapper structure pointer
*
* Uses typeof on the (never evaluated) dereference of the wrapper's
* dummy member.
*/
#define MLX_PSEUDO_STRUCT( _ptr ) \
typeof ( *((_ptr)->u.dummy[0]) )
/** Bit offset of a field within a pseudo_bit_t structure
*
* The byte offset within the pseudo_bit_t structure equals the bit
* offset within the hardware structure.
*/
#define MLX_BIT_OFFSET( _structure_st, _field ) \
offsetof ( _structure_st, _field )
/** Dword offset of a field within a pseudo_bit_t structure */
#define MLX_DWORD_OFFSET( _structure_st, _field ) \
( MLX_BIT_OFFSET ( _structure_st, _field ) / 32 )
/** Dword bit offset of a field within a pseudo_bit_t structure
*
* Yes, using mod-32 would work, but would lose the check for the
* error of specifying a mismatched field name and dword index.
*/
#define MLX_DWORD_BIT_OFFSET( _structure_st, _index, _field ) \
( MLX_BIT_OFFSET ( _structure_st, _field ) - ( 32 * (_index) ) )
/** Bit width of a field within a pseudo_bit_t structure
*
* sizeof is applied to the member of a null pointer; the expression
* is never evaluated.
*/
#define MLX_BIT_WIDTH( _structure_st, _field ) \
sizeof ( ( ( _structure_st * ) NULL )->_field )
/** Bit mask for a field within a pseudo_bit_t structure
*
* Produces a right-aligned mask with one set bit per bit of field
* width (an all-ones dword shifted right by 32 minus the width).
*/
#define MLX_BIT_MASK( _structure_st, _field ) \
( ( ~( ( uint32_t ) 0 ) ) >> \
( 32 - MLX_BIT_WIDTH ( _structure_st, _field ) ) )
/*
* Assemble native-endian dword from named fields and values
*
* MLX_ASSEMBLE_1 shifts a single value into the position of its field
* within dword _index; the value is not masked to the field width.
* MLX_ASSEMBLE_n (n = 2..6) ORs together n such field/value pairs
* (passed as alternating variadic arguments) by peeling off the first
* pair and delegating the remainder to MLX_ASSEMBLE_(n-1).
*/
#define MLX_ASSEMBLE_1( _structure_st, _index, _field, _value ) \
( (_value) << MLX_DWORD_BIT_OFFSET ( _structure_st, _index, _field ) )
#define MLX_ASSEMBLE_2( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_1 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_3( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_2 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_4( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_3 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_5( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_4 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_6( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_5 ( _structure_st, _index, __VA_ARGS__ ) )
/*
* Build native-endian (positive) dword bitmasks from named fields
*
* MLX_MASK_1 yields a mask with ones covering the named field's
* position within dword _index. MLX_MASK_n (n = 2..6) ORs together
* the masks of n named fields by peeling off the first field and
* delegating the remainder to MLX_MASK_(n-1).
*/
#define MLX_MASK_1( _structure_st, _index, _field ) \
( MLX_BIT_MASK ( _structure_st, _field ) << \
MLX_DWORD_BIT_OFFSET ( _structure_st, _index, _field ) )
#define MLX_MASK_2( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_1 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_3( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_2 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_4( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_3 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_5( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_4 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_6( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_5 ( _structure_st, _index, __VA_ARGS__ ) )
/*
* Populate big-endian dwords from named fields and values
*
* MLX_FILL overwrites the whole of dword _index in the wrapper
* structure with the (native-endian) value _assembled, converting it
* to big-endian on the way. Any field of that dword which is not
* named by the caller therefore ends up as zero.
*
* MLX_FILL_n (n = 1..6) takes n field/value pairs as variadic
* arguments, assembles them with MLX_ASSEMBLE_n using the pseudo_bit_t
* type recovered from _ptr, and stores the result via MLX_FILL.
*/
#define MLX_FILL( _ptr, _index, _assembled ) \
do { \
uint32_t *__ptr = &(_ptr)->u.dwords[(_index)]; \
uint32_t __assembled = (_assembled); \
*__ptr = cpu_to_be32 ( __assembled ); \
} while ( 0 )
#define MLX_FILL_1( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_1 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_2( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_2 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_3( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_3 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_4( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_4 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_5( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_5 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_6( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_6 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
/*
* Modify big-endian dword using named field and value
*
* Read-modify-write of a single field: the dword containing _field is
* located automatically, converted to native endianness, the field's
* bits are cleared, the new value is ORed in, and the dword is written
* back in big-endian form. Other fields of the same dword are
* preserved. The value is not masked to the field width, so an
* over-wide _value would spill into higher fields.
*/
#define MLX_SET( _ptr, _field, _value ) \
do { \
unsigned int __index = \
MLX_DWORD_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
uint32_t *__ptr = &(_ptr)->u.dwords[__index]; \
uint32_t __value = be32_to_cpu ( *__ptr ); \
__value &= ~( MLX_MASK_1 ( MLX_PSEUDO_STRUCT ( _ptr ), \
__index, _field ) ); \
__value |= MLX_ASSEMBLE_1 ( MLX_PSEUDO_STRUCT ( _ptr ), \
__index, _field, _value ); \
*__ptr = cpu_to_be32 ( __value ); \
} while ( 0 )
/*
* Extract value of named field
*
* Written as a GCC statement expression so that it can be used as a
* value: the dword containing _field is located automatically,
* converted from big-endian to native endianness, shifted right so
* that the field starts at bit 0, and masked to the field width. The
* result is a uint32_t.
*/
#define MLX_GET( _ptr, _field ) \
( { \
unsigned int __index = \
MLX_DWORD_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
uint32_t *__ptr = &(_ptr)->u.dwords[__index]; \
uint32_t __value = be32_to_cpu ( *__ptr ); \
__value >>= \
MLX_DWORD_BIT_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), \
__index, _field ); \
__value &= \
MLX_BIT_MASK ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
__value; \
} )
#endif /* _MLX_BITOPS_H */

930
src/drivers/net/ipoib.c Normal file
View File

@ -0,0 +1,930 @@
/*
* Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include "timer.h"
#include <gpxe/if_arp.h>
#include <gpxe/iobuf.h>
#include <gpxe/netdevice.h>
#include <gpxe/infiniband.h>
#include <gpxe/ipoib.h>
/** @file
*
* IP over Infiniband
*/
/** IPoIB MTU */
#define IPOIB_MTU 2048
/* Queue sizing
*
* Two queue sets exist per device: one for data traffic and one for
* metadata traffic. Each has its own send, receive and completion
* queue depth.
*/
/** Number of IPoIB data send work queue entries */
#define IPOIB_DATA_NUM_SEND_WQES 2
/** Number of IPoIB data receive work queue entries */
#define IPOIB_DATA_NUM_RECV_WQES 4
/** Number of IPoIB data completion entries */
#define IPOIB_DATA_NUM_CQES 8
/** Number of IPoIB metadata send work queue entries */
#define IPOIB_META_NUM_SEND_WQES 2
/** Number of IPoIB metadata receive work queue entries */
#define IPOIB_META_NUM_RECV_WQES 2
/** Number of IPoIB metadata completion entries */
#define IPOIB_META_NUM_CQES 8
/** An IPoIB queue set
*
* A completion queue plus the queue pair attached to it, together
* with bookkeeping for how many receive buffers are posted.
*/
struct ipoib_queue_set {
/** Completion queue */
struct ib_completion_queue *cq;
/** Queue pair */
struct ib_queue_pair *qp;
/** Receive work queue fill level */
unsigned int recv_fill;
/** Receive work queue maximum fill level
*
* Set to the number of receive work queue entries when the
* queue set is created.
*/
unsigned int recv_max_fill;
};
/** An IPoIB device */
struct ipoib_device {
/** Network device */
struct net_device *netdev;
/** Underlying Infiniband device */
struct ib_device *ibdev;
/** Data queue set (used to send network packets) */
struct ipoib_queue_set data;
/** Metadata queue set (used to send path record and multicast
* membership requests)
*/
struct ipoib_queue_set meta;
/** Broadcast GID */
struct ib_gid broadcast_gid;
/** Broadcast LID */
unsigned int broadcast_lid;
/** Joined to broadcast group */
int broadcast_joined;
/** Data queue key */
unsigned long data_qkey;
};
/**
* IPoIB path cache entry
*
* This serves a similar role to the ARP cache for Ethernet. (ARP
* *is* used on IPoIB; we have two caches to maintain.)
*
* Entries are looked up by destination GID; the remaining fields are
* copied into the address vector when transmitting to that GID.
*/
struct ipoib_cached_path {
/** Destination GID */
struct ib_gid gid;
/** Destination LID */
unsigned int dlid;
/** Service level */
unsigned int sl;
/** Rate */
unsigned int rate;
};
/** Number of IPoIB path cache entries */
#define IPOIB_NUM_CACHED_PATHS 2
/** IPoIB path cache
*
* A single cache shared by all IPoIB devices (file-scope static).
*/
static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
/** Oldest IPoIB path cache entry index
*
* NOTE(review): presumably the next entry to be overwritten when a
* new path record arrives - confirm in the receive handler.
*/
static unsigned int ipoib_path_cache_idx = 0;
/** TID half used to identify get path record replies
*
* Stored in the first TID word of outgoing path record requests.
*/
#define IPOIB_TID_GET_PATH_REC 0x11111111UL
/** TID half used to identify multicast member record replies
*
* Stored in the first TID word of outgoing membership requests.
*/
#define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
/** IPoIB metadata TID
*
* Incremented for every metadata request sent; stored in the second
* TID word of the request.
*/
static uint32_t ipoib_meta_tid = 0;
/** IPv4 broadcast GID */
static const struct ib_gid ipv4_broadcast_gid = {
{ { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
};
/** Maximum time we will wait for the broadcast join to succeed */
#define IPOIB_JOIN_MAX_DELAY_MS 1000
/****************************************************************************
*
* IPoIB link layer
*
****************************************************************************
*/
/** Broadcast QPN used in IPoIB MAC addresses
*
* This is a guaranteed invalid real QPN
*/
#define IPOIB_BROADCAST_QPN 0xffffffffUL
/** Broadcast IPoIB address
*
* Only the QPN is set (to the broadcast marker); the GID portion is
* left zero-initialised. The transmit path recognises broadcast
* frames by comparing the destination QPN against this marker.
*/
static struct ipoib_mac ipoib_broadcast = {
.qpn = ntohl ( IPOIB_BROADCAST_QPN ),
};
/**
 * Transmit IPoIB packet
 *
 * @v iobuf		I/O buffer
 * @v netdev		Network device
 * @v net_protocol	Network-layer protocol
 * @v ll_dest		Link-layer destination address
 * @ret rc		Return status code
 *
 * Pushes an IPoIB link-layer header onto the front of the buffer,
 * records the destination address and protocol in it, and passes the
 * buffer on to the network device for transmission.
 */
static int ipoib_tx ( struct io_buffer *iobuf, struct net_device *netdev,
		      struct net_protocol *net_protocol,
		      const void *ll_dest ) {
	struct ipoib_hdr *hdr;

	/* Make room for the link-layer header */
	hdr = iob_push ( iobuf, sizeof ( *hdr ) );

	/* Pseudo-header carries the destination link-layer address */
	memcpy ( &hdr->pseudo.peer, ll_dest, sizeof ( hdr->pseudo.peer ) );

	/* Real header carries the protocol and a zeroed reserved field */
	hdr->real.proto = net_protocol->net_proto;
	hdr->real.reserved = 0;

	/* Queue for transmission */
	return netdev_tx ( netdev, iobuf );
}
/**
 * Process received IPoIB packet
 *
 * @v iobuf		I/O buffer
 * @v netdev		Network device
 * @ret rc		Return status code
 *
 * Removes the IPoIB link-layer header and hands the remainder to the
 * network-layer protocol named in that header.  A packet shorter than
 * the header is freed and rejected with -EINVAL.
 */
static int ipoib_rx ( struct io_buffer *iobuf, struct net_device *netdev ) {
	struct ipoib_hdr *hdr = iobuf->data;

	if ( iob_len ( iobuf ) >= sizeof ( *hdr ) ) {
		/* Drop the header from the buffer; "hdr" still
		 * points at the (now stripped) header bytes, which
		 * remain readable.
		 */
		iob_pull ( iobuf, sizeof ( *hdr ) );

		/* Pass payload up the stack */
		return net_rx ( iobuf, netdev, hdr->real.proto,
				&hdr->pseudo.peer );
	}

	/* Runt packet */
	DBG ( "IPoIB packet too short for link-layer header\n" );
	DBG_HD ( iobuf->data, iob_len ( iobuf ) );
	free_iob ( iobuf );
	return -EINVAL;
}
/**
 * Transcribe IPoIB address
 *
 * @v ll_addr	Link-layer address
 * @ret string	Link-layer address in human-readable format
 *
 * The result is five colon-separated 8-digit hex groups: the QPN
 * followed by the four dwords of the GID.  It is held in a static
 * buffer, so each call overwrites the previous result.
 */
const char * ipoib_ntoa ( const void *ll_addr ) {
	/* 5 groups of 8 hex digits + 4 separators + NUL */
	static char text[45];
	const struct ipoib_mac *addr = ll_addr;

	snprintf ( text, sizeof ( text ), "%08lx:%08lx:%08lx:%08lx:%08lx",
		   htonl ( addr->qpn ),
		   htonl ( addr->gid.u.dwords[0] ),
		   htonl ( addr->gid.u.dwords[1] ),
		   htonl ( addr->gid.u.dwords[2] ),
		   htonl ( addr->gid.u.dwords[3] ) );
	return text;
}
/** IPoIB protocol
*
* Link-layer protocol descriptor tying together the IPoIB address and
* header lengths, the broadcast address, and the transmit, receive
* and address-transcription handlers defined in this file.
*/
struct ll_protocol ipoib_protocol __ll_protocol = {
.name = "IPoIB",
.ll_proto = htons ( ARPHRD_INFINIBAND ),
.ll_addr_len = IPOIB_ALEN,
.ll_header_len = IPOIB_HLEN,
.ll_broadcast = ( uint8_t * ) &ipoib_broadcast,
.tx = ipoib_tx,
.rx = ipoib_rx,
.ntoa = ipoib_ntoa,
};
/****************************************************************************
*
* IPoIB network device
*
****************************************************************************
*/
/**
 * Destroy queue set
 *
 * @v ipoib		IPoIB device
 * @v qset		Queue set
 *
 * Destroys the queue pair and then the completion queue, skipping
 * whichever has not been created, and finally zeroes the whole queue
 * set so that it may safely be destroyed again or reused.
 */
static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
				 struct ipoib_queue_set *qset ) {
	struct ib_device *hca = ipoib->ibdev;

	/* Queue pair first, since it refers to the completion queue */
	if ( qset->qp ) {
		ib_destroy_qp ( hca, qset->qp );
	}
	if ( qset->cq ) {
		ib_destroy_cq ( hca, qset->cq );
	}

	/* Forget everything, including the fill levels */
	memset ( qset, 0, sizeof ( *qset ) );
}
/**
 * Create queue set
 *
 * @v ipoib		IPoIB device
 * @v qset		Queue set
 * @v num_cqes		Number of completion queue entries
 * @v num_send_wqes	Number of send work queue entries
 * @v num_recv_wqes	Number of receive work queue entries
 * @v qkey		Queue key
 * @ret rc		Return status code
 *
 * Allocates a completion queue and a queue pair whose send and
 * receive sides both complete to that queue.  On failure the queue
 * set is torn down (and zeroed) and -ENOMEM is returned.
 */
static int ipoib_create_qset ( struct ipoib_device *ipoib,
			       struct ipoib_queue_set *qset,
			       unsigned int num_cqes,
			       unsigned int num_send_wqes,
			       unsigned int num_recv_wqes,
			       unsigned long qkey ) {
	struct ib_device *hca = ipoib->ibdev;
	/* Both failure modes are allocation failures */
	int rc = -ENOMEM;

	/* Remember how many receive buffers may be posted */
	qset->recv_max_fill = num_recv_wqes;

	/* Completion queue */
	qset->cq = ib_create_cq ( hca, num_cqes );
	if ( ! qset->cq ) {
		DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
		       ipoib );
		goto fail;
	}

	/* Queue pair, with send and receive sharing the one CQ */
	qset->qp = ib_create_qp ( hca, num_send_wqes, qset->cq,
				  num_recv_wqes, qset->cq, qkey );
	if ( ! qset->qp ) {
		DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
		       ipoib );
		goto fail;
	}

	/* Let completion handlers find the network device */
	qset->qp->owner_priv = ipoib->netdev;

	return 0;

 fail:
	ipoib_destroy_qset ( ipoib, qset );
	return rc;
}
/**
* Find path cache entry by GID
*
* @v gid GID
* @ret entry Path cache entry, or NULL
*/
static struct ipoib_cached_path *
ipoib_find_cached_path ( struct ib_gid *gid ) {
struct ipoib_cached_path *path;
unsigned int i;
for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
path = &ipoib_path_cache[i];
if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
return path;
}
DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
return NULL;
}
/**
 * Transmit path record request
 *
 * @v ipoib		IPoIB device
 * @v gid		Destination GID
 * @ret rc		Return status code
 *
 * Builds a subnet administration "get path record" MAD asking for
 * the path from this port's GID to the destination GID, and posts it
 * on the metadata queue pair addressed to the subnet manager.
 * Returns -ENOMEM if no I/O buffer is available, or the error from
 * posting the send (in which case the buffer is freed here).
 */
static int ipoib_get_path_record ( struct ipoib_device *ipoib,
				   struct ib_gid *gid ) {
	struct ib_device *ibdev = ipoib->ibdev;
	struct ib_mad_path_record *request;
	struct ib_address_vector av;
	struct io_buffer *iobuf;
	int rc;

	/* Obtain a zeroed buffer exactly the size of the request */
	iobuf = alloc_iob ( sizeof ( *request ) );
	if ( ! iobuf )
		return -ENOMEM;
	iob_put ( iobuf, sizeof ( *request ) );
	request = iobuf->data;
	memset ( request, 0, sizeof ( *request ) );

	/* MAD header */
	request->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
	request->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
	request->mad_hdr.class_version = 2;
	request->mad_hdr.method = IB_MGMT_METHOD_GET;
	request->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
	/* First TID word tags the request type, second is a counter */
	request->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
	request->mad_hdr.tid[1] = ipoib_meta_tid++;

	/* Query keyed on destination and source GIDs */
	request->sa_hdr.comp_mask[1] =
		htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
	memcpy ( &request->dgid, gid, sizeof ( request->dgid ) );
	memcpy ( &request->sgid, &ibdev->port_gid,
		 sizeof ( request->sgid ) );

	/* Address the request to the subnet manager */
	memset ( &av, 0, sizeof ( av ) );
	av.dlid = ibdev->sm_lid;
	av.dest_qp = IB_SA_QPN;
	av.qkey = IB_GLOBAL_QKEY;

	/* Send via the metadata queue pair */
	rc = ib_post_send ( ibdev, ipoib->meta.qp, &av, iobuf );
	if ( rc == 0 )
		return 0;

	DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
	       ipoib, strerror ( rc ) );
	free_iob ( iobuf );
	return rc;
}
/**
* Transmit multicast group membership request
*
* @v ipoib IPoIB device
* @v gid Multicast GID
* @v join Join (rather than leave) group
* @ret rc Return status code
*/
static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
struct ib_gid *gid, int join ) {
struct ib_device *ibdev = ipoib->ibdev;
struct io_buffer *iobuf;
struct ib_mad_mc_member_record *mc_member_record;
struct ib_address_vector av;
int rc;
/* Allocate I/O buffer */
iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
if ( ! iobuf )
return -ENOMEM;
iob_put ( iobuf, sizeof ( *mc_member_record ) );
mc_member_record = iobuf->data;
memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
/* Construct path record request */
mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
mc_member_record->mad_hdr.class_version = 2;
mc_member_record->mad_hdr.method =
( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
mc_member_record->sa_hdr.comp_mask[1] =
htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
IB_SA_MCMEMBER_REC_JOIN_STATE );
mc_member_record->scope__join_state = 1;
memcpy ( &mc_member_record->mgid, gid,
sizeof ( mc_member_record->mgid ) );
memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
sizeof ( mc_member_record->port_gid ) );
/* Construct address vector */
memset ( &av, 0, sizeof ( av ) );
av.dlid = ibdev->sm_lid;
av.dest_qp = IB_SA_QPN;
av.qkey = IB_GLOBAL_QKEY;
/* Post send request */
if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
iobuf ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
ipoib, strerror ( rc ) );
free_iob ( iobuf );
return rc;
}
return 0;
}
/**
 * Transmit packet via IPoIB network device
 *
 * @v netdev		Network device
 * @v iobuf		I/O buffer
 * @ret rc		Return status code
 *
 * Strips the pseudo-header (which carries the destination link-layer
 * address), builds an address vector from it and posts the packet on
 * the data queue pair.  Broadcast destinations use the device's
 * broadcast LID and GID.  Unicast destinations need a path cache
 * entry; when there is none, a path record request is sent instead
 * and the packet itself is completed without being transmitted.
 *
 * NOTE(review): on the cache-miss path the buffer is completed here
 * and the request's status is then returned; confirm that the caller
 * does not complete the same buffer again when that status is
 * non-zero.
 */
static int ipoib_transmit ( struct net_device *netdev,
			    struct io_buffer *iobuf ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct ib_device *ibdev = ipoib->ibdev;
	struct ipoib_pseudo_hdr *pshdr = iobuf->data;
	struct ipoib_cached_path *cached;
	struct ib_address_vector av;
	struct ib_gid *dest_gid;
	int rc;

	/* Packet must at least contain the pseudo-header */
	if ( iob_len ( iobuf ) < sizeof ( *pshdr ) ) {
		DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
		return -EINVAL;
	}

	/* Remove the pseudo-header; "pshdr" remains readable */
	iob_pull ( iobuf, ( sizeof ( *pshdr ) ) );

	/* Fields common to broadcast and unicast */
	memset ( &av, 0, sizeof ( av ) );
	av.qkey = IB_GLOBAL_QKEY;
	av.gid_present = 1;

	if ( pshdr->peer.qpn != htonl ( IPOIB_BROADCAST_QPN ) ) {
		/* Unicast: path details come from the cache */
		cached = ipoib_find_cached_path ( &pshdr->peer.gid );
		if ( ! cached ) {
			/* Unknown path: ask for it and give up on
			 * this packet.
			 */
			rc = ipoib_get_path_record ( ipoib,
						     &pshdr->peer.gid );
			netdev_tx_complete ( netdev, iobuf );
			return rc;
		}
		av.dest_qp = ntohl ( pshdr->peer.qpn );
		av.dlid = cached->dlid;
		av.rate = cached->rate;
		av.sl = cached->sl;
		dest_gid = &pshdr->peer.gid;
	} else {
		/* Broadcast: use the device's broadcast group */
		av.dest_qp = IB_BROADCAST_QPN;
		av.dlid = ipoib->broadcast_lid;
		dest_gid = &ipoib->broadcast_gid;
	}
	memcpy ( &av.gid, dest_gid, sizeof ( av.gid ) );

	return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
}
/**
 * Handle IPoIB data send completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v completion	Completion
 * @v iobuf		I/O buffer
 *
 * Hands the buffer back to the network device, reporting -EIO when
 * the completion carries a non-zero syndrome.
 */
static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
				       struct ib_queue_pair *qp,
				       struct ib_completion *completion,
				       struct io_buffer *iobuf ) {
	struct net_device *netdev = qp->owner_priv;
	int rc = 0;

	if ( completion->syndrome )
		rc = -EIO;

	netdev_tx_complete_err ( netdev, iobuf, rc );
}
/**
 * Handle IPoIB data receive completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v completion	Completion
 * @v iobuf		I/O buffer
 *
 * Strips the global route header, makes room for the IPoIB
 * pseudo-header and passes the packet up to the network stack.
 * Packets with an error syndrome, or too short to contain the
 * expected headers, are reported as -EIO receive errors.  The
 * receive fill level is decremented in every case.
 */
static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
				       struct ib_queue_pair *qp,
				       struct ib_completion *completion,
				       struct io_buffer *iobuf ) {
	struct net_device *netdev = qp->owner_priv;
	struct ipoib_device *ipoib = netdev->priv;
	struct ipoib_pseudo_hdr *ipoib_pshdr;

	/* Hardware-reported failure */
	if ( completion->syndrome )
		goto err;

	/* Account for the received bytes, then strip the GRH */
	iob_put ( iobuf, completion->len );
	if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
		DBGC ( ipoib, "IPoIB %p received data packet too short to "
		       "contain GRH\n", ipoib );
		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
		goto err;
	}
	iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );

	/* What remains must hold at least the real IPoIB header */
	if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
		DBGC ( ipoib, "IPoIB %p received data packet too short to "
		       "contain IPoIB header\n", ipoib );
		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
		goto err;
	}

	/* Reserve space for the pseudo-header (contents not filled in) */
	ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
	/* FIXME: fill in a MAC address for the sake of AoE! */

	netdev_rx ( netdev, iobuf );
	goto done;

 err:
	netdev_rx_err ( netdev, iobuf, -EIO );
 done:
	ipoib->data.recv_fill--;
}
/**
 * Handle IPoIB metadata send completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v completion	Completion
 * @v iobuf		I/O buffer
 *
 * Logs any error syndrome and frees the buffer; metadata requests
 * are not retried here.
 */
static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
				       struct ib_queue_pair *qp,
				       struct ib_completion *completion,
				       struct io_buffer *iobuf ) {
	struct net_device *netdev = qp->owner_priv;
	struct ipoib_device *ipoib = netdev->priv;

	if ( completion->syndrome != 0 ) {
		DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
		       ipoib, completion->syndrome );
	}

	/* The buffer is ours regardless of outcome */
	free_iob ( iobuf );
}
/**
 * Handle received IPoIB path record
 *
 * @v ipoib		IPoIB device
 * @v path_record	Path record
 *
 * Overwrites the path cache slot at the current cache index with the
 * destination GID, LID, service level and rate from the record, then
 * advances the index, wrapping after IPOIB_NUM_CACHED_PATHS entries.
 */
static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
				     struct ib_mad_path_record *path_record ) {
	struct ipoib_cached_path *path =
		&ipoib_path_cache[ipoib_path_cache_idx];

	/* Fill in the cache entry */
	path->rate = ( path_record->rate_selector__rate & 0x3f );
	path->sl = ( path_record->reserved__sl & 0x0f );
	path->dlid = ntohs ( path_record->dlid );
	memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );

	DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
	      htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
	      htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
	      path->dlid, path->sl, path->rate );

	/* Move on to the next slot, wrapping at the end of the cache */
	if ( ++ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
		ipoib_path_cache_idx = 0;
}
/**
 * Handle received IPoIB multicast membership record
 *
 * @v ipoib		IPoIB device
 * @v mc_member_record	Multicast membership record
 *
 * Stores the multicast LID, the queue key and the join state (low
 * four bits of the scope/join-state byte) in the IPoIB device.
 */
static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
			  struct ib_mad_mc_member_record *mc_member_record ) {

	/* Record parameters (wire fields are converted to host order) */
	ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
	ipoib->data_qkey = ntohl ( mc_member_record->qkey );
	ipoib->broadcast_joined =
		( mc_member_record->scope__join_state & 0x0f );

	DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
	       ipoib, ( ipoib->broadcast_joined ? "joined" : "left" ),
	       ipoib->data_qkey, ipoib->broadcast_lid );
}
/**
 * Handle IPoIB metadata receive completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v completion	Completion
 * @v iobuf		I/O buffer
 *
 * Validates the received management datagram (completion syndrome,
 * GRH present, full MAD present, zero MAD status) and dispatches it
 * by the request tag held in the first transaction ID word.  The
 * buffer is always freed and the metadata receive fill level is
 * always decremented.
 */
static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
				       struct ib_queue_pair *qp,
				       struct ib_completion *completion,
				       struct io_buffer *iobuf ) {
	struct net_device *netdev = qp->owner_priv;
	struct ipoib_device *ipoib = netdev->priv;
	union ib_mad *mad;

	/* Hardware-reported failure */
	if ( completion->syndrome ) {
		DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
		       ipoib, completion->syndrome );
		goto out;
	}

	/* Account for the received bytes, then strip the GRH */
	iob_put ( iobuf, completion->len );
	if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
		DBGC ( ipoib, "IPoIB %p received metadata packet too short "
		       "to contain GRH\n", ipoib );
		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
		goto out;
	}
	iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );

	/* A complete MAD must follow */
	if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
		DBGC ( ipoib, "IPoIB %p received metadata packet too short "
		       "to contain reply\n", ipoib );
		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
		goto out;
	}
	mad = iobuf->data;

	/* Ignore replies that report an error status */
	if ( mad->mad_hdr.status != 0 ) {
		DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
		       ipoib, ntohs ( mad->mad_hdr.status ) );
		goto out;
	}

	/* Dispatch on the tag placed in tid[0] by the request */
	if ( mad->mad_hdr.tid[0] == IPOIB_TID_GET_PATH_REC ) {
		ipoib_recv_path_record ( ipoib, &mad->path_record );
	} else if ( mad->mad_hdr.tid[0] == IPOIB_TID_MC_MEMBER_REC ) {
		ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
	} else {
		DBGC ( ipoib, "IPoIB %p unwanted response:\n",
		       ipoib );
		DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
	}

 out:
	ipoib->meta.recv_fill--;
	free_iob ( iobuf );
}
/**
 * Refill IPoIB receive ring
 *
 * @v ipoib		IPoIB device
 * @v qset		Queue set whose receive ring is to be refilled
 *
 * Posts freshly allocated IPOIB_MTU-sized buffers until the ring
 * reaches its maximum fill level.  Stops quietly at the first
 * allocation or posting failure.
 */
static void ipoib_refill_recv ( struct ipoib_device *ipoib,
				struct ipoib_queue_set *qset ) {
	struct ib_device *ibdev = ipoib->ibdev;
	struct io_buffer *iobuf;

	for ( ; qset->recv_fill < qset->recv_max_fill ; qset->recv_fill++ ) {
		iobuf = alloc_iob ( IPOIB_MTU );
		if ( ! iobuf )
			return;
		if ( ib_post_recv ( ibdev, qset->qp, iobuf ) != 0 ) {
			/* Not queued: the buffer is still ours */
			free_iob ( iobuf );
			return;
		}
	}
}
/**
* Poll IPoIB network device
*
* @v netdev Network device
*/
static void ipoib_poll ( struct net_device *netdev ) {
struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev;
ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
ipoib_meta_complete_recv );
ib_poll_cq ( ibdev, ipoib->data.cq, ipoib_data_complete_send,
ipoib_data_complete_recv );
ipoib_refill_recv ( ipoib, &ipoib->meta );
ipoib_refill_recv ( ipoib, &ipoib->data );
}
/**
* Enable/disable interrupts on IPoIB network device
*
* @v netdev Network device
* @v enable Interrupts should be enabled
*
* Deliberately does nothing: both arguments are ignored.
*/
static void ipoib_irq ( struct net_device *netdev __unused,
int enable __unused ) {
/* No implementation */
}
/**
 * Open IPoIB network device
 *
 * @v netdev		Network device
 * @ret rc		Return status code
 *
 * Attaches the data queue pair to the broadcast multicast GID and
 * fills both receive rings.
 */
static int ipoib_open ( struct net_device *netdev ) {
	struct ipoib_device *ipoib = netdev->priv;
	int rc;

	/* Attach to broadcast multicast GID */
	rc = ib_mcast_attach ( ipoib->ibdev, ipoib->data.qp,
			       &ipoib->broadcast_gid );
	if ( rc != 0 ) {
		DBG ( "Could not attach to broadcast GID: %s\n",
		      strerror ( rc ) );
		return rc;
	}

	/* Fill receive rings */
	ipoib_refill_recv ( ipoib, &ipoib->meta );
	ipoib_refill_recv ( ipoib, &ipoib->data );

	return 0;
}
/**
 * Close IPoIB network device
 *
 * @v netdev		Network device
 *
 * Detaches the data queue pair from the broadcast multicast GID.
 * Receive buffers already posted are left in place.
 */
static void ipoib_close ( struct net_device *netdev ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct ib_queue_pair *data_qp = ipoib->data.qp;

	/* Detach from broadcast multicast GID */
	ib_mcast_detach ( ipoib->ibdev, data_qp, &ipoib->broadcast_gid );

	/* FIXME: should probably flush the receive ring */
}
/** IPoIB network device operations
 *
 * Installed on the network device by ipoib_probe().
 */
static struct net_device_operations ipoib_operations = {
	/* Data path */
	.transmit	= ipoib_transmit,
	.poll		= ipoib_poll,
	.irq		= ipoib_irq,
	/* Lifecycle */
	.open		= ipoib_open,
	.close		= ipoib_close,
};
/**
 * Join IPoIB broadcast group
 *
 * @v ipoib		IPoIB device
 * @ret rc		Return status code
 *
 * Sends a join request for the broadcast GID and then polls the
 * metadata completion queue once per millisecond, for at most
 * IPOIB_JOIN_MAX_DELAY_MS iterations, until the join is recorded.
 * Returns -ETIMEDOUT if no join is seen in that time.
 */
static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
	struct ib_device *ibdev = ipoib->ibdev;
	unsigned int remaining_ms = IPOIB_JOIN_MAX_DELAY_MS;
	int rc;

	/* Make sure we have some receive descriptors */
	ipoib_refill_recv ( ipoib, &ipoib->meta );

	/* Send join request */
	rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid, 1 );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
		       ipoib, strerror ( rc ) );
		return rc;
	}

	/* Wait for join to complete.  Ideally we wouldn't delay for
	 * this long, but we need the queue key before we can set up
	 * the data queue pair, which we need before we can know the
	 * MAC address.
	 */
	while ( remaining_ms-- ) {
		mdelay ( 1 );
		ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
			     ipoib_meta_complete_recv );
		ipoib_refill_recv ( ipoib, &ipoib->meta );
		if ( ipoib->broadcast_joined )
			return 0;
	}

	DBGC ( ipoib, "IPoIB %p timed out waiting for broadcast join\n",
	       ipoib );
	return -ETIMEDOUT;
}
/**
 * Probe IPoIB device
 *
 * @v ibdev		Infiniband device
 * @ret rc		Return status code
 *
 * Allocates a network device for the Infiniband device, creates the
 * metadata queue set, joins the broadcast group (which yields the
 * data queue key), creates the data queue set, derives the
 * link-layer address from the data QPN and port GID, and registers
 * the network device.  On failure everything created so far is torn
 * down again.
 */
int ipoib_probe ( struct ib_device *ibdev ) {
	struct net_device *netdev;
	struct ipoib_device *ipoib;
	struct ipoib_mac *ll_addr;
	int rc;

	/* Allocate and initialise network device */
	netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
	if ( ! netdev )
		return -ENOMEM;
	netdev_init ( netdev, &ipoib_operations );
	ipoib = netdev->priv;
	ib_set_ownerdata ( ibdev, netdev );
	netdev->dev = ibdev->dev;
	memset ( ipoib, 0, sizeof ( *ipoib ) );
	ipoib->netdev = netdev;
	ipoib->ibdev = ibdev;

	/* Broadcast GID: template with the partition key in word 2 */
	memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
		 sizeof ( ipoib->broadcast_gid ) );
	ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );

	/* Allocate metadata queue set */
	rc = ipoib_create_qset ( ipoib, &ipoib->meta,
				 IPOIB_META_NUM_CQES,
				 IPOIB_META_NUM_SEND_WQES,
				 IPOIB_META_NUM_RECV_WQES,
				 IB_GLOBAL_QKEY );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
		       ipoib, strerror ( rc ) );
		goto err_free_netdev;
	}

	/* Join broadcast group */
	rc = ipoib_join_broadcast_group ( ipoib );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
		       ipoib, strerror ( rc ) );
		goto err_destroy_meta;
	}

	/* Allocate data queue set, keyed by the qkey learnt from the join */
	rc = ipoib_create_qset ( ipoib, &ipoib->data,
				 IPOIB_DATA_NUM_CQES,
				 IPOIB_DATA_NUM_SEND_WQES,
				 IPOIB_DATA_NUM_RECV_WQES,
				 ipoib->data_qkey );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
		       ipoib, strerror ( rc ) );
		goto err_destroy_meta;
	}

	/* Construct MAC address */
	ll_addr = ( ( struct ipoib_mac * ) netdev->ll_addr );
	ll_addr->qpn = htonl ( ipoib->data.qp->qpn );
	memcpy ( &ll_addr->gid, &ibdev->port_gid, sizeof ( ll_addr->gid ) );

	/* Register network device */
	rc = register_netdev ( netdev );
	if ( rc != 0 )
		goto err_destroy_data;

	return 0;

 err_destroy_data:
	ipoib_destroy_qset ( ipoib, &ipoib->data );
 err_destroy_meta:
	ipoib_destroy_qset ( ipoib, &ipoib->meta );
 err_free_netdev:
	netdev_nullify ( netdev );
	netdev_put ( netdev );
	return rc;
}
/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 *
 * Undoes ipoib_probe(): unregisters the network device stored as the
 * Infiniband device's owner data, destroys both queue sets and drops
 * the network device reference.
 */
void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *netdev;
	struct ipoib_device *ipoib;

	netdev = ib_get_ownerdata ( ibdev );
	ipoib = netdev->priv;

	unregister_netdev ( netdev );

	/* Tear down queue sets, data first */
	ipoib_destroy_qset ( ipoib, &ipoib->data );
	ipoib_destroy_qset ( ipoib, &ipoib->meta );

	netdev_nullify ( netdev );
	netdev_put ( netdev );
}

View File

@ -0,0 +1,578 @@
#ifndef _GPXE_INFINIBAND_H
#define _GPXE_INFINIBAND_H
/** @file
*
* Infiniband protocol
*
*/
#include <stdint.h>
#include <gpxe/device.h>
/** Subnet administrator QPN
*
* Used as the destination queue pair number for subnet
* administration requests.
*/
#define IB_SA_QPN 1
/** Broadcast QPN (all 24 bits set) */
#define IB_BROADCAST_QPN 0xffffffUL
/** Global queue key
*
* Used when addressing the subnet administrator; the IPoIB code in
* this changeset also places it in the address vector for data sends.
*/
#define IB_GLOBAL_QKEY 0x80010000UL
/** An Infiniband Global Identifier
*
* Sixteen bytes, accessible as bytes, 16-bit words or 32-bit dwords.
* The word and dword views alias the raw bytes, so their values
* depend on host byte order.
*/
struct ib_gid {
union {
uint8_t bytes[16];
uint16_t words[8];
uint32_t dwords[4];
} u;
};
/** An Infiniband Global Route Header
*
* Packed, 40 bytes.  The IPoIB receive handlers in this changeset
* strip one of these from the front of every received packet.
*/
struct ib_global_route_header {
/** IP version, traffic class, and flow label
*
* 4 bits : Version of the GRH
* 8 bits : Traffic class
* 20 bits : Flow label
*/
uint32_t ipver_tclass_flowlabel;
/** Payload length */
uint16_t paylen;
/** Next header */
uint8_t nxthdr;
/** Hop limit */
uint8_t hoplmt;
/** Source GID */
struct ib_gid sgid;
/** Destination GID */
struct ib_gid dgid;
} __attribute__ (( packed ));
/* Forward declarations for types referenced before their definition */
struct ib_device;
struct ib_queue_pair;
struct ib_completion_queue;
/** An Infiniband Work Queue
*
* Each queue pair embeds two of these (send and receive); both are
* initialised by ib_create_qp().
*/
struct ib_work_queue {
/** Containing queue pair */
struct ib_queue_pair *qp;
/** "Is a send queue" flag
*
* Set to 1 for the send queue by ib_create_qp(); left zero for
* the receive queue.
*/
int is_send;
/** Associated completion queue */
struct ib_completion_queue *cq;
/** List of work queues on this completion queue */
struct list_head list;
/** Number of work queue entries */
unsigned int num_wqes;
/** Next work queue entry index
*
* This is the index of the next entry to be filled (i.e. the
* first empty entry). This value is not bounded by num_wqes;
* users must bitwise-AND with (num_wqes-1) to generate an
* array index.
*/
unsigned long next_idx;
/** I/O buffers assigned to work queue
*
* Points into the same allocation as the queue pair itself.
*/
struct io_buffer **iobufs;
/** Device private data */
void *dev_priv;
};
/** An Infiniband Queue Pair
*
* Allocated by ib_create_qp() in a single block together with the
* I/O buffer pointer arrays of both work queues, and released by
* ib_destroy_qp().
*/
struct ib_queue_pair {
/** Queue Pair Number */
unsigned long qpn;
/** Queue key */
unsigned long qkey;
/** Send queue */
struct ib_work_queue send;
/** Receive queue */
struct ib_work_queue recv;
/** Device private data */
void *dev_priv;
/** Queue owner private data
*
* The IPoIB completion handlers read this as a pointer to the
* owning network device.
*/
void *owner_priv;
};
/** An Infiniband Completion Queue
*
* Allocated by ib_create_cq() and released by ib_destroy_cq().
*/
struct ib_completion_queue {
/** Completion queue number */
unsigned long cqn;
/** Number of completion queue entries */
unsigned int num_cqes;
/** Next completion queue entry index
*
* This is the index of the next entry to be filled (i.e. the
* first empty entry). This value is not bounded by num_cqes;
* users must bitwise-AND with (num_cqes-1) to generate an
* array index.
*/
unsigned long next_idx;
/** List of work queues completing to this queue */
struct list_head work_queues;
/** Device private data */
void *dev_priv;
};
/** An Infiniband completion
*
* Passed to the completion handlers invoked while polling a
* completion queue.
*/
struct ib_completion {
/** Syndrome
*
* If non-zero, then the completion is in error.
*/
unsigned int syndrome;
/** Length
*
* The IPoIB receive handlers use this as the number of bytes
* to add to the I/O buffer.
*/
size_t len;
};
/** An Infiniband completion handler
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v completion Completion
* @v iobuf I/O buffer
*
* Supplied to ib_poll_cq(), once for send and once for receive
* completions.  The handler takes ownership of the I/O buffer.
*/
typedef void ( * ib_completer_t ) ( struct ib_device *ibdev,
struct ib_queue_pair *qp,
struct ib_completion *completion,
struct io_buffer *iobuf );
/** An Infiniband Address Vector
*
* Describes the destination of a posted send.  Callers in this
* changeset zero the whole structure and then fill in the fields
* they need.
*/
struct ib_address_vector {
/** Destination Queue Pair */
unsigned int dest_qp;
/** Queue key */
unsigned long qkey;
/** Destination Local ID */
unsigned int dlid;
/** Rate */
unsigned int rate;
/** Service level */
unsigned int sl;
/** GID is present
*
* Non-zero if the gid field below is valid.
*/
unsigned int gid_present;
/** GID */
struct ib_gid gid;
};
/**
* Infiniband device operations
*
* These represent a subset of the Infiniband Verbs.
*
* Implemented by the hardware driver and reached through the op
* pointer of struct ib_device.
*/
struct ib_device_operations {
/** Create completion queue
*
* @v ibdev Infiniband device
* @v cq Completion queue
* @ret rc Return status code
*/
int ( * create_cq ) ( struct ib_device *ibdev,
struct ib_completion_queue *cq );
/** Destroy completion queue
*
* @v ibdev Infiniband device
* @v cq Completion queue
*/
void ( * destroy_cq ) ( struct ib_device *ibdev,
struct ib_completion_queue *cq );
/** Create queue pair
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @ret rc Return status code
*/
int ( * create_qp ) ( struct ib_device *ibdev,
struct ib_queue_pair *qp );
/** Destroy queue pair
*
* @v ibdev Infiniband device
* @v qp Queue pair
*/
void ( * destroy_qp ) ( struct ib_device *ibdev,
struct ib_queue_pair *qp );
/** Post send work queue entry
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v av Address vector
* @v iobuf I/O buffer
* @ret rc Return status code
*
* If this method returns success, the I/O buffer remains
* owned by the queue pair. If this method returns failure,
* the I/O buffer is immediately released back to the caller;
* the failure is interpreted as "failure to enqueue buffer".
* (The IPoIB callers in this changeset free the buffer
* themselves when posting fails.)
*/
int ( * post_send ) ( struct ib_device *ibdev,
struct ib_queue_pair *qp,
struct ib_address_vector *av,
struct io_buffer *iobuf );
/** Post receive work queue entry
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v iobuf I/O buffer
* @ret rc Return status code
*
* If this method returns success, the I/O buffer remains
* owned by the queue pair. If this method returns failure,
* the I/O buffer is immediately released back to the caller;
* the failure is interpreted as "failure to enqueue buffer".
* (The IPoIB callers in this changeset free the buffer
* themselves when posting fails.)
*/
int ( * post_recv ) ( struct ib_device *ibdev,
struct ib_queue_pair *qp,
struct io_buffer *iobuf );
/** Poll completion queue
*
* @v ibdev Infiniband device
* @v cq Completion queue
* @v complete_send Send completion handler
* @v complete_recv Receive completion handler
*
* The completion handler takes ownership of the I/O buffer.
*/
void ( * poll_cq ) ( struct ib_device *ibdev,
struct ib_completion_queue *cq,
ib_completer_t complete_send,
ib_completer_t complete_recv );
/** Attach to multicast group
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v gid Multicast GID
* @ret rc Return status code
*/
int ( * mcast_attach ) ( struct ib_device *ibdev,
struct ib_queue_pair *qp,
struct ib_gid *gid );
/** Detach from multicast group
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v gid Multicast GID
*/
void ( * mcast_detach ) ( struct ib_device *ibdev,
struct ib_queue_pair *qp,
struct ib_gid *gid );
};
/** An Infiniband device */
struct ib_device {
/** Port GID */
struct ib_gid port_gid;
/** Subnet manager LID
*
* Used as the destination LID for subnet administration
* requests.
*/
unsigned long sm_lid;
/** Partition key */
unsigned int pkey;
/** Underlying device */
struct device *dev;
/** Infiniband operations */
struct ib_device_operations *op;
/** Device private data */
void *dev_priv;
/** Owner private data
*
* Accessed via ib_set_ownerdata() and ib_get_ownerdata().
*/
void *owner_priv;
};
/* Completion queue creation and destruction */
extern struct ib_completion_queue * ib_create_cq ( struct ib_device *ibdev,
unsigned int num_cqes );
extern void ib_destroy_cq ( struct ib_device *ibdev,
struct ib_completion_queue *cq );
/* Queue pair creation and destruction */
extern struct ib_queue_pair *
ib_create_qp ( struct ib_device *ibdev, unsigned int num_send_wqes,
struct ib_completion_queue *send_cq, unsigned int num_recv_wqes,
struct ib_completion_queue *recv_cq, unsigned long qkey );
extern void ib_destroy_qp ( struct ib_device *ibdev,
struct ib_queue_pair *qp );
/* Look up a work queue attached to a completion queue */
extern struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq,
unsigned long qpn, int is_send );
/* Infiniband device allocation and release */
extern struct ib_device * alloc_ibdev ( size_t priv_size );
extern void free_ibdev ( struct ib_device *ibdev );
/**
* Post send work queue entry
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v av Address vector
* @v iobuf I/O buffer
* @ret rc Return status code
*/
static inline __attribute__ (( always_inline )) int
ib_post_send ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct ib_address_vector *av, struct io_buffer *iobuf ) {
return ibdev->op->post_send ( ibdev, qp, av, iobuf );
}
/**
* Post receive work queue entry
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v iobuf I/O buffer
* @ret rc Return status code
*/
static inline __attribute__ (( always_inline )) int
ib_post_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct io_buffer *iobuf ) {
return ibdev->op->post_recv ( ibdev, qp, iobuf );
}
/**
 * Poll completion queue
 *
 * @v ibdev		Infiniband device
 * @v cq		Completion queue
 * @v complete_send	Send completion handler
 * @v complete_recv	Receive completion handler
 *
 * Dispatches to the device's poll_cq operation.
 */
static inline __attribute__ (( always_inline )) void
ib_poll_cq ( struct ib_device *ibdev, struct ib_completion_queue *cq,
	     ib_completer_t complete_send, ib_completer_t complete_recv ) {
	struct ib_device_operations *ops = ibdev->op;

	ops->poll_cq ( ibdev, cq, complete_send, complete_recv );
}
/**
* Attach to multicast group
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v gid Multicast GID
* @ret rc Return status code
*/
static inline __attribute__ (( always_inline )) int
ib_mcast_attach ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct ib_gid *gid ) {
return ibdev->op->mcast_attach ( ibdev, qp, gid );
}
/**
* Detach from multicast group
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v gid Multicast GID
*/
static inline __attribute__ (( always_inline )) void
ib_mcast_detach ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct ib_gid *gid ) {
ibdev->op->mcast_detach ( ibdev, qp, gid );
}
/**
* Set Infiniband owner-private data
*
* @v ibdev Infiniband device
* @v owner_priv Owner private data
*
* Stores an opaque pointer on behalf of the device's owner; it can
* be read back with ib_get_ownerdata().
*/
static inline void ib_set_ownerdata ( struct ib_device *ibdev,
void *owner_priv ) {
ibdev->owner_priv = owner_priv;
}
/**
* Get Infiniband owner-private data
*
* @v ibdev Infiniband device
* @ret owner_priv Owner private data
*
* Returns whatever pointer was last stored with ib_set_ownerdata().
*/
static inline void * ib_get_ownerdata ( struct ib_device *ibdev ) {
return ibdev->owner_priv;
}
/*****************************************************************************
*
* Management datagrams
*
* Portions Copyright (c) 2004 Mellanox Technologies Ltd. All rights
* reserved.
*
*/
/* Management base version (placed in the MAD header base_version field) */
#define IB_MGMT_BASE_VERSION 1
/* Management classes (MAD header mgmt_class field) */
#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01
#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81
#define IB_MGMT_CLASS_SUBN_ADM 0x03
#define IB_MGMT_CLASS_PERF_MGMT 0x04
#define IB_MGMT_CLASS_BM 0x05
#define IB_MGMT_CLASS_DEVICE_MGMT 0x06
#define IB_MGMT_CLASS_CM 0x07
#define IB_MGMT_CLASS_SNMP 0x08
#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30
#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F
/* Management methods (MAD header method field) */
#define IB_MGMT_METHOD_GET 0x01
#define IB_MGMT_METHOD_SET 0x02
#define IB_MGMT_METHOD_GET_RESP 0x81
#define IB_MGMT_METHOD_SEND 0x03
#define IB_MGMT_METHOD_TRAP 0x05
#define IB_MGMT_METHOD_REPORT 0x06
#define IB_MGMT_METHOD_REPORT_RESP 0x86
#define IB_MGMT_METHOD_TRAP_REPRESS 0x07
#define IB_MGMT_METHOD_DELETE 0x15
#define IB_MGMT_METHOD_RESP 0x80
/* Subnet management attributes (MAD header attr_id field) */
#define IB_SMP_ATTR_NOTICE 0x0002
#define IB_SMP_ATTR_NODE_DESC 0x0010
#define IB_SMP_ATTR_NODE_INFO 0x0011
#define IB_SMP_ATTR_SWITCH_INFO 0x0012
#define IB_SMP_ATTR_GUID_INFO 0x0014
#define IB_SMP_ATTR_PORT_INFO 0x0015
#define IB_SMP_ATTR_PKEY_TABLE 0x0016
#define IB_SMP_ATTR_SL_TO_VL_TABLE 0x0017
#define IB_SMP_ATTR_VL_ARB_TABLE 0x0018
#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE 0x0019
#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE 0x001A
#define IB_SMP_ATTR_MCAST_FORWARD_TABLE 0x001B
#define IB_SMP_ATTR_SM_INFO 0x0020
#define IB_SMP_ATTR_VENDOR_DIAG 0x0030
#define IB_SMP_ATTR_LED_INFO 0x0031
#define IB_SMP_ATTR_VENDOR_MASK 0xFF00
/* Subnet administration attributes (MAD header attr_id field) */
#define IB_SA_ATTR_MC_MEMBER_REC 0x38
#define IB_SA_ATTR_PATH_REC 0x35
/* Multicast member record component mask bits
*
* OR-ed together and stored in the SA header comp_mask to indicate
* which record fields the request supplies.
*/
#define IB_SA_MCMEMBER_REC_MGID (1<<0)
#define IB_SA_MCMEMBER_REC_PORT_GID (1<<1)
#define IB_SA_MCMEMBER_REC_QKEY (1<<2)
#define IB_SA_MCMEMBER_REC_MLID (1<<3)
#define IB_SA_MCMEMBER_REC_MTU_SELECTOR (1<<4)
#define IB_SA_MCMEMBER_REC_MTU (1<<5)
#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS (1<<6)
#define IB_SA_MCMEMBER_REC_PKEY (1<<7)
#define IB_SA_MCMEMBER_REC_RATE_SELECTOR (1<<8)
#define IB_SA_MCMEMBER_REC_RATE (1<<9)
#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR (1<<10)
#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME (1<<11)
#define IB_SA_MCMEMBER_REC_SL (1<<12)
#define IB_SA_MCMEMBER_REC_FLOW_LABEL (1<<13)
#define IB_SA_MCMEMBER_REC_HOP_LIMIT (1<<14)
#define IB_SA_MCMEMBER_REC_SCOPE (1<<15)
#define IB_SA_MCMEMBER_REC_JOIN_STATE (1<<16)
#define IB_SA_MCMEMBER_REC_PROXY_JOIN (1<<17)
/* Path record component mask bits */
#define IB_SA_PATH_REC_DGID (1<<2)
#define IB_SA_PATH_REC_SGID (1<<3)
/** Common management datagram header (packed, 24 bytes)
*
* Forms the start of every MAD structure defined in this file.
*/
struct ib_mad_hdr {
/** Base version (IB_MGMT_BASE_VERSION) */
uint8_t base_version;
/** Management class (IB_MGMT_CLASS_xxx) */
uint8_t mgmt_class;
/** Class version */
uint8_t class_version;
/** Method (IB_MGMT_METHOD_xxx) */
uint8_t method;
/** Status; the IPoIB receive path treats non-zero as an error */
uint16_t status;
/** Class-specific field */
uint16_t class_specific;
/** Transaction identifier
*
* The IPoIB code stores a request-type tag in tid[0] and a
* sequence number in tid[1], and dispatches replies on tid[0].
*/
uint32_t tid[2];
/** Attribute identifier (stored in network byte order) */
uint16_t attr_id;
/** Reserved */
uint16_t resv;
/** Attribute modifier */
uint32_t attr_mod;
} __attribute__ (( packed ));
/** Subnet administration header (packed, 20 bytes) */
struct ib_sa_hdr {
/** SM key */
uint32_t sm_key[2];
/** Reserved */
uint16_t reserved;
/** Attribute offset */
uint16_t attrib_offset;
/** Component mask
*
* Two 32-bit halves; the multicast join request stores its
* IB_SA_MCMEMBER_REC_xxx bits in comp_mask[1] in network byte
* order.
*/
uint32_t comp_mask[2];
} __attribute__ (( packed ));
/** RMPP header (packed, 12 bytes)
*
* Treated as opaque here: no individual fields are broken out.
*/
struct ib_rmpp_hdr {
uint32_t raw[3];
} __attribute__ (( packed ));
/** Generic MAD: common header plus 232 bytes of raw payload
*
* Packed, 256 bytes in total.
*/
struct ib_mad_data {
struct ib_mad_hdr mad_hdr;
uint8_t data[232];
} __attribute__ (( packed ));
/** GUID information MAD (packed) */
struct ib_mad_guid_info {
struct ib_mad_hdr mad_hdr;
/** Management key */
uint32_t mkey[2];
uint32_t reserved[8];
/** Local GID bytes (8 bytes) */
uint8_t gid_local[8];
} __attribute__ (( packed ));
/** Port information MAD (packed)
*
* Field names joined by a double underscore denote several
* sub-fields packed into a single byte.
*/
struct ib_mad_port_info {
struct ib_mad_hdr mad_hdr;
/** Management key */
uint32_t mkey[2];
uint32_t reserved[8];
uint32_t mkey2[2];
/** GID prefix (8 bytes) */
uint8_t gid_prefix[8];
/** Port LID */
uint16_t lid;
/** Master subnet manager LID */
uint16_t mastersm_lid;
/** Capability mask */
uint32_t cap_mask;
uint16_t diag_code;
uint16_t mkey_lease_period;
uint8_t local_port_num;
uint8_t link_width_enabled;
uint8_t link_width_supported;
uint8_t link_width_active;
uint8_t port_state__link_speed_supported;
uint8_t link_down_def_state__port_phys_state;
uint8_t lmc__r1__mkey_prot_bits;
uint8_t link_speed_enabled__link_speed_active;
} __attribute__ (( packed ));
/** Partition key table MAD (packed) */
struct ib_mad_pkey_table {
struct ib_mad_hdr mad_hdr;
/** Management key */
uint32_t mkey[2];
uint32_t reserved[8];
/** Partition keys, stored as 16 pairs of 16-bit values */
uint16_t pkey[16][2];
} __attribute__ (( packed ));
/** Path record MAD (packed, 256 bytes)
*
* Field names joined by a double underscore denote several
* sub-fields packed into one integer.
*/
struct ib_mad_path_record {
struct ib_mad_hdr mad_hdr;
struct ib_rmpp_hdr rmpp_hdr;
struct ib_sa_hdr sa_hdr;
uint32_t reserved0[2];
/** Destination GID */
struct ib_gid dgid;
/** Source GID */
struct ib_gid sgid;
/** Destination LID (network byte order) */
uint16_t dlid;
/** Source LID */
uint16_t slid;
uint32_t hop_limit__flow_label__raw_traffic;
uint32_t pkey__numb_path__reversible__tclass;
uint8_t reserved1;
/** Service level in the low 4 bits (as read by the IPoIB code) */
uint8_t reserved__sl;
uint8_t mtu_selector__mtu;
/** Rate in the low 6 bits (as read by the IPoIB code) */
uint8_t rate_selector__rate;
uint32_t preference__packet_lifetime__packet_lifetime_selector;
/** Padding up to the full MAD size */
uint32_t reserved2[35];
} __attribute__ (( packed ));
/** Multicast member record MAD (packed, 256 bytes)
*
* Sent by the IPoIB code to join or leave a multicast group, and
* parsed from the reply.  Field names joined by a double underscore
* denote several sub-fields packed into one integer.
*/
struct ib_mad_mc_member_record {
struct ib_mad_hdr mad_hdr;
struct ib_rmpp_hdr rmpp_hdr;
struct ib_sa_hdr sa_hdr;
/** Multicast GID */
struct ib_gid mgid;
/** Port GID of the joining port */
struct ib_gid port_gid;
/** Queue key (network byte order) */
uint32_t qkey;
/** Multicast LID (network byte order) */
uint16_t mlid;
uint8_t mtu_selector__mtu;
uint8_t tclass;
/** Partition key */
uint16_t pkey;
uint8_t rate_selector__rate;
uint8_t packet_lifetime_selector__packet_lifetime;
uint32_t sl__flow_label__hop_limit;
/** Join state in the low 4 bits (as read by the IPoIB code) */
uint8_t scope__join_state;
uint8_t proxy_join__reserved;
uint16_t reserved0;
/** Padding up to the full MAD size */
uint32_t reserved1[37];
} __attribute__ (( packed ));
/** A management datagram, viewed as any of the supported layouts
*
* Every member begins with struct ib_mad_hdr, so mad_hdr can be
* inspected before choosing a specific view.  The IPoIB receive path
* requires a received packet to hold at least sizeof this union.
*/
union ib_mad {
struct ib_mad_hdr mad_hdr;
struct ib_mad_data data;
struct ib_mad_guid_info guid_info;
struct ib_mad_port_info port_info;
struct ib_mad_pkey_table pkey_table;
struct ib_mad_path_record path_record;
struct ib_mad_mc_member_record mc_member_record;
} __attribute__ (( packed ));
#endif /* _GPXE_INFINIBAND_H */

78
src/include/gpxe/ipoib.h Normal file
View File

@ -0,0 +1,78 @@
#ifndef _GPXE_IPOIB_H
#define _GPXE_IPOIB_H
/** @file
*
* IP over Infiniband
*/
#include <gpxe/infiniband.h>
/** IPoIB MAC address length
*
* Matches the packed size of struct ipoib_mac (4-byte QPN plus
* 16-byte GID).
*/
#define IPOIB_ALEN 20
/** An IPoIB MAC address */
struct ipoib_mac {
/** Queue pair number
*
* MSB must be zero; QPNs are only 24-bit.
* Stored in network byte order by the IPoIB driver.
*/
uint32_t qpn;
/** Port GID */
struct ib_gid gid;
} __attribute__ (( packed ));
/** IPoIB link-layer header length
*
* Matches the packed size of struct ipoib_hdr (20-byte pseudo
* portion plus 4-byte real portion).
*/
#define IPOIB_HLEN 24
/**
* IPoIB link-layer header pseudo portion
*
* This part doesn't actually exist on the wire, but it provides a
* convenient way to fit into the typical network device model.
* The IPoIB transmit path reads the peer address from it and then
* strips it before posting the packet.
*/
struct ipoib_pseudo_hdr {
/** Peer address */
struct ipoib_mac peer;
} __attribute__ (( packed ));
/** IPoIB link-layer header real portion
*
* Packed, 4 bytes.  Unlike the pseudo portion, this is the part of
* the header that is left in the buffer posted for transmission.
*/
struct ipoib_real_hdr {
/** Network-layer protocol */
uint16_t proto;
/** Reserved, must be zero */
uint16_t reserved;
} __attribute__ (( packed ));
/** An IPoIB link-layer header
*
* The pseudo portion followed immediately by the real portion
* (packed, IPOIB_HLEN bytes).
*/
struct ipoib_hdr {
/** Pseudo portion */
struct ipoib_pseudo_hdr pseudo;
/** Real portion */
struct ipoib_real_hdr real;
} __attribute__ (( packed ));
/** IPoIB link-layer protocol, installed by alloc_ipoibdev() */
extern struct ll_protocol ipoib_protocol;
/** Convert an IPoIB link-layer address to a string */
extern const char * ipoib_ntoa ( const void *ll_addr );
/**
 * Allocate IPoIB device
 *
 * @v priv_size		Size of driver private data
 * @ret netdev		Network device, or NULL
 *
 * Allocates a network device and marks it as using the IPoIB
 * link-layer protocol.
 */
static inline struct net_device * alloc_ipoibdev ( size_t priv_size ) {
	struct net_device *netdev = alloc_netdev ( priv_size );

	/* Allocation failed: pass the null pointer straight back */
	if ( ! netdev )
		return netdev;

	netdev->ll_protocol = &ipoib_protocol;
	return netdev;
}
/* Create / tear down the IPoIB network device for an Infiniband device */
extern int ipoib_probe ( struct ib_device *ibdev );
extern void ipoib_remove ( struct ib_device *ibdev );
#endif /* _GPXE_IPOIB_H */

View File

@ -275,7 +275,8 @@ struct tcp_options {
* actually use 65536, we use a window size of (65536-4) to ensure
* that payloads remain dword-aligned.
*/
#define TCP_MAX_WINDOW_SIZE ( 65536 - 4 )
//#define TCP_MAX_WINDOW_SIZE ( 65536 - 4 )
#define TCP_MAX_WINDOW_SIZE 4096
/**
* Path MTU

210
src/net/infiniband.c Normal file
View File

@ -0,0 +1,210 @@
/*
* Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include <assert.h>
#include <gpxe/list.h>
#include <gpxe/if_arp.h>
#include <gpxe/netdevice.h>
#include <gpxe/iobuf.h>
#include <gpxe/infiniband.h>
/** @file
*
* Infiniband protocol
*
*/
/**
 * Create completion queue
 *
 * @v ibdev		Infiniband device
 * @v num_cqes		Number of completion queue entries
 * @ret cq		New completion queue, or NULL on failure
 *
 * Allocates a zeroed completion queue structure with an empty work
 * queue list, then asks the device to create the hardware queue.
 */
struct ib_completion_queue * ib_create_cq ( struct ib_device *ibdev,
					    unsigned int num_cqes ) {
	struct ib_completion_queue *cq;
	int rc;

	DBGC ( ibdev, "IBDEV %p creating completion queue\n", ibdev );

	/* Allocate and initialise data structure */
	cq = zalloc ( sizeof ( *cq ) );
	if ( ! cq )
		return NULL;
	INIT_LIST_HEAD ( &cq->work_queues );
	cq->num_cqes = num_cqes;

	/* Perform device-specific initialisation and get CQN */
	rc = ibdev->op->create_cq ( ibdev, cq );
	if ( rc == 0 ) {
		DBGC ( ibdev, "IBDEV %p created %d-entry completion queue "
		       "%p (%p) with CQN %#lx\n", ibdev, num_cqes, cq,
		       cq->dev_priv, cq->cqn );
		return cq;
	}

	/* Device refused: discard our half-built structure */
	DBGC ( ibdev, "IBDEV %p could not initialise completion "
	       "queue: %s\n", ibdev, strerror ( rc ) );
	free ( cq );
	return NULL;
}
/**
* Destroy completion queue
*
* @v ibdev Infiniband device
* @v cq Completion queue
*/
void ib_destroy_cq ( struct ib_device *ibdev,
struct ib_completion_queue *cq ) {
DBGC ( ibdev, "IBDEV %p destroying completion queue %#lx\n",
ibdev, cq->cqn );
assert ( list_empty ( &cq->work_queues ) );
ibdev->op->destroy_cq ( ibdev, cq );
free ( cq );
}
/**
 * Create queue pair
 *
 * @v ibdev		Infiniband device
 * @v num_send_wqes	Number of send work queue entries
 * @v send_cq		Send completion queue
 * @v num_recv_wqes	Number of receive work queue entries
 * @v recv_cq		Receive completion queue
 * @v qkey		Queue key
 * @ret qp		Queue pair, or NULL on failure
 *
 * The queue pair and the I/O buffer pointer arrays for both work
 * queues are carved out of a single zeroed allocation.  Both work
 * queues are linked onto their completion queues before the device
 * is asked to create the hardware queue pair; if that fails they are
 * unlinked again before the allocation is freed.
 */
struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
				      unsigned int num_send_wqes,
				      struct ib_completion_queue *send_cq,
				      unsigned int num_recv_wqes,
				      struct ib_completion_queue *recv_cq,
				      unsigned long qkey ) {
	struct ib_queue_pair *qp;
	size_t total_size;
	int rc;

	DBGC ( ibdev, "IBDEV %p creating queue pair\n", ibdev );

	/* Allocate and initialise data structure */
	total_size = ( sizeof ( *qp ) +
		       ( num_send_wqes * sizeof ( qp->send.iobufs[0] ) ) +
		       ( num_recv_wqes * sizeof ( qp->recv.iobufs[0] ) ) );
	qp = zalloc ( total_size );
	if ( ! qp )
		return NULL;
	qp->qkey = qkey;
	qp->send.qp = qp;
	qp->send.is_send = 1;
	qp->send.cq = send_cq;
	list_add ( &qp->send.list, &send_cq->work_queues );
	qp->send.num_wqes = num_send_wqes;
	/* Send buffer pointers follow the structure itself... */
	qp->send.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) );
	qp->recv.qp = qp;
	qp->recv.cq = recv_cq;
	list_add ( &qp->recv.list, &recv_cq->work_queues );
	qp->recv.num_wqes = num_recv_wqes;
	/* ...and receive buffer pointers follow the send ones */
	qp->recv.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) +
			    ( num_send_wqes * sizeof ( qp->send.iobufs[0] ) ));

	/* Perform device-specific initialisation and get QPN */
	if ( ( rc = ibdev->op->create_qp ( ibdev, qp ) ) != 0 ) {
		DBGC ( ibdev, "IBDEV %p could not initialise queue pair: "
		       "%s\n", ibdev, strerror ( rc ) );
		/* Unlink both work queues from their completion
		 * queues before freeing, so that the completion
		 * queues are not left pointing at freed memory.
		 */
		list_del ( &qp->send.list );
		list_del ( &qp->recv.list );
		free ( qp );
		return NULL;
	}

	DBGC ( ibdev, "IBDEV %p created queue pair %p (%p) with QPN %#lx\n",
	       ibdev, qp, qp->dev_priv, qp->qpn );
	DBGC ( ibdev, "IBDEV %p QPN %#lx has %d send entries at [%p,%p)\n",
	       ibdev, qp->qpn, num_send_wqes, qp->send.iobufs,
	       qp->recv.iobufs );
	DBGC ( ibdev, "IBDEV %p QPN %#lx has %d receive entries at [%p,%p)\n",
	       ibdev, qp->qpn, num_recv_wqes, qp->recv.iobufs,
	       ( ( ( void * ) qp ) + total_size ) );
	return qp;
}
/**
 * Tear down a queue pair and release its memory
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 *
 * The device's destroy_qp() operation runs first.  Afterwards the
 * send and receive work queues are removed from the lists they were
 * linked into, and the queue pair allocation is freed.
 */
void ib_destroy_qp ( struct ib_device *ibdev,
		     struct ib_queue_pair *qp ) {

	DBGC ( ibdev, "IBDEV %p destroying queue pair %#lx\n",
	       ibdev, qp->qpn );

	/* Device-specific teardown */
	ibdev->op->destroy_qp ( ibdev, qp );

	/* Detach both work queues from their lists */
	list_del ( &qp->send.list );
	list_del ( &qp->recv.list );

	/* Release the queue pair itself */
	free ( qp );
}
/**
 * Look up a work queue attached to a completion queue
 *
 * @v cq		Completion queue
 * @v qpn		Queue pair number
 * @v is_send		Find send work queue (rather than receive)
 * @ret wq		Work queue, or NULL if not found
 *
 * The completion queue's list of work queues is walked and the first
 * entry is returned whose owning queue pair has the requested number
 * and whose is_send flag equals the is_send argument.
 */
struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq,
				    unsigned long qpn, int is_send ) {
	struct ib_work_queue *candidate;

	list_for_each_entry ( candidate, &cq->work_queues, list ) {
		/* Wrong queue pair: keep looking */
		if ( candidate->qp->qpn != qpn )
			continue;
		/* Right queue pair, wrong direction: keep looking */
		if ( candidate->is_send != is_send )
			continue;
		return candidate;
	}

	/* Nothing matched */
	return NULL;
}
/**
 * Allocate an Infiniband device together with its private data area
 *
 * @v priv_size		Size of private data area
 * @ret ibdev		Infiniband device, or NULL
 *
 * A single zeroed block holds the device structure followed
 * immediately by priv_size bytes of private data; dev_priv is set to
 * point at the start of that private area.
 */
struct ib_device * alloc_ibdev ( size_t priv_size ) {
	struct ib_device *ibdev;

	/* One allocation covers both the structure and the private area */
	ibdev = zalloc ( sizeof ( *ibdev ) + priv_size );
	if ( ibdev == NULL )
		return NULL;

	/* Private area begins directly after the structure */
	ibdev->dev_priv = ( ( ( void * ) ibdev ) + sizeof ( *ibdev ) );
	return ibdev;
}
/**
 * Free Infiniband device
 *
 * @v ibdev		Infiniband device
 *
 * Releases the block obtained from alloc_ibdev().  Because
 * alloc_ibdev() places the private data area in the same allocation
 * as the device structure, this single free() releases both.
 */
void free_ibdev ( struct ib_device *ibdev ) {
free ( ibdev );
}