first commit

xue_mc 2025-04-16 18:41:44 +08:00
commit c7aa5a519b
14 changed files with 51026 additions and 0 deletions

pure_cms_rdma/bf_drivers.log (new file, 5071 lines): diff suppressed because it is too large.

pure_cms_rdma/bf_drivers.log.0 (new file, 44823 lines): diff suppressed because it is too large.

@@ -0,0 +1,4 @@
# Enable the bf-sde environment
. ../../bf-sde-9.2.0/set_sde.bash
# Compile the Translator code
./../../bf-sde-9.2.0/p4_build.sh pure_cms_rdma.p4

@@ -0,0 +1 @@
2024-07-19 01:15:40.048036 DigProc: Bootstrap complete

@@ -0,0 +1,760 @@
#include <core.p4>
#include <tna.p4>
#define ETHERTYPE_IPV4 0x0800
#define ETHERTYPE_ROCE 0x8915
#define IPv4_PROTO_TCP 0x06
#define IPv4_PROTO_UDP 0x11
#define CMS_RDMA_PAYLOAD_SIZE 8
#ifndef MAX_SUPPORTED_QPS
#define MAX_SUPPORTED_QPS 256 // Maximum number of supported QPs. Specifies table and register sizes
// #define MAX_SUPPORTED_QPS 65536 // Used when benchmarking tons of QPs
#endif
typedef bit<32> ipv4_address_t;
typedef bit<32> iCRC_t;
typedef bit<32> remote_key_t;
typedef bit<24> queue_pair_t;
typedef bit<24> psn_t; // Packet sequence number (PSN) in RoCEv2
typedef bit<16> qp_reg_index_t; // Index into the register that stores each QP's PSN
typedef bit<32> slot_nums_t;
typedef bit<32> memory_slot_t; // Memory slot. Shared by the Key-Write and Append primitives; for some reason both are capped at 32 bits
typedef bit<64> memory_address_t; // Physical memory address (2^64 address space)
// Packet types (Normal and Mirror), carried in the bridged header
typedef bit<8> pkt_type_t;
const pkt_type_t PKT_TYPE_NORMAL = 1;
const pkt_type_t PKT_TYPE_MIRROR = 2;
// Mirror packet types (I2E and E2E)
typedef bit<3> mirror_type_t;
const mirror_type_t MIRROR_TYPE_I2E = 1;
const mirror_type_t MIRROR_TYPE_E2E = 2;
// 14 Bytes
header ethernet_h{
bit<48> dstAddr;
bit<48> srcAddr;
bit<16> etherType;
}
// 20 Bytes
header ipv4_h{
bit<4> version;
bit<4> ihl;
bit<6> dscp;
bit<2> ecn;
bit<16> totalLen;
bit<16> identification;
bit<3> flags;
bit<13> fragOffset;
bit<8> ttl;
bit<8> protocol;
bit<16> hdrChecksum;
ipv4_address_t srcAddr;
ipv4_address_t dstAddr;
}
// 20 Bytes
header tcp_h {
bit<16> srcPort;
bit<16> dstPort;
bit<32> seq_no;
bit<32> ack_no;
bit<4> data_offset;
bit<4> res;
bit<8> flags;
bit<16> window;
bit<16> checksum;
bit<16> urgent_ptr;
}
// 8 Bytes
header udp_h{
bit<16> srcPort;
bit<16> dstPort;
bit<16> totalLen;
bit<16> checksum;
}
// Global Route Header (GRH) (40 Bytes)
header infiniband_grh_h{
bit<4> version;
bit<8> class;
bit<20> flow_lab;
bit<16> pay_len;
bit<8> next_hdr;
bit<8> hop_lim;
bit<128> src_gid;
bit<128> dst_gid;
}
// Base Transport Header (BTH) (12 Bytes)
header infiniband_bth_h{
bit<8> opcode;
bit<1> solicitedEvent;
bit<1> migReq;
bit<2> padCount;
bit<4> transportHeaderVersion;
bit<16> partitionKey;
bit<1> fRes;
bit<1> bRes;
bit<6> reserved1;
bit<24> destinationQP;
bit<1> ackRequest;
bit<7> reserved2;
psn_t packetSequenceNumber;
}
// Atomic Extended Transport Header (ATOMIC_ETH) (28 bytes)
header infiniband_atomiceth_h{
memory_address_t virtualAddress;
bit<32> rKey;
bit<64> data;
bit<64> compare;
}
// iCRC field (4 Bytes)
header infiniband_icrc_h{
bit<32> iCRC;
}
header mirror_h{
pkt_type_t pkt_type;
}
header mirror_bridged_metadata_h{
pkt_type_t pkt_type;
}
struct headers{
mirror_bridged_metadata_h bridged_md;
/* Normal Header */
ethernet_h ethernet;
ipv4_h ipv4;
udp_h udp;
tcp_h tcp;
/* RoCEv2 Header */
infiniband_grh_h grh;
infiniband_bth_h bth;
infiniband_atomiceth_h atomic_eth;
infiniband_icrc_h icrc;
}
struct ingress_metadata_t{
pkt_type_t pkt_type;
MirrorId_t mirror_session;
}
struct egress_metadata_t{
/* Store Flowkey */
ipv4_address_t srcIP;
ipv4_address_t dstIP;
bit<16> srcPort;
bit<16> dstPort;
bit<8> proto;
/* RDMA Metadata */
psn_t rdma_psn;
remote_key_t remote_key;
queue_pair_t queue_pair;
/* Used to locate where to store in the rdma memory */
memory_address_t memory_address_start;
memory_address_t memory_address_offset;
/* Slot is used as an intermediary for calculating rdma memory address */
    memory_slot_t collector_dst_slot;
memory_slot_t rank_num_slots;
memory_slot_t rank_slot_offset;
memory_slot_t rank_start_slot;
qp_reg_index_t qp_reg_index;
bit<8> multicast_pkt_num;
}
/* Ingress parser: reviewed, no issues */
parser TofinoIngressParser(packet_in pkt,
/* User */
inout ingress_metadata_t ig_md,
/* Intrinsic */
out ingress_intrinsic_metadata_t ig_intr_md)
{
state start{
pkt.extract(ig_intr_md);
transition select(ig_intr_md.resubmit_flag){
1 : parse_resubmit;
0 : parse_port_metadata;
}
}
state parse_resubmit{
transition reject;
}
state parse_port_metadata{
pkt.advance(64); // Tofino 1
transition accept;
}
}
parser SwitchIngressParser(packet_in pkt,
/* User */
out headers hdr,
out ingress_metadata_t ig_md,
/* Intrinsic */
out ingress_intrinsic_metadata_t ig_intr_md)
{
TofinoIngressParser() tofino_parser;
state start{
tofino_parser.apply(pkt, ig_md, ig_intr_md);
transition parse_ethernet;
}
state parse_ethernet{
pkt.extract(hdr.ethernet);
transition select(hdr.ethernet.etherType){
ETHERTYPE_IPV4: parse_ipv4;
ETHERTYPE_ROCE: parse_grh;
default: accept;
}
}
state parse_ipv4{
pkt.extract(hdr.ipv4);
transition select(hdr.ipv4.protocol){
IPv4_PROTO_UDP: parse_udp;
IPv4_PROTO_TCP: parse_tcp;
default: accept;
}
}
state parse_tcp{
pkt.extract(hdr.tcp);
transition accept;
}
state parse_udp{
pkt.extract(hdr.udp);
transition accept;
}
state parse_grh{
pkt.extract(hdr.grh);
transition accept;
}
}
/* Ingress control block: reviewed, no issues */
control SwitchIngress(inout headers hdr,
/* User */
inout ingress_metadata_t ig_md,
/* Intrinsic */
in ingress_intrinsic_metadata_t ig_intr_md,
in ingress_intrinsic_metadata_from_parser_t ig_intr_prsr_md,
inout ingress_intrinsic_metadata_for_deparser_t ig_intr_dprsr_md,
inout ingress_intrinsic_metadata_for_tm_t ig_intr_tm_md)
{
    /* Set the multicast group ID from the destination Ethernet address, so the packet is multicast */
action prep_multiwrite(bit<16> mcast_grp)
{
ig_intr_tm_md.mcast_grp_a = mcast_grp;
}
table tbl_prep_multicast
{
key = {
hdr.ethernet.dstAddr: exact;
}
actions = {
prep_multiwrite;
@defaultonly NoAction;
}
default_action = NoAction;
size = 1024;
}
    /* Based on the destination Ethernet address, either forward the packet to the matching egress port or drop it */
action forward(PortId_t port)
{
ig_intr_tm_md.ucast_egress_port = port;
}
action to_cpu()
{
ig_intr_tm_md.ucast_egress_port = 66;
}
action drop()
{
ig_intr_dprsr_md.drop_ctl = 1;
}
table tbl_forward
{
key = {
hdr.ethernet.dstAddr: exact;
}
actions = {
forward;
to_cpu;
drop;
}
default_action = to_cpu;
size = 1024;
}
apply
{
tbl_forward.apply();
tbl_prep_multicast.apply();
        // Prepare the bridged metadata for the egress control
hdr.bridged_md.setValid();
hdr.bridged_md.pkt_type = PKT_TYPE_NORMAL;
}
}
/* Ingress deparser: reviewed, no issues */
control SwitchIngressDeparser(packet_out pkt,
inout headers hdr,
in ingress_metadata_t ig_md,
in ingress_intrinsic_metadata_for_deparser_t ig_intr_dprsr_md)
{
Mirror() mirror;
apply{
        // If this is an Ingress-to-Egress mirror operation
        if (ig_intr_dprsr_md.mirror_type == MIRROR_TYPE_I2E){
            // Emit the mirror and prepend the mirror_h header
mirror.emit<mirror_h>(ig_md.mirror_session, {ig_md.pkt_type});
}
pkt.emit(hdr);
}
}
/* Egress parser: reviewed, no issues */
parser SwitchEgressParser(packet_in pkt,
/* User */
out headers hdr,
out egress_metadata_t eg_md,
/* Intrinsic */
out egress_intrinsic_metadata_t eg_intr_md)
{
state start{
pkt.extract(eg_intr_md);
transition parse_metadata;
}
state parse_metadata{
mirror_h mirror_md = pkt.lookahead<mirror_h>();
        // Choose the next parse state from the pkt_type field of the mirror metadata
transition select(mirror_md.pkt_type){
PKT_TYPE_MIRROR: parse_mirror_md;
PKT_TYPE_NORMAL: parse_bridged_md;
default: accept;
}
}
    // Extract the bridged metadata
state parse_bridged_md{
pkt.extract(hdr.bridged_md);
transition parse_ethernet;
}
    // A mirrored packet (a telemetry report in this design): extract its mirror metadata
state parse_mirror_md{
mirror_h mirror_md;
pkt.extract(mirror_md);
transition parse_ethernet;
}
state parse_ethernet{
pkt.extract(hdr.ethernet);
transition select(hdr.ethernet.etherType){
ETHERTYPE_IPV4: parse_ipv4;
ETHERTYPE_ROCE: parse_grh;
default: accept;
}
}
state parse_ipv4{
pkt.extract(hdr.ipv4);
transition select(hdr.ipv4.protocol){
IPv4_PROTO_UDP: parse_udp;
IPv4_PROTO_TCP: parse_tcp;
default: accept;
}
}
state parse_udp{
pkt.extract(hdr.udp);
transition accept;
}
state parse_tcp{
pkt.extract(hdr.tcp);
transition accept;
}
state parse_grh{
pkt.extract(hdr.grh);
transition accept;
}
}
/* KeyWrite memory-address preparation control block: reviewed, no issues */
control ControlPrepareMemoryAddress(inout headers hdr,
inout egress_metadata_t eg_md,
in egress_intrinsic_metadata_t eg_intr_md)
{
Hash<slot_nums_t>(HashAlgorithm_t.CRC32) hash_slot;
    // Distinguishes the packet copies produced by multicast (every arriving packet increments the register value, cyclically)
Register<bit<8>, bit<1>>(MAX_SUPPORTED_QPS) reg_multicast_iterator;
RegisterAction<bit<8>, bit<1>, bit<8>>(reg_multicast_iterator) get_pkt_number = {
void apply(inout bit<8> stored, out bit<8> output)
{
            // First output the stored value
            output = stored;
            // Once the stored value reaches hash_nums - 1 (3 here), wrap back to 0
            if(stored >= 3){
                stored = 0;
            }
            // Otherwise, increment it
            else{
                stored = stored + 1;
}
}
};
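    // Trace of the action above: successive executions yield 0, 1, 2, 3, 0, 1, ...
    // so consecutive multicast copies receive distinct copy numbers.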
    // Use the packet's multicast copy number to find the starting slot of its CMS row
    action get_start_slot(memory_slot_t start_slot)
    {
        // Starting slot position (the head of each CMS row)
        eg_md.rank_start_slot = start_slot;
}
table tbl_get_start_slot{
key = {
eg_md.multicast_pkt_num: exact;
}
actions = {
get_start_slot;
NoAction;
}
size = 8;
default_action = NoAction();
}
    // Look up the Collector's RDMA metadata by destination Ethernet address
action set_server_info(remote_key_t remote_key, queue_pair_t queue_pair, memory_address_t memory_address_start, memory_slot_t rank_num_slots, qp_reg_index_t qp_reg_index)
{
eg_md.remote_key = remote_key;
eg_md.queue_pair = queue_pair;
eg_md.memory_address_start = memory_address_start;
eg_md.rank_num_slots = rank_num_slots;
eg_md.qp_reg_index = qp_reg_index;
}
table tbl_getRDMAMetadata
{
key = {
hdr.ethernet.dstAddr: exact;
}
actions = {
set_server_info;
}
        // A single Translator could never be responsible for more QPs than this
size = MAX_SUPPORTED_QPS;
}
    // Compute the slot offset with a hash function (the result is bit<32>)
action cal_slot_offset()
{
eg_md.rank_slot_offset = hash_slot.get({eg_md.srcIP,
eg_md.dstIP,
eg_md.srcPort,
eg_md.dstPort,
eg_md.proto,
eg_md.multicast_pkt_num});
}
table tbl_cal_slot_offset
{
key = {}
actions = {
cal_slot_offset;
}
size = 1;
default_action = cal_slot_offset();
}
    // Bound the offset to the number of available slots (bitwise AND with a mask)
action bound_memory_slot(memory_slot_t mask)
{
eg_md.rank_slot_offset = eg_md.rank_slot_offset & mask;
}
table tbl_bound_memory_slot
{
key = {
eg_md.rank_num_slots: exact;
}
actions = {
bound_memory_slot;
}
const entries = {
2: bound_memory_slot(0x00000001);
4: bound_memory_slot(0x00000003);
8: bound_memory_slot(0x00000007);
16: bound_memory_slot(0x0000000f);
32: bound_memory_slot(0x0000001f);
64: bound_memory_slot(0x0000003f);
128: bound_memory_slot(0x0000007f);
256: bound_memory_slot(0x000000ff);
512: bound_memory_slot(0x000001ff);
1024: bound_memory_slot(0x000003ff);
2048: bound_memory_slot(0x000007ff);
4096: bound_memory_slot(0x00000fff);
8192: bound_memory_slot(0x00001fff);
16384: bound_memory_slot(0x00003fff);
32768: bound_memory_slot(0x00007fff);
65536: bound_memory_slot(0x0000ffff);
131072: bound_memory_slot(0x0001ffff);
262144: bound_memory_slot(0x0003ffff);
524288: bound_memory_slot(0x0007ffff);
1048576: bound_memory_slot(0x000fffff);
2097152: bound_memory_slot(0x001fffff);
4194304: bound_memory_slot(0x003fffff);
8388608: bound_memory_slot(0x007fffff);
16777216: bound_memory_slot(0x00ffffff);
33554432: bound_memory_slot(0x01ffffff);
67108864: bound_memory_slot(0x03ffffff);
134217728: bound_memory_slot(0x07ffffff);
268435456: bound_memory_slot(0x0fffffff);
536870912: bound_memory_slot(0x1fffffff);
1073741824: bound_memory_slot(0x3fffffff);
2147483648: bound_memory_slot(0x7fffffff);
//4294967296: bound_memory_slot(0xffffffff); //does not fit in 32-bit
}
size=64;
}
apply
{
        // Get this packet's multicast copy number
        eg_md.multicast_pkt_num = get_pkt_number.execute(0);
        // Fetch the RDMA metadata
        tbl_getRDMAMetadata.apply();
        @stage(1)
        {
            // Compute the starting slot and the slot offset, then combine them into the destination slot
            tbl_get_start_slot.apply();
            tbl_cal_slot_offset.apply();
            tbl_bound_memory_slot.apply();
            eg_md.collector_dst_slot = eg_md.rank_start_slot + eg_md.rank_slot_offset;
            // Convert the memory slot into an offset within physical memory,
            // i.e. multiply by the payload size in bytes (8 here, a left shift by 3)
            eg_md.memory_address_offset = (memory_address_t)(eg_md.collector_dst_slot);
            eg_md.memory_address_offset = eg_md.memory_address_offset * CMS_RDMA_PAYLOAD_SIZE;
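            // Continuing the example above: slot 708 becomes byte offset
            // 708 * 8 = 5664, later added to memory_address_start when the
            // ATOMIC_ETH header is built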
}
}
}
/* RDMA packet generation control block: reviewed, no issues */
control ControlConvertToRDMA(inout headers hdr,
inout egress_metadata_t eg_md)
{
    // Allocate a 32-bit register array to hold the 24-bit PSNs
Register<bit<32>, qp_reg_index_t>(MAX_SUPPORTED_QPS) reg_rdma_sequence_number;
RegisterAction<psn_t, qp_reg_index_t, psn_t>(reg_rdma_sequence_number) get_psn = {
void apply(inout psn_t stored_psn, out psn_t output)
{
            // First output the PSN before it is incremented
            output = stored_psn;
            // Then increment the PSN, overwriting the stored value
stored_psn = stored_psn + 1;
}
};
RegisterAction<psn_t, qp_reg_index_t, psn_t>(reg_rdma_sequence_number) set_psn = {
void apply(inout psn_t stored_psn, out psn_t output)
{
            // Resynchronize the PSN to the value obtained from an ACK
stored_psn = eg_md.rdma_psn;
output = stored_psn;
}
};
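    // Note: psn_t is 24 bits wide, so the increment in get_psn wraps modulo
    // 2^24, matching RoCEv2 PSN arithmetic.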
action setEthernet()
{
hdr.ethernet.setValid();
hdr.ethernet.srcAddr = 0x08c0eb24686b; // Generator
hdr.ethernet.dstAddr = 0x08c0eb247b8b; // Collector
hdr.ethernet.etherType = ETHERTYPE_ROCE;
}
action setInfiniband_GRH()
{
hdr.grh.setValid();
hdr.grh.version = 6;
hdr.grh.class = 2;
hdr.grh.flow_lab = 0;
hdr.grh.pay_len = 44;
hdr.grh.next_hdr = 27;
hdr.grh.hop_lim = 1;
hdr.grh.src_gid = 0xfe800000000000000ac0ebfffe24686b;
hdr.grh.dst_gid = 0xfe800000000000000ac0ebfffe247b8b;
}
action setInfiniband_BTH()
{
hdr.bth.setValid();
hdr.bth.opcode = 0b00010100; // Default is RDMA Fetch&Add
hdr.bth.solicitedEvent = 0;
hdr.bth.migReq = 1;
hdr.bth.padCount = 0;
hdr.bth.transportHeaderVersion = 0;
hdr.bth.partitionKey = 0xffff;
hdr.bth.fRes = 0;
hdr.bth.bRes = 0;
hdr.bth.reserved1 = 0;
        hdr.bth.destinationQP = eg_md.queue_pair; // Destination queue pair (QP) identifier
hdr.bth.ackRequest = 0;
hdr.bth.reserved2 = 0;
}
/* Fetch & Add RDMA operation */
action setInfiniband_AETH()
{
hdr.atomic_eth.setValid();
hdr.atomic_eth.virtualAddress = eg_md.memory_address_start + eg_md.memory_address_offset;
hdr.atomic_eth.rKey = eg_md.remote_key;
// Execute the increment operation
hdr.atomic_eth.data = 1;
}
apply{
setEthernet();
@stage(2)
{
            // If this is a TCP or UDP packet, convert it into an RDMA packet
if(hdr.tcp.isValid() || hdr.udp.isValid()){
// GRH Header
setInfiniband_GRH();
// BTH Header
setInfiniband_BTH();
                // Read and update this RDMA connection's PSN
hdr.bth.packetSequenceNumber = get_psn.execute(eg_md.qp_reg_index);
// AETH Header
setInfiniband_AETH();
// iCRC Header
hdr.icrc.setValid();
}
}
        // Invalidate the original packet's headers
hdr.ipv4.setInvalid();
if (hdr.tcp.isValid()){
hdr.tcp.setInvalid();
}
else if (hdr.udp.isValid()){
hdr.udp.setInvalid();
}
}
}
/* Egress control block: reviewed, no issues */
control SwitchEgress(inout headers hdr,
inout egress_metadata_t eg_md,
in egress_intrinsic_metadata_t eg_intr_md,
in egress_intrinsic_metadata_from_parser_t eg_intr_from_prsr,
inout egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr,
inout egress_intrinsic_metadata_for_output_port_t eg_intr_md_for_oport)
{
ControlPrepareMemoryAddress() PrepareMemoryAddress;
ControlConvertToRDMA() ConvertToRDMA;
apply{
if(hdr.ipv4.srcAddr != 0xc0a80403 && hdr.ipv4.dstAddr != 0xc0a80403){
eg_md.srcIP = hdr.ipv4.srcAddr;
eg_md.dstIP = hdr.ipv4.dstAddr;
eg_md.proto = hdr.ipv4.protocol;
if(hdr.tcp.isValid()){
eg_md.srcPort = hdr.tcp.srcPort;
eg_md.dstPort = hdr.tcp.dstPort;
}
else if(hdr.udp.isValid()){
eg_md.srcPort = hdr.udp.srcPort;
eg_md.dstPort = hdr.udp.dstPort;
}
PrepareMemoryAddress.apply(hdr, eg_md, eg_intr_md);
ConvertToRDMA.apply(hdr, eg_md);
}
}
}
/* Egress deparser: reviewed, no issues */
control SwitchEgressDeparser(packet_out pkt, inout headers hdr, in egress_metadata_t eg_md, in egress_intrinsic_metadata_for_deparser_t eg_dprsr_md)
{
Checksum() ipv4_checksum;
apply{
// Update IPv4 checksum
hdr.ipv4.hdrChecksum = ipv4_checksum.update(
{hdr.ipv4.version,
hdr.ipv4.ihl,
hdr.ipv4.dscp,
hdr.ipv4.ecn,
hdr.ipv4.totalLen,
hdr.ipv4.identification,
hdr.ipv4.flags,
hdr.ipv4.fragOffset,
hdr.ipv4.ttl,
hdr.ipv4.protocol,
hdr.ipv4.srcAddr,
hdr.ipv4.dstAddr});
pkt.emit(hdr.ethernet);
pkt.emit(hdr.ipv4);
pkt.emit(hdr.udp);
pkt.emit(hdr.tcp);
pkt.emit(hdr.grh);
pkt.emit(hdr.bth);
pkt.emit(hdr.atomic_eth);
pkt.emit(hdr.icrc);
}
}
Pipeline(SwitchIngressParser(),
SwitchIngress(),
SwitchIngressDeparser(),
SwitchEgressParser(),
SwitchEgress(),
SwitchEgressDeparser()
) pipe;
Switch(pipe) main;
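
For reference, the slot-to-address mapping implemented by ControlPrepareMemoryAddress and setInfiniband_AETH can be modelled off-switch. The sketch below is illustrative only: it assumes zlib's CRC32 matches the Tofino hash configuration (field ordering and polynomial parameters are not guaranteed to agree) and reuses the constants from this commit.

#!/usr/bin/env python3
# Off-switch model of the CMS slot -> RDMA virtual address mapping (sketch).
import struct
import zlib

CMS_RDMA_PAYLOAD_SIZE = 8  # bytes per slot, as in the P4 #define

def target_address(src_ip, dst_ip, src_port, dst_port, proto, copy_num,
                   rank_num_slots, memory_start):
    # Hash the flow key plus the multicast copy number, as cal_slot_offset() does
    key = struct.pack("!IIHHBB", src_ip, dst_ip, src_port, dst_port, proto, copy_num)
    digest = zlib.crc32(key) & 0xffffffff
    # bound_memory_slot(): AND with (rank_num_slots - 1); needs a power of two
    slot_offset = digest & (rank_num_slots - 1)
    # get_start_slot(): each copy writes into its own CMS row
    start_slot = copy_num * rank_num_slots
    # slot -> byte offset -> virtual address, as in setInfiniband_AETH()
    return memory_start + (start_slot + slot_offset) * CMS_RDMA_PAYLOAD_SIZE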

@@ -0,0 +1 @@
94813711802864

@@ -0,0 +1 @@
32768

@@ -0,0 +1 @@
0

@@ -0,0 +1 @@
366

@@ -0,0 +1 @@
299151

@@ -0,0 +1 @@
./../../bf-sde-9.2.0/run_bfshell.sh -b table_rules.py -i

@@ -0,0 +1,2 @@
. ../../bf-sde-9.2.0/set_sde.bash
./../../bf-sde-9.2.0/run_switchd.sh -p pure_cms_rdma

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
import datetime
import ipaddress
import hashlib
import struct
import os
from scapy.all import *
p4 = bfrt.pure_cms_rdma.pipe
mirror = bfrt.mirror
pre = bfrt.pre
logfile = "/root/wly_experiment/pure_cms_rdma/log_results/sketch.log"
rdma_dir = "/root/wly_experiment/pure_cms_rdma/rdma_metadata"
# Tracks whether the RDMA metadata sent by the Collector was successfully stored to its files
store_flag = False
# Keyword arguments to the add_with_XXX() functions must be lowercase, otherwise they are not recognized
# Static forwarding rules matching the testbed wiring
forwardingRules = [
("6c:ec:5a:62:a8:00", 66), # Tofino CPU 1
("08:c0:eb:24:7b:8b", 148), # Collector
("08:c0:eb:24:68:6b", 180) # Generator
]
# Map each Collector's Ethernet address to its Tofino port (make sure all these ports appear in mcRules)
collectorEthertoPorts = [
("08:c0:eb:24:7b:8b", 148),
]
# Size of each CMS slot; 8 bytes (64 bits) by default
bucket_size_B = 8
# Multicast rules mapping an egress port and a hash-function count (duplicate_num) to a multicast group ID
mcRules = [
{
"mgid":1,
"egressPort":148,
"duplicate_num":1
},
{
"mgid":2,
"egressPort":148,
"duplicate_num":2
},
{
"mgid":3,
"egressPort":148,
"duplicate_num":3
}
]
def log(text):
""" 打印日志 """
global logfile, datetime
line = "%s \t DigProc: %s" %(str(datetime.datetime.now()), str(text))
print(line)
    # Overwrite the log file (it keeps only the most recent line)
f = open(logfile, "w+")
f.write(line + "\n")
f.close()
# RDMA-connection-metadata reader; verified
def getRDMAMetadata():
    ''' Read the RDMA connection metadata from disk '''
global log, os, rdma_dir
log("Reading collector RDMA metadata from disk...")
try:
        # Initial packet sequence number
f = open("%s/tmp_psn" % rdma_dir, "r")
start_psn = int(f.read())
f.close()
        # Queue pair number
f = open("%s/tmp_qpnum" % rdma_dir, "r")
queue_pair = int(f.read())
f.close()
        # Starting memory address
f = open("%s/tmp_memaddr" % rdma_dir, "r")
memory_start = int(f.read())
f.close()
        # Length of the memory region available for data
f = open("%s/tmp_memlen" % rdma_dir, "r")
memory_length = int(f.read())
f.close()
        # Remote key (grants permission to access the remote host's memory)
f = open("%s/tmp_rkey" % rdma_dir, "r")
remote_key = int(f.read())
f.close()
    except:
        log(" !!! !!! Failed to read RDMA metadata !!! !!! ")
        raise
    log("Collector RDMA metadata extracted from disk!")
    return queue_pair, start_psn, memory_start, memory_length, remote_key
# RDMA-connection-metadata writer; verified
def storeRDMAMetadata(packet):
    """ Parse the received packet and store the extracted RDMA metadata in the corresponding files on disk """
global log, store_flag, struct, rdma_dir
    # The RDMA connection information arrives in a UDP payload
    log("Receive and store RDMA connection metadata to %s" % rdma_dir)
udp_payload = packet["UDP"].load
psn, queue_pair, memory_start, memory_length, remote_key = struct.unpack("!IIQII", udp_payload)
    # Write each parsed RDMA metadata field to its file
f = open("%s/tmp_psn" % rdma_dir, "w")
f.write(str(psn))
f.close()
f = open("%s/tmp_qpnum" % rdma_dir, "w")
f.write(str(queue_pair))
f.close()
f = open("%s/tmp_memaddr" % rdma_dir, "w")
f.write(str(memory_start))
f.close()
f = open("%s/tmp_memlen" % rdma_dir, "w")
f.write(str(memory_length))
f.close()
f = open("%s/tmp_rkey" % rdma_dir, "w")
f.write(str(remote_key))
f.close()
store_flag = True
# (Ingress) forwarding-rule installer; verified
def insertForwardingRules():
    ''' Install forwarding rules (destination Ethernet address -> egress port) '''
global p4, log, ipaddress, forwardingRules
log("Inserting Forwarding rules...")
    # Forward to the egress port that matches the destination Ethernet address (dstAddr)
for dstAddr, egrPort in forwardingRules:
log("DstAddr: %s -> EgressPort: %i" % (dstAddr, egrPort))
p4.SwitchIngress.tbl_forward.add_with_forward(dstaddr=dstAddr, port=egrPort)
# (Ingress) Key-Write rule installer; verified
def insertKeyWriteRules(duplicate_num):
    ''' Install the Key-Write rules (these go to the Ingress control):
    (Collector address <-> egress port, egress port <-> duplicate_num, mgid) '''
global p4, log, collectorEthertoPorts, mcRules
log("Inserting KeyWrite rules...")
    # For each Collector's Ethernet address and its egressPort
for dstAddr, egrPort in collectorEthertoPorts:
log("%s, %i, %i" % (dstAddr, egrPort, duplicate_num))
        # Find the right multicast group ID in mcRules (matching both duplicate_num and egressPort)
rule = [ r for r in mcRules if r["duplicate_num"]==duplicate_num and r["egressPort"]==egrPort]
multicastGroupID = rule[0]["mgid"]
log("Adding multiwrite rule %s, N = %i - %i" % (dstAddr, duplicate_num, multicastGroupID))
p4.SwitchIngress.tbl_prep_multicast.add_with_prep_multiwrite(dstaddr=dstAddr, mcast_grp=multicastGroupID)
# (Egress) Prep-MemoryAddress rule installer; verified
def insertPrepMemoryAddressRules(duplicate_num):
    ''' Configure the entries needed by the KeyWrite RDMA connection (port 1337) '''
global p4, log, ipaddress, collectorEthertoPorts, getRDMAMetadata, bucket_size_B
log("Inserting PrepKeyWrite rules...")
    # Register index for storing packet sequence numbers (one index per Collector)
psn_reg_index = 0
    # Read the RDMA connection metadata
queue_pair, start_psn, memory_start, memory_length, remote_key = getRDMAMetadata()
for dstAddr, _ in collectorEthertoPorts:
log("Inserting memory slot rules for collector ip %s" % dstAddr)
        # Total slots allocated on the Collector: memory_length / (bucket size in bytes)
collector_num_storage_slots = int(memory_length / bucket_size_B)
        # Slots per CMS row (floored); duplicate_num counts only the replicas, hence the +1
cms_rank_slots = int(collector_num_storage_slots / (duplicate_num + 1))
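        # e.g. memory_length = 32768 and duplicate_num = 3 give
        # 32768 / 8 = 4096 slots, split into 4 rows of 1024 slots each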
for i in range(duplicate_num+1):
log("multicast_pkt_num is: %d" % i)
log("start_slot is: %d" % (i * cms_rank_slots))
p4.SwitchEgress.PrepareMemoryAddress.tbl_get_start_slot.add_with_get_start_slot(multicast_pkt_num=i, start_slot=i*cms_rank_slots)
        # Seed the register that stores the packet sequence number
p4.SwitchEgress.ConvertToRDMA.reg_rdma_sequence_number.mod(f1=start_psn, register_index=psn_reg_index)
log("Inserting KeyWrite RDMA Metadata lookup rule for collector ip %s" % dstAddr)
        # Build the entry carrying the Collector's RDMA metadata and install it into the lookup table
p4.SwitchEgress.PrepareMemoryAddress.tbl_getRDMAMetadata.add_with_set_server_info(dstaddr=dstAddr, remote_key=remote_key, queue_pair=queue_pair, memory_address_start=memory_start, rank_num_slots=cms_rank_slots, qp_reg_index=psn_reg_index)
        # Advance the register index (useful once there are multiple Collectors)
psn_reg_index += 1
# (Packet Replication Engine, PRE) multicast configuration; verified
def ConfigMulticast(duplicate_num):
global p4, pre, log, mcRules
log("Configuring mirroring sessions...")
lastNodeID = 0
for mcastGroup in mcRules:
        # Skip mcRules whose duplicate_num does not match
if mcastGroup["duplicate_num"] != duplicate_num:
continue
        # Multicast group ID
mgid = mcastGroup["mgid"]
        # Egress port of this multicast group
egressPort = mcastGroup["egressPort"]
        # Number of hash functions
duplicate_num = mcastGroup["duplicate_num"]
log("Setting up multicast %i, egress port: %i, duplicate_num: %i" % (mgid, egressPort, duplicate_num))
        # Every multicast node (RID) is unique, and each points at egressPort
nodeIDs = []
log("Adding multicast nodes...")
for _ in range(duplicate_num):
lastNodeID += 1
log("Creating node %i" % lastNodeID)
pre.node.add(dev_port=[egressPort], multicast_node_id=lastNodeID)
nodeIDs.append(lastNodeID)
log("Creating the multicast group")
        # Exclusion IDs are all disabled
pre.mgid.add(mgid=mgid, multicast_node_id=nodeIDs, multicast_node_l1_xid=[0]*duplicate_num, multicast_node_l1_xid_valid=[False]*duplicate_num)
# Ingress table-rule installation; verified
def SetIngressTableRules(duplicate_num):
    """ Install the entries for the ingress-stage tables """
global p4, log, insertForwardingRules, insertKeyWriteRules
log("--------------- Ingress Pipeline ---------------")
insertForwardingRules()
insertKeyWriteRules(duplicate_num)
# Egress table-rule installation; verified
def SetEgressTableRules(duplicate_num):
    """ Install the entries for the egress-stage tables """
global p4, log, insertPrepMemoryAddressRules
log("--------------- Egress Pipeline ---------------")
insertPrepMemoryAddressRules(duplicate_num=duplicate_num)
log("Starting configure Tofino Switch...")
# Configure the multicast forwarding rules in the PRE
ConfigMulticast(duplicate_num=3)
# Install the ingress table entries
SetIngressTableRules(duplicate_num=3)
# Then wait for the RDMA metadata sent over from the Generator side before installing the egress table entries
filter_expr = "udp and (src port 1111) and (dst port 5555)"
sniff(filter=filter_expr, iface="enp2s0f0", prn=storeRDMAMetadata, count=1)
if store_flag:
SetEgressTableRules(duplicate_num=3)
else:
log("*** Cannot receive and process RDMA metadata correctly! ***")
log("Bootstrap complete")

@@ -0,0 +1,77 @@
[global]
strict init = false
buffer min = 1024
buffer max = 2MB
default format = "%d(%F %X).%us %-6V (%c:%F:%U:%L) - %m%n"
file perms = 666
fsync period = 1K
[levels]
[formats]
null = "%n"
print = "[%-10.3d(%F)]%n"
file_format = "%d(%F %X).%us %-5V %c %m%n"
console_format = "%d(%F %X).%us %c %5V - %m%n"
[rules]
BF_SYS.ERROR >stdout;console_format
BF_SYS.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_LLD.ERROR >stdout;console_format
BF_LLD.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_PIPE.ERROR >stdout;console_format
BF_PIPE.ERROR "bf_drivers.log", 5M * 5 ;file_format
BF_TM.ERROR >stdout;console_format
BF_TM.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_MC.ERROR >stdout;console_format
BF_MC.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_PKT.ERROR >stdout;console_format
BF_PKT.ERROR "bf_drivers.log", 5M * 5 ;file_format
BF_DVM.ERROR >stdout;console_format
BF_DVM.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_PORT.ERROR >stdout;console_format
BF_PORT.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_AVAGO.ERROR >stdout;console_format
BF_AVAGO.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_DRU.ERROR >stdout;console_format
BF_DRU.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_API.ERROR >stdout;console_format
BF_API.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_SAI.ERROR >stdout;console_format
BF_SAI.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_PI.ERROR >stdout;console_format
BF_PI.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_PLTFM.ERROR >stdout;console_format
BF_PLTFM.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_PAL.ERROR >stdout;console_format
BF_PAL.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_PM.ERROR >stdout;console_format
BF_PM.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_KNET.ERROR >stdout;console_format
BF_KNET.DEBUG "bf_drivers.log", 5M * 5 ;file_format
BF_BFRT.ERROR >stdout;console_format
BF_BFRT.ERROR "bf_drivers.log", 5M * 5 ;file_format
BF_P4RT.ERROR >stdout;console_format
BF_P4RT.DEBUG "bf_drivers.log", 5M * 5 ;file_format
*.ERROR >syslog , LOG_USER