SwDMA/pure_cms_rdma/pure_cms_rdma.p4
2025-04-16 18:41:44 +08:00

761 lines
18 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <core.p4>
#include <tna.p4>
#define ETHERTYPE_IPV4 0x0800
#define ETHERTYPE_ROCE 0x8915
#define IPv4_PROTO_TCP 0x06
#define IPv4_PROTO_UDP 0x11
#define CMS_RDMA_PAYLOAD_SIZE 8
#ifndef MAX_SUPPORTED_QPS
#define MAX_SUPPORTED_QPS 256 // Maximum number of supported QPs. Specifies table and register sizes
// #define MAX_SUPPORTED_QPS 65536 // Used when benchmarking tons of QPs
#endif
typedef bit<32> ipv4_address_t;
typedef bit<32> iCRC_t;
typedef bit<32> remote_key_t;
typedef bit<24> queue_pair_t;
typedef bit<24> psn_t; // RoCEv2 中的数据包序列号 (Packet sequence number)
typedef bit<16> qp_reg_index_t; // 用于为每个 QP 存放其 PSN. 该字段是充当那个寄存器的索引号
typedef bit<32> slot_nums_t;
typedef bit<32> memory_slot_t; // 内存插槽(空隙). 由 Key-Write 和 Append 原语所共享, 出于某些原因, 它们都被限制在最大 32 bits
typedef bit<64> memory_address_t; // 物理内存地址(共 2^64)
// 定义不同的数据包类型 (Normal 和 Mirror), 用于桥接报头中
typedef bit<8> pkt_type_t;
const pkt_type_t PKT_TYPE_NORMAL = 1;
const pkt_type_t PKT_TYPE_MIRROR = 2;
// 定义不同的镜像数据包类型 (I2E 和 E2E)
typedef bit<3> mirror_type_t;
const mirror_type_t MIRROR_TYPE_I2E = 1;
const mirror_type_t MIRROR_TYPE_E2E = 2;
// 14 Bytes
header ethernet_h{
bit<48> dstAddr;
bit<48> srcAddr;
bit<16> etherType;
}
// 20 Bytes
header ipv4_h{
bit<4> version;
bit<4> ihl;
bit<6> dscp;
bit<2> ecn;
bit<16> totalLen;
bit<16> identification;
bit<3> flags;
bit<13> fragOffset;
bit<8> ttl;
bit<8> protocol;
bit<16> hdrChecksum;
ipv4_address_t srcAddr;
ipv4_address_t dstAddr;
}
// 20 Bytes
header tcp_h {
bit<16> srcPort;
bit<16> dstPort;
bit<32> seq_no;
bit<32> ack_no;
bit<4> data_offset;
bit<4> res;
bit<8> flags;
bit<16> window;
bit<16> checksum;
bit<16> urgent_ptr;
}
// 8 Bytes
header udp_h{
bit<16> srcPort;
bit<16> dstPort;
bit<16> totalLen;
bit<16> checksum;
}
// Global Route Header (GRH) (40 Bytes)
header infiniband_grh_h{
bit<4> version;
bit<8> class;
bit<20> flow_lab;
bit<16> pay_len;
bit<8> next_hdr;
bit<8> hop_lim;
bit<128> src_gid;
bit<128> dst_gid;
}
// Base Transport Header (BTH) (12 Bytes)
header infiniband_bth_h{
bit<8> opcode;
bit<1> solicitedEvent;
bit<1> migReq;
bit<2> padCount;
bit<4> transportHeaderVersion;
bit<16> partitionKey;
bit<1> fRes;
bit<1> bRes;
bit<6> reserved1;
bit<24> destinationQP;
bit<1> ackRequest;
bit<7> reserved2;
psn_t packetSequenceNumber;
}
// Atomic Extended Transport Header (ATOMIC_ETH) (28 bytes)
header infiniband_atomiceth_h{
memory_address_t virtualAddress;
bit<32> rKey;
bit<64> data;
bit<64> compare;
}
// iCRC 字段 (4 Bytes)
header infiniband_icrc_h{
bit<32> iCRC;
}
header mirror_h{
pkt_type_t pkt_type;
}
header mirror_bridged_metadata_h{
pkt_type_t pkt_type;
}
struct headers{
mirror_bridged_metadata_h bridged_md;
/* Normal Header */
ethernet_h ethernet;
ipv4_h ipv4;
udp_h udp;
tcp_h tcp;
/* RoCEv2 Header */
infiniband_grh_h grh;
infiniband_bth_h bth;
infiniband_atomiceth_h atomic_eth;
infiniband_icrc_h icrc;
}
struct ingress_metadata_t{
pkt_type_t pkt_type;
MirrorId_t mirror_session;
}
struct egress_metadata_t{
/* Store Flowkey */
ipv4_address_t srcIP;
ipv4_address_t dstIP;
bit<16> srcPort;
bit<16> dstPort;
bit<8> proto;
/* RDMA Metadata */
psn_t rdma_psn;
remote_key_t remote_key;
queue_pair_t queue_pair;
/* Used to locate where to store in the rdma memory */
memory_address_t memory_address_start;
memory_address_t memory_address_offset;
/* Slot is used as an intermediary for calculating rdma memory address */
memory_slot_t colletcor_dst_slot;
memory_slot_t rank_num_slots;
memory_slot_t rank_slot_offset;
memory_slot_t rank_start_slot;
qp_reg_index_t qp_reg_index;
bit<8> multicast_pkt_num;
}
/* 入口解析器部分校对完毕, 没有问题 */
parser TofinoIngressParser(packet_in pkt,
/* User */
inout ingress_metadata_t ig_md,
/* Intrinsic */
out ingress_intrinsic_metadata_t ig_intr_md)
{
state start{
pkt.extract(ig_intr_md);
transition select(ig_intr_md.resubmit_flag){
1 : parse_resubmit;
0 : parse_port_metadata;
}
}
state parse_resubmit{
transition reject;
}
state parse_port_metadata{
pkt.advance(64); // Tofino 1
transition accept;
}
}
parser SwitchIngressParser(packet_in pkt,
/* User */
out headers hdr,
out ingress_metadata_t ig_md,
/* Intrinsic */
out ingress_intrinsic_metadata_t ig_intr_md)
{
TofinoIngressParser() tofino_parser;
state start{
tofino_parser.apply(pkt, ig_md, ig_intr_md);
transition parse_ethernet;
}
state parse_ethernet{
pkt.extract(hdr.ethernet);
transition select(hdr.ethernet.etherType){
ETHERTYPE_IPV4: parse_ipv4;
ETHERTYPE_ROCE: parse_grh;
default: accept;
}
}
state parse_ipv4{
pkt.extract(hdr.ipv4);
transition select(hdr.ipv4.protocol){
IPv4_PROTO_UDP: parse_udp;
IPv4_PROTO_TCP: parse_tcp;
default: accept;
}
}
state parse_tcp{
pkt.extract(hdr.tcp);
transition accept;
}
state parse_udp{
pkt.extract(hdr.udp);
transition accept;
}
state parse_grh{
pkt.extract(hdr.grh);
transition accept;
}
}
/* 入口控制块部分校对完毕, 没有问题 */
control SwitchIngress(inout headers hdr,
/* User */
inout ingress_metadata_t ig_md,
/* Intrinsic */
in ingress_intrinsic_metadata_t ig_intr_md,
in ingress_intrinsic_metadata_from_parser_t ig_intr_prsr_md,
inout ingress_intrinsic_metadata_for_deparser_t ig_intr_dprsr_md,
inout ingress_intrinsic_metadata_for_tm_t ig_intr_tm_md)
{
/* 根据目的以太网地址来设置多播组 ID, 从而对数据包执行多播操作 */
action prep_multiwrite(bit<16> mcast_grp)
{
ig_intr_tm_md.mcast_grp_a = mcast_grp;
}
table tbl_prep_multicast
{
key = {
hdr.ethernet.dstAddr: exact;
}
actions = {
prep_multiwrite;
@defaultonly NoAction;
}
default_action = NoAction;
size = 1024;
}
/* 根据目的以太网地址决定数据包执行转发到对应的目的端口, 或者是丢弃该数据包 */
action forward(PortId_t port)
{
ig_intr_tm_md.ucast_egress_port = port;
}
action to_cpu()
{
ig_intr_tm_md.ucast_egress_port = 66;
}
action drop()
{
ig_intr_dprsr_md.drop_ctl = 1;
}
table tbl_forward
{
key = {
hdr.ethernet.dstAddr: exact;
}
actions = {
forward;
to_cpu;
drop;
}
default_action = to_cpu;
size = 1024;
}
apply
{
tbl_forward.apply();
tbl_prep_multicast.apply();
// 为 Egress Control 准备桥接元数据
hdr.bridged_md.setValid();
hdr.bridged_md.pkt_type = PKT_TYPE_NORMAL;
}
}
/* 入口逆解析器部分校对完毕, 没有问题 */
control SwitchIngressDeparser(packet_out pkt,
inout headers hdr,
in ingress_metadata_t ig_md,
in ingress_intrinsic_metadata_for_deparser_t ig_intr_dprsr_md)
{
Mirror() mirror;
apply{
// 如果时 Ingress-to-Egress 镜像操作
if (ig_intr_dprsr_md.mirror_type == MIRROR_TYPE_I2E){
// Emit Mirror并附加上 mirror_h 报头
mirror.emit<mirror_h>(ig_md.mirror_session, {ig_md.pkt_type});
}
pkt.emit(hdr);
}
}
/* 出口解析器部分校对完毕, 没有问题 */
parser SwitchEgressParser(packet_in pkt,
/* User */
out headers hdr,
out egress_metadata_t eg_md,
/* Intrinsic */
out egress_intrinsic_metadata_t eg_intr_md)
{
state start{
pkt.extract(eg_intr_md);
transition parse_metadata;
}
state parse_metadata{
mirror_h mirror_md = pkt.lookahead<mirror_h>();
// 根据镜像元数据中的 pkt_type 字段决定下一步要执行的解析状态
transition select(mirror_md.pkt_type){
PKT_TYPE_MIRROR: parse_mirror_md;
PKT_TYPE_NORMAL: parse_bridged_md;
default: accept;
}
}
// 提取桥接元数据
state parse_bridged_md{
pkt.extract(hdr.bridged_md);
transition parse_ethernet;
}
// 如果是镜像数据包, 在本方案中表示是遥测报告数据包, 提取其镜像元数据
state parse_mirror_md{
mirror_h mirror_md;
pkt.extract(mirror_md);
transition parse_ethernet;
}
state parse_ethernet{
pkt.extract(hdr.ethernet);
transition select(hdr.ethernet.etherType){
ETHERTYPE_IPV4: parse_ipv4;
ETHERTYPE_ROCE: parse_grh;
default: accept;
}
}
state parse_ipv4{
pkt.extract(hdr.ipv4);
transition select(hdr.ipv4.protocol){
IPv4_PROTO_UDP: parse_udp;
IPv4_PROTO_TCP: parse_tcp;
default: accept;
}
}
state parse_udp{
pkt.extract(hdr.udp);
transition accept;
}
state parse_tcp{
pkt.extract(hdr.tcp);
transition accept;
}
state parse_grh{
pkt.extract(hdr.grh);
transition accept;
}
}
/* 准备 KeyWrite 控制块部分校对完毕, 没有问题 */
control ControlPrepareMemoryAddress(inout headers hdr,
inout egress_metadata_t eg_md,
in egress_intrinsic_metadata_t eg_intr_md)
{
Hash<slot_nums_t>(HashAlgorithm_t.CRC32) hash_slot;
// 用于区分多播中产生的多个数据包 (因为每来一个数据包都要递增寄存器中的值, 并且还是循环)
Register<bit<8>, bit<1>>(MAX_SUPPORTED_QPS) reg_multicast_iterator;
RegisterAction<bit<8>, bit<1>, bit<8>>(reg_multicast_iterator) get_pkt_number = {
void apply(inout bit<8> stored, out bit<8> output)
{
// 首先将内部存储的 stored 输出给 output
output = stored;
// 如果当前存储的值大于 hash_nums - 1, 则需要将 stored 置 0
if(stored >= 3){
stored = 0;
}
// 否则, 对 stored 进行递增
else{
stored = stored + 1;
}
}
};
// 根据当前数据包的多播 ID 号, 来获得其在 CMS 中存储的起始插槽位置
action get_start_slot(memory_slot_t start_slot)
{
// 获取起始插槽位置 (CMS 中每行的开头)
eg_md.rank_start_slot = start_slot;
}
table tbl_get_start_slot{
key = {
eg_md.multicast_pkt_num: exact;
}
actions = {
get_start_slot;
NoAction;
}
size = 8;
default_action = NoAction();
}
// 根据目的 IPv4 地址, 获取 Collector 的 RDMA 元数据信息
action set_server_info(remote_key_t remote_key, queue_pair_t queue_pair, memory_address_t memory_address_start, memory_slot_t rank_num_slots, qp_reg_index_t qp_reg_index)
{
eg_md.remote_key = remote_key;
eg_md.queue_pair = queue_pair;
eg_md.memory_address_start = memory_address_start;
eg_md.rank_num_slots = rank_num_slots;
eg_md.qp_reg_index = qp_reg_index;
}
table tbl_getRDMAMetadata
{
key = {
hdr.ethernet.dstAddr: exact;
}
actions = {
set_server_info;
}
// 单个 Translator 不可能负责比这更多的工作
size = MAX_SUPPORTED_QPS;
}
// 通过哈希函数计算出插槽的偏移量 (计算结果为 bit<32> 类型)
action cal_slot_offset()
{
eg_md.rank_slot_offset = hash_slot.get({eg_md.srcIP,
eg_md.dstIP,
eg_md.srcPort,
eg_md.dstPort,
eg_md.proto,
eg_md.multicast_pkt_num});
}
table tbl_cal_slot_offset
{
key = {}
actions = {
cal_slot_offset;
}
size = 1;
default_action = cal_slot_offset();
}
// 将这个偏移量与实际可用的插槽数量进行绑定 (通过与 mask 进行按位与运算)
action bound_memory_slot(memory_slot_t mask)
{
eg_md.rank_slot_offset = eg_md.rank_slot_offset & mask;
}
table tbl_bound_memory_slot
{
key = {
eg_md.rank_num_slots: exact;
}
actions = {
bound_memory_slot;
}
const entries = {
2: bound_memory_slot(0x00000001);
4: bound_memory_slot(0x00000003);
8: bound_memory_slot(0x00000007);
16: bound_memory_slot(0x0000000f);
32: bound_memory_slot(0x0000001f);
64: bound_memory_slot(0x0000003f);
128: bound_memory_slot(0x0000007f);
256: bound_memory_slot(0x000000ff);
512: bound_memory_slot(0x000001ff);
1024: bound_memory_slot(0x000003ff);
2048: bound_memory_slot(0x000007ff);
4096: bound_memory_slot(0x00000fff);
8192: bound_memory_slot(0x00001fff);
16384: bound_memory_slot(0x00003fff);
32768: bound_memory_slot(0x00007fff);
65536: bound_memory_slot(0x0000ffff);
131072: bound_memory_slot(0x0001ffff);
262144: bound_memory_slot(0x0003ffff);
524288: bound_memory_slot(0x0007ffff);
1048576: bound_memory_slot(0x000fffff);
2097152: bound_memory_slot(0x001fffff);
4194304: bound_memory_slot(0x003fffff);
8388608: bound_memory_slot(0x007fffff);
16777216: bound_memory_slot(0x00ffffff);
33554432: bound_memory_slot(0x01ffffff);
67108864: bound_memory_slot(0x03ffffff);
134217728: bound_memory_slot(0x07ffffff);
268435456: bound_memory_slot(0x0fffffff);
536870912: bound_memory_slot(0x1fffffff);
1073741824: bound_memory_slot(0x3fffffff);
2147483648: bound_memory_slot(0x7fffffff);
//4294967296: bound_memory_slot(0xffffffff); //does not fit in 32-bit
}
size=64;
}
apply
{
// 获取当前数据包的多播 ID 号
eg_md.multicast_pkt_num = get_pkt_number.execute(0);
// 获取 RDMA 元数据信息
tbl_getRDMAMetadata.apply();
@stage(1)
{
// 计算起始插槽位置和插槽偏移量, 然后联合起来计算出目标插槽位置
tbl_get_start_slot.apply();
tbl_cal_slot_offset.apply();
tbl_bound_memory_slot.apply();
eg_md.colletcor_dst_slot = eg_md.rank_start_slot + eg_md.rank_slot_offset;
// 将内存插槽转换为在物理内存地址中的偏移量
// 即需要乘以有效载荷的字节数 (此处为 8即向左位移 3)
eg_md.memory_address_offset = (memory_address_t)(eg_md.colletcor_dst_slot);
eg_md.memory_address_offset = eg_md.memory_address_offset * CMS_RDMA_PAYLOAD_SIZE;
}
}
}
/* 生成 RDMA 数据包控制块部分校对完毕, 没有问题 */
control ControlConvertToRDMA(inout headers hdr,
inout egress_metadata_t eg_md)
{
// 分配 32 位的寄存器数组来保存 24 位的 PSN
Register<bit<32>, qp_reg_index_t>(MAX_SUPPORTED_QPS) reg_rdma_sequence_number;
RegisterAction<psn_t, qp_reg_index_t, psn_t>(reg_rdma_sequence_number) get_psn = {
void apply(inout psn_t stored_psn, out psn_t output)
{
// 首先输出尚未递增的 PSN
output = stored_psn;
// 然后对 PSN 进行递增并覆盖原有的值
stored_psn = stored_psn + 1;
}
};
RegisterAction<psn_t, qp_reg_index_t, psn_t>(reg_rdma_sequence_number) set_psn = {
void apply(inout psn_t stored_psn, out psn_t output)
{
// 将 PSN 重新同步为 ACK 获取的值
stored_psn = eg_md.rdma_psn;
output = stored_psn;
}
};
action setEthernet()
{
hdr.ethernet.setValid();
hdr.ethernet.srcAddr = 0x08c0eb24686b; // Generator
hdr.ethernet.dstAddr = 0x08c0eb247b8b; // Collector
hdr.ethernet.etherType = ETHERTYPE_ROCE;
}
action setInfiniband_GRH()
{
hdr.grh.setValid();
hdr.grh.version = 6;
hdr.grh.class = 2;
hdr.grh.flow_lab = 0;
hdr.grh.pay_len = 44;
hdr.grh.next_hdr = 27;
hdr.grh.hop_lim = 1;
hdr.grh.src_gid = 0xfe800000000000000ac0ebfffe24686b;
hdr.grh.dst_gid = 0xfe800000000000000ac0ebfffe247b8b;
}
action setInfiniband_BTH()
{
hdr.bth.setValid();
hdr.bth.opcode = 0b00010100; // Default is RDMA Fetch&Add
hdr.bth.solicitedEvent = 0;
hdr.bth.migReq = 1;
hdr.bth.padCount = 0;
hdr.bth.transportHeaderVersion = 0;
hdr.bth.partitionKey = 0xffff;
hdr.bth.fRes = 0;
hdr.bth.bRes = 0;
hdr.bth.reserved1 = 0;
hdr.bth.destinationQP = eg_md.queue_pair; // 指定目的地队列对 (QP) 标识符
hdr.bth.ackRequest = 0;
hdr.bth.reserved2 = 0;
}
/* Fetch & Add RDMA operation */
action setInfiniband_AETH()
{
hdr.atomic_eth.setValid();
hdr.atomic_eth.virtualAddress = eg_md.memory_address_start + eg_md.memory_address_offset;
hdr.atomic_eth.rKey = eg_md.remote_key;
// Execute the increment operation
hdr.atomic_eth.data = 1;
}
apply{
setEthernet();
@stage(2)
{
// 如果为 TCP 或 UDP 数据包, 则转换为 RDMA 数据包
if(hdr.tcp.isValid() || hdr.udp.isValid()){
// GRH Header
setInfiniband_GRH();
// BTH Header
setInfiniband_BTH();
// 读取并更新该 RDMA 连接的 PSN
hdr.bth.packetSequenceNumber = get_psn.execute(eg_md.qp_reg_index);
// AETH Header
setInfiniband_AETH();
// iCRC Header
hdr.icrc.setValid();
}
}
// 使原始数据包的相关报头失效
hdr.ipv4.setInvalid();
if (hdr.tcp.isValid()){
hdr.tcp.setInvalid();
}
else if (hdr.udp.isValid()){
hdr.udp.setInvalid();
}
}
}
/* 出口控制块部分校对完毕, 没有问题 */
control SwitchEgress(inout headers hdr,
inout egress_metadata_t eg_md,
in egress_intrinsic_metadata_t eg_intr_md,
in egress_intrinsic_metadata_from_parser_t eg_intr_from_prsr,
inout egress_intrinsic_metadata_for_deparser_t eg_intr_md_for_dprsr,
inout egress_intrinsic_metadata_for_output_port_t eg_intr_md_for_oport)
{
ControlPrepareMemoryAddress() PrepareMemoryAddress;
ControlConvertToRDMA() ConvertToRDMA;
apply{
if(hdr.ipv4.srcAddr != 0xc0a80403 && hdr.ipv4.dstAddr != 0xc0a80403){
eg_md.srcIP = hdr.ipv4.srcAddr;
eg_md.dstIP = hdr.ipv4.dstAddr;
eg_md.proto = hdr.ipv4.protocol;
if(hdr.tcp.isValid()){
eg_md.srcPort = hdr.tcp.srcPort;
eg_md.dstPort = hdr.tcp.dstPort;
}
else if(hdr.udp.isValid()){
eg_md.srcPort = hdr.udp.srcPort;
eg_md.dstPort = hdr.udp.dstPort;
}
PrepareMemoryAddress.apply(hdr, eg_md, eg_intr_md);
ConvertToRDMA.apply(hdr, eg_md);
}
}
}
/* 出口逆解析器部分校对完毕, 没有问题 */
control SwitchEgressDeparser(packet_out pkt, inout headers hdr, in egress_metadata_t eg_md, in egress_intrinsic_metadata_for_deparser_t eg_dprsr_md)
{
Checksum() ipv4_checksum;
apply{
// Update IPv4 checksum
hdr.ipv4.hdrChecksum = ipv4_checksum.update(
{hdr.ipv4.version,
hdr.ipv4.ihl,
hdr.ipv4.dscp,
hdr.ipv4.ecn,
hdr.ipv4.totalLen,
hdr.ipv4.identification,
hdr.ipv4.flags,
hdr.ipv4.fragOffset,
hdr.ipv4.ttl,
hdr.ipv4.protocol,
hdr.ipv4.srcAddr,
hdr.ipv4.dstAddr});
pkt.emit(hdr.ethernet);
pkt.emit(hdr.ipv4);
pkt.emit(hdr.udp);
pkt.emit(hdr.tcp);
pkt.emit(hdr.grh);
pkt.emit(hdr.bth);
pkt.emit(hdr.atomic_eth);
pkt.emit(hdr.icrc);
}
}
Pipeline(SwitchIngressParser(),
SwitchIngress(),
SwitchIngressDeparser(),
SwitchEgressParser(),
SwitchEgress(),
SwitchEgressDeparser()
) pipe;
Switch(pipe) main;