First implementation of my MOST project
Check this first. When static web pages and dynamic web pages are served by different web servers, performance is better. We want to implement a similar function (the Nginx load balancer) in P4.
[Topology]
H1 (10.0.1.1) --------(10.0.1.254) (Nginx Load Balancer) (10.0.0.254) ----------(10.0.0.1) Web1
(10.0.2.254)----------(10.0.2.1) Web2
[Difficulties]
With Nginx, H1 first creates a TCP connection with the Nginx load balancer, and Nginx then creates another TCP connection with the backend web server. It is therefore easy for Nginx to parse the client request (the HTTP GET request) and learn whether it asks for a static or a dynamic web page, so the Nginx load balancer can forward the request to Web1 or Web2 according to the parsed result. In P4, however, we cannot know the type of the user's request in advance, because the request only arrives after the TCP handshake has completed. Therefore, when H1 establishes a connection with the P4 load balancer, the P4 load balancer creates multiple virtual connections, one with each backend server. When the P4 load balancer then receives the user's request, it forwards the request to the corresponding web server.
[basic16.p4]
#include <core.p4>
#include <v1model.p4>

// First four payload bytes of an HTTP GET request: ASCII "GET ".
const bit<32> TYPE_HTTP_REQ_GET = 0x47455420;
// ASCII " HT": the parser treats reaching this before "jpg" as "not a jpg request".
const bit<24> TYPE_A = 0x204854; // space + H + T
// ASCII "jpg": marks the request as a jpg request.
const bit<24> TYPE_JPG = 0x6a7067;

// Each line comment below must end its own line; when the declarations are
// collapsed onto one line the first comment hides every register after it.
register<bit<32>>(2) seqNo;  // seqNo(1): 1->2, seqNo(0): 3->1
register<bit<32>>(1) flow;   // 1: 1->3, 0: 1->2
register<bit<32>>(1) seqNo2; // remember the initial seqNo 2->1
register<bit<32>>(1) seqNo3; // remember the initial seqNo 3->1
register<bit<32>>(1) seqNo4; // remember the FIN/ACK SeqNo 1->3
// Per-packet scratch values. In the visible program the parser fills ipv4_sa,
// ipv4_da, tcpLength, tcp_sp, tcp_dp and if_index; the remaining fields are
// not referenced.
struct meta_t {
    bit<1>  do_forward;
    bit<32> ipv4_sa;       // copy of hdr.ipv4.srcAddr taken in parse_ipv4
    bit<32> ipv4_da;       // copy of hdr.ipv4.dstAddr taken in parse_ipv4
    bit<16> tcp_sp;        // copy of hdr.tcp.srcPort taken in parse_tcp
    bit<16> tcp_dp;        // copy of hdr.tcp.dstPort taken in parse_tcp
    bit<32> nhop_ipv4;
    bit<32> if_ipv4_addr;
    bit<48> if_mac_addr;
    bit<1>  is_ext_if;
    bit<16> tcpLength;     // ipv4.totalLen - 20; fed into the TCP pseudo-header checksum
    bit<8>  if_index;      // ingress port, truncated to 8 bits in the start state
}
// Ethernet II header (14 bytes).
header ethernet_t {
    bit<48> dstAddr;
    bit<48> srcAddr;
    bit<16> etherType;  // the parser continues to IPv4 only for 0x800
}
// Fixed 20-byte IPv4 header; IPv4 options are not modelled by this program.
header ipv4_t {
    bit<4>  version;
    bit<4>  ihl;
    bit<8>  diffserv;
    bit<16> totalLen;
    bit<16> identification;
    bit<3>  flags;
    bit<13> fragOffset;
    bit<8>  ttl;
    bit<8>  protocol;     // the parser continues to TCP only for 6
    bit<16> hdrChecksum;
    bit<32> srcAddr;
    bit<32> dstAddr;
}
// Fixed 20-byte TCP header. The controls test individual bits of `flags`
// with the masks 1, 2, 8 and 16.
header tcp_t {
    bit<16> srcPort;
    bit<16> dstPort;
    bit<32> seqNo;
    bit<32> ackNo;
    bit<4>  dataOffset;  // header length in 32-bit words; the parser requires >= 5
    bit<4>  res;
    bit<8>  flags;
    bit<16> window;
    bit<16> checksum;
    bit<16> urgentPtr;
}
// Raw TCP options, extracted as one variable-length blob of up to 320 bits
// (40 bytes); the parser computes the actual length from tcp.dataOffset.
header tcp_options_t {
    varbit<320> options;
}
// User metadata carried through the whole pipeline.
struct metadata {
    meta_t   meta;
    bit<8>   len;    // 8-bit scratch field
    bit<128> tmp;    // looked-ahead payload bytes, shifted while the parser scans them
    bit<1>   flag;   // forward-table key; set to 1 by parse_d and by ingress
    bit<13>  index;
    bit<32>  value;
    bit<32>  seqNo;  // scratch destination for seqNo register reads
    bit<32>  ackNo;
    bit<32>  flag2;  // value read from the `flow` register in ingress
}
// Headers this program can extract and emit, in wire order.
struct headers {
    ethernet_t    ethernet;
    ipv4_t        ipv4;
    tcp_t         tcp;
    tcp_options_t tcp_options;
}
// Program-specific parser errors raised through verify().
error {
    noAppLayerData,        // ipv4.totalLen does not exceed the IP + TCP header lengths
    TcpDataOffsetTooSmall  // tcp.dataOffset is below 5
}
// Parses Ethernet / IPv4 / TCP (+ options) and, for packets to TCP port 80
// whose payload starts with "GET ", classifies the request:
//   meta.flag = 1  when "jpg" is seen before " HT" in the scanned bytes,
//   meta.flag = 0  otherwise.
// Only the first 16 payload bytes are looked at (4 for "GET ", 12 scanned).
parser ParserImpl(packet_in packet, out headers hdr, inout metadata meta, inout standard_metadata_t standard_metadata) {
    state parse_ethernet {
        packet.extract(hdr.ethernet);
        transition select(hdr.ethernet.etherType) {
            16w0x800: parse_ipv4;
            default: accept;
        }
    }

    state parse_ipv4 {
        packet.extract(hdr.ipv4);
        meta.meta.ipv4_sa = hdr.ipv4.srcAddr;
        meta.meta.ipv4_da = hdr.ipv4.dstAddr;
        // TCP segment length for the pseudo-header checksum; assumes a
        // 20-byte IPv4 header (IPv4 options are not parsed by this program).
        meta.meta.tcpLength = hdr.ipv4.totalLen - 16w20;
        transition select(hdr.ipv4.protocol) {
            8w6: parse_tcp;
            default: accept;
        }
    }

    state parse_tcp {
        packet.extract(hdr.tcp);
        meta.meta.tcp_sp = hdr.tcp.srcPort;
        meta.meta.tcp_dp = hdr.tcp.dstPort;
        verify(hdr.tcp.dataOffset >= 5, error.TcpDataOffsetTooSmall);
        transition select(hdr.tcp.dataOffset) {
            // No TCP options: go straight to the application check. (This
            // used to be `accept`, so an option-less HTTP GET was never
            // classified.)
            5: parse_app;
            default: parse_tcp_options;
        }
    }

    state parse_tcp_options {
        // Options length in bits: (dataOffset - 5) 32-bit words, at most 320.
        bit<10> len = ((bit<10>)(hdr.tcp.dataOffset - 5) * 4 * 8);
        packet.extract(hdr.tcp_options, (bit<32>)len);
        transition parse_app;
    }

    state parse_app {
        transition select(hdr.tcp.dstPort) {
            80: parse_http;
            default: accept;
        }
    }

    state parse_http {
        // Require at least one payload byte. Widen to 16 bits BEFORE adding:
        // ihl + dataOffset evaluated in bit<4> wraps once the sum exceeds 15.
        verify(hdr.ipv4.totalLen > ((bit<16>)hdr.ipv4.ihl + (bit<16>)hdr.tcp.dataOffset) * 4, error.noAppLayerData);
        transition select(packet.lookahead<bit<32>>()) {
            TYPE_HTTP_REQ_GET: parse_a;
            default: accept;
        }
    }

    state parse_a {
        meta.tmp = packet.lookahead<bit<128>>();
        meta.tmp = (meta.tmp << 32);   // drop "GET "; 12 payload bytes remain
        // 12 bytes contain 10 three-byte windows; bound the scan so it
        // terminates when neither pattern is present.
        meta.len = 10;
        transition parse_b;
    }

    state parse_b {
        bit<24> choice = meta.tmp[127:104];
        meta.tmp = (meta.tmp << 8);    // slide the window by one byte
        meta.len = meta.len - 1;
        transition select(choice, meta.len) {
            (TYPE_A, _): parse_c;
            (TYPE_JPG, _): parse_d;
            (_, 0): accept;            // nothing found: keep meta.flag = 0
            default: parse_b;
        }
    }

    state parse_c {
        meta.flag = 0;
        transition accept;
    }

    state parse_d {
        meta.flag = 1;
        transition accept;
    }

    @name(".start") state start {
        meta.flag = 0;
        meta.meta.if_index = (bit<8>)standard_metadata.ingress_port;
        standard_metadata.mcast_grp = 0;
        transition parse_ethernet;
    }
}
// Egress: rewrites the destination of handshake packets coming from
// 10.0.1.1 (0x0a000101) according to egress_rid, then forces every packet
// with meta.flag == 1 towards 10.0.3.3.
// cmd.txt creates multicast node rid 0 on port 2 and rid 1 on port 3.
// The mask tests on tcp.flags use 2 (SYN), 8 (PSH) and 16 (ACK).
control egress(inout headers hdr, inout metadata meta, inout standard_metadata_t standard_metadata) {
    // Overwrite the destination MAC and the destination IPv4 address.
    action myforward(bit<32> dst_ip, bit<48> dmac) {
        hdr.ethernet.dstAddr = dmac;
        hdr.ipv4.dstAddr = dst_ip;
    }

    apply {
        // Each comment sits on its own line: a leading `//` on a collapsed
        // line would comment out every statement that follows it.

        // for syn pkt(3-way): 1->2
        if (standard_metadata.egress_rid == 0 && hdr.ipv4.srcAddr == 0x0a000101 && (hdr.tcp.flags & 2) != 0) {
            myforward(0x0a000202, 0x000000000202);
        }
        // for syn pkt(3-way): 1->3
        if (standard_metadata.egress_rid == 1 && hdr.ipv4.srcAddr == 0x0a000101 && (hdr.tcp.flags & 2) != 0) {
            myforward(0x0a000303, 0x000000000303);
        }
        // for ack pkt(3-way): 1->2
        if (standard_metadata.egress_rid == 0 && hdr.ipv4.srcAddr == 0x0a000101 && (hdr.tcp.flags & 16) != 0 && (hdr.tcp.flags & 8) == 0 && (hdr.tcp.flags & 2) == 0) {
            myforward(0x0a000202, 0x000000000202);
        }
        // for ack pkt(3-way): 1->3
        if (standard_metadata.egress_rid == 1 && hdr.ipv4.srcAddr == 0x0a000101 && (hdr.tcp.flags & 16) != 0 && (hdr.tcp.flags & 8) == 0 && (hdr.tcp.flags & 2) == 0) {
            // seqNo(0) is written by ingress do_drop() from the 3->1 packet;
            // acknowledge that sequence number + 1 on this copy.
            seqNo.read(meta.seqNo, (bit<32>)0);
            hdr.tcp.ackNo = (meta.seqNo + 1);
            myforward(0x0a000303, 0x000000000303);
        }

        // Must stay last: it overrides any rewrite made above.
        if (meta.flag == 1) {
            myforward(0x0a000303, 0x000000000303);
        }
    }
}
// Ingress: tracks the handshake state in registers, multicasts the client's
// SYN and handshake ACK (multicast group 1), drops the SYN+ACK from
// 10.0.3.3, translates sequence/ack numbers while `flow` is 1, and finally
// applies the `forward` table.
// Addresses: 0x0a000101 = 10.0.1.1, 0x0a000202 = 10.0.2.2, 0x0a000303 = 10.0.3.3.
// tcp.flags masks used: 1 (FIN), 2 (SYN), 8 (PSH), 16 (ACK).
// The statement order in apply{} is significant; it is kept as in the original.
control ingress(inout headers hdr, inout metadata meta, inout standard_metadata_t standard_metadata) {
    // Remember the packet's sequence number in seqNo(1) and multicast it.
    action do_copy() {
        seqNo.write((bit<32>)1, hdr.tcp.seqNo); // 1->2
        standard_metadata.mcast_grp = 1;
    }

    // Remember the packet's sequence number in seqNo(0) and drop the packet.
    action do_drop() {
        seqNo.write((bit<32>)0, hdr.tcp.seqNo); // 3->1
        // set the wrong address, the pkt will be drop definitely
        // (forward.apply() still runs afterwards and misses on this address)
        hdr.ipv4.dstAddr = 0x01010101;
        mark_to_drop(standard_metadata);
    }

    // Multicast without touching any register.
    action do_copy2() {
        standard_metadata.mcast_grp = 1;
    }

    action _drop() {
        // set the wrong address, the pkt will be drop definitely
        hdr.ipv4.dstAddr = 0x01010101;
        mark_to_drop(standard_metadata);
    }

    // Send to `port` with a new destination MAC/IP.
    action _forward(bit<32> dst_ip, bit<48> dmac, bit<9> port) {
        standard_metadata.egress_spec = port;
        hdr.ethernet.dstAddr = dmac;
        hdr.ipv4.dstAddr = dst_ip;
        hdr.ipv4.ttl = hdr.ipv4.ttl + 8w255; // +255 mod 256 == TTL - 1
    }

    // Like _forward, but also rewrites the source IP.
    action sendback(bit<32> dst_ip, bit<32> src_ip, bit<48> dmac, bit<9> port) {
        standard_metadata.egress_spec = port;
        hdr.ethernet.dstAddr = dmac;
        hdr.ipv4.dstAddr = dst_ip;
        hdr.ipv4.srcAddr = src_ip;
        hdr.ipv4.ttl = hdr.ipv4.ttl + 8w255; // +255 mod 256 == TTL - 1
    }

    table forward {
        actions = {
            sendback;
            _forward;
            _drop;
        }
        key = {
            hdr.ipv4.dstAddr: exact;
            meta.flag: exact;
        }
        size = 1024;
        default_action = _drop();
    }

    apply {
        // 1 -> 3 (get jpg)
        if (meta.flag == 1) {
            flow.write((bit<32>)0, 1);
        }

        // close connection, FIN/ACK
        if ((hdr.tcp.flags & 1) != 0 && (hdr.tcp.flags & 16) != 0 && meta.meta.if_index == 1) {
            seqNo4.write((bit<32>)0, (bit<32>)hdr.tcp.seqNo);
        }

        // three_way_handshake, Clone SYNC
        if ((hdr.tcp.flags & 2) != 0 && (hdr.tcp.flags & 16) == 0 && meta.meta.if_index == 1 && meta.flag == 0) {
            // A new connection starts: reset the per-connection state.
            seqNo2.write((bit<32>)0, (bit<32>)0);
            seqNo3.write((bit<32>)0, (bit<32>)0);
            flow.write((bit<32>)0, 0);
            do_copy();
        }

        // three_way_handshake, virtual TCP connection: Drop pkt with SYNC+ACK
        if (hdr.ipv4.srcAddr == 0x0a000303 && (hdr.tcp.flags & 2) != 0 && (hdr.tcp.flags & 16) != 0) {
            seqNo3.write((bit<32>)0, (bit<32>)hdr.tcp.seqNo);
            do_drop();
        }

        // SYN+ACK from 10.0.2.2: remember its initial sequence number.
        if (hdr.ipv4.srcAddr == 0x0a000202 && (hdr.tcp.flags & 2) != 0 && (hdr.tcp.flags & 16) != 0) {
            seqNo2.write((bit<32>)0, (bit<32>)hdr.tcp.seqNo);
        }

        // three_way_handshake, Clone only ACK without PUSH bit set
        seqNo.read(meta.seqNo, (bit<32>)1);
        if ((hdr.tcp.flags & 16) != 0 && (hdr.tcp.flags & 8) == 0 && (hdr.tcp.seqNo == (meta.seqNo + 1)) && meta.meta.if_index == 1 && meta.flag == 0) {
            do_copy2();
        }

        // 3->1 (return jpg)
        flow.read(meta.flag2, (bit<32>)0);
        // xxxxx if jpeg file is too big, client will send tcp ack to server (need to handle it)
        // 1->3
        if (hdr.ipv4.srcAddr == 0x0a000101 && meta.flag2 == 1) {
            meta.flag = 1;
            bit<32> tmp1;
            bit<32> tmp2;
            seqNo2.read(tmp1, (bit<32>)0);
            seqNo3.read(tmp2, (bit<32>)0);
            // Shift the ack number from the 2->1 sequence space to the 3->1 one.
            hdr.tcp.ackNo = hdr.tcp.ackNo - tmp1 + tmp2;
        }

        // for jpg pkt transfer, modify tcp.seqNo in pkts from 3->1
        if (hdr.ipv4.srcAddr == 0x0a000303 && meta.flag2 == 1) {
            bit<32> tmp1;
            bit<32> tmp2;
            seqNo2.read(tmp1, (bit<32>)0);
            seqNo3.read(tmp2, (bit<32>)0);
            // Inverse shift: 3->1 sequence space back to the 2->1 one.
            hdr.tcp.seqNo = hdr.tcp.seqNo - tmp2 + tmp1;
        }

        // ACK (no SYN) arriving on port 3 whose ackNo equals the stored
        // FIN/ACK sequence number: switch `flow` back to 0.
        if ((hdr.tcp.flags & 16) != 0 && (hdr.tcp.flags & 2) == 0 && meta.meta.if_index == 3) {
            bit<32> tmp1;
            seqNo4.read(tmp1, (bit<32>)0);
            if (tmp1 == hdr.tcp.ackNo)
                flow.write((bit<32>)0, 0);
        }

        forward.apply();
    }
}
// Re-serialises the headers in wire order: Ethernet, IPv4, TCP, TCP options.
control DeparserImpl(packet_out packet, in headers hdr) {
    apply {
        packet.emit(hdr.ethernet);
        packet.emit(hdr.ipv4);
        packet.emit(hdr.tcp);
        packet.emit(hdr.tcp_options);
    }
}
// Incoming checksums are not verified; the body is intentionally empty.
control verifyChecksum(inout headers hdr, inout metadata meta) {
    apply {
    }
}
// Recomputes the IPv4 header checksum and the TCP checksum (pseudo-header +
// TCP header [+ options] + payload) after the pipeline rewrote addresses,
// sequence numbers or ack numbers.
control computeChecksum(inout headers hdr, inout metadata meta) {
    apply {
        // Only when an IPv4 header was parsed (the condition used to be
        // `true`, which also ran for packets without a valid IPv4 header).
        update_checksum(
            hdr.ipv4.isValid(),
            {
                hdr.ipv4.version,
                hdr.ipv4.ihl,
                hdr.ipv4.diffserv,
                hdr.ipv4.totalLen,
                hdr.ipv4.identification,
                hdr.ipv4.flags,
                hdr.ipv4.fragOffset,
                hdr.ipv4.ttl,
                hdr.ipv4.protocol,
                hdr.ipv4.srcAddr,
                hdr.ipv4.dstAddr
            },
            hdr.ipv4.hdrChecksum,
            HashAlgorithm.csum16);

        // TCP without options.
        update_checksum_with_payload(
            hdr.ipv4.isValid() && hdr.tcp.isValid() && !hdr.tcp_options.isValid(),
            {
                hdr.ipv4.srcAddr,
                hdr.ipv4.dstAddr,
                8w0,
                hdr.ipv4.protocol,
                meta.meta.tcpLength,
                hdr.tcp.srcPort,
                hdr.tcp.dstPort,
                hdr.tcp.seqNo,
                hdr.tcp.ackNo,
                hdr.tcp.dataOffset,
                hdr.tcp.res,
                hdr.tcp.flags,
                hdr.tcp.window,
                hdr.tcp.urgentPtr
            },
            hdr.tcp.checksum,
            HashAlgorithm.csum16);

        // TCP with options: same field list plus the raw options blob.
        update_checksum_with_payload(
            hdr.ipv4.isValid() && hdr.tcp.isValid() && hdr.tcp_options.isValid(),
            {
                hdr.ipv4.srcAddr,
                hdr.ipv4.dstAddr,
                8w0,
                hdr.ipv4.protocol,
                meta.meta.tcpLength,
                hdr.tcp.srcPort,
                hdr.tcp.dstPort,
                hdr.tcp.seqNo,
                hdr.tcp.ackNo,
                hdr.tcp.dataOffset,
                hdr.tcp.res,
                hdr.tcp.flags,
                hdr.tcp.window,
                hdr.tcp.urgentPtr,
                hdr.tcp_options.options
            },
            hdr.tcp.checksum,
            HashAlgorithm.csum16);
    }
}
// Instantiate the v1model package: parser, checksum verification, ingress,
// egress, checksum computation, deparser.
V1Switch(ParserImpl(), verifyChecksum(), ingress(), egress(), computeChecksum(), DeparserImpl()) main;
|
[test_topo.py]
import os from mininet.net import Containernet from mininet.topo import Topo from mininet.log import setLogLevel, info from mininet.cli import CLI from mininet.node import RemoteController from mininet.node import Docker from p4_mininet import P4Switch, P4Host
import argparse from time import sleep
# Command-line options of the topology script. Parsing happens at import
# time, so `args` is a module-level value read by main().
parser = argparse.ArgumentParser(description='Mininet demo')
parser.add_argument(
    '--behavioral-exe',
    type=str,
    action="store",
    required=False,
    default='simple_switch',
    help='Path to behavioral executable',
)
parser.add_argument(
    '--thrift-port',
    type=int,
    action="store",
    default=9090,
    help='Thrift server port for table updates',
)
parser.add_argument(
    '--num-hosts',
    type=int,
    action="store",
    default=2,
    help='Number of hosts to connect to switch',
)
parser.add_argument(
    '--mode',
    type=str,
    choices=['l2', 'l3'],
    default='l3',
)
parser.add_argument(
    '--json',
    type=str,
    action="store",
    required=True,
    help='Path to JSON config file',
)
parser.add_argument(
    '--pcap-dump',
    type=str,
    action="store",
    required=False,
    default=False,
    help='Dump packets on interfaces to pcap files',
)

args = parser.parse_args()
def main():
    """Build the one-switch, three-host topology, start it and open the CLI.

    h1 is a plain host on switch port 1; h2 and h3 are Docker containers
    (image "apache-php-mysql:v4") on switch ports 2 and 3. Each host gets a
    static ARP entry for its gateway address and a default route through it,
    and the two containers start php-fpm, MySQL and Apache. On CLI exit the
    network is stopped; if that fails, `sudo mn -c` is run as a fallback.
    """
    net = Containernet(host = P4Host, controller = None)
    # Use the JSON file given on the command line; it was previously
    # hard-coded to "basic.json", silently ignoring the required --json.
    switch1 = net.addSwitch('s1',
                            sw_path = args.behavioral_exe,
                            json_path = args.json,
                            thrift_port = args.thrift_port,
                            cls = P4Switch,
                            pcap_dump = args.pcap_dump)

    host1 = net.addHost('h1', mac = '00:00:00:00:01:01', ip="10.0.1.1/24")
    #host2 = net.addHost('h2', mac = '00:00:00:00:02:02', ip="10.0.2.2/24")
    #host3 = net.addHost('h3', mac = '00:00:00:00:03:03', ip="10.0.3.3/24")
    host2 = net.addDocker('h2', mac = '00:00:00:00:02:02', ip= '10.0.2.2/24', dimage="apache-php-mysql:v4")
    host3 = net.addDocker('h3', mac = '00:00:00:00:03:03', ip= '10.0.3.3/24', dimage="apache-php-mysql:v4")

    # Switch ports 1, 2 and 3 face h1, h2 and h3 respectively.
    net.addLink(host1, switch1, port1 = 0, port2 = 1)
    net.addLink(host2, switch1, port1 = 0, port2 = 2)
    net.addLink(host3, switch1, port1 = 0, port2 = 3)

    net.start()
    h1,h2,h3=net.get('h1','h2','h3')

    h1.cmd("arp -s 10.0.1.254 00:00:00:01:01:01")
    h1.cmd("ip route add default via 10.0.1.254")
    #h1.cmd("ethtool -K eth0 tx off rx off")

    h2.cmd("arp -s 10.0.2.254 00:00:00:02:02:02")
    h2.cmd("ip route del default")
    h2.cmd("ip route add default via 10.0.2.254")
    #h2.cmd("ethtool -K eth0 tx off rx off")
    #h2.cmd("python -m SimpleHTTPServer 80 &")
    h2.cmd("/etc/init.d/php7.2-fpm start")
    h2.cmd("mysqld_safe --skip-grant-tables &")
    h2.cmd("/etc/init.d/apache2 start")

    h3.cmd("arp -s 10.0.3.254 00:00:00:03:03:03")
    h3.cmd("ip route del default")
    h3.cmd("ip route add default via 10.0.3.254")
    #h3.cmd("ethtool -K eth0 tx off rx off")
    #h3.cmd("python -m SimpleHTTPServer 80 &")
    h3.cmd("/etc/init.d/php7.2-fpm start")
    h3.cmd("mysqld_safe --skip-grant-tables &")
    h3.cmd("/etc/init.d/apache2 start")

    sleep(1)

    # Single-argument print() calls emit the same bytes as the original
    # Python 2 `print(x), print y` pairs and also parse on Python 3.
    print('\033[0;32m Gotcha!')
    print('\033[0m')

    CLI(net)
    try:
        net.stop()
    except:
        # Deliberately broad: whatever went wrong while stopping, fall back
        # to Mininet's global cleanup.
        # NOTE(review): the collapsed source does not show whether the
        # "Stop successfully!" lines belong inside this handler -- confirm.
        print('\033[0;31m Stop error! Trying sudo mn -c')
        print('\033[0m')
        os.system('sudo mn -c')
        print('\033[0;32m Stop successfully!')
        print('\033[0m')
# Script entry point: enable Mininet's info-level logging, then build and
# run the topology.
if __name__ == '__main__':
    setLogLevel('info')
    main()
[cmd.txt]
table_add forward sendback 10.0.1.1 0 => 10.0.1.1 10.0.0.1 00:00:00:00:01:01 1
table_add forward sendback 10.0.1.1 1 => 10.0.1.1 10.0.0.1 00:00:00:00:01:01 1
table_add forward _forward 10.0.0.1 0 => 10.0.2.2 00:00:00:00:02:02 2
table_add forward _forward 10.0.0.1 1 => 10.0.3.3 00:00:00:00:03:03 3
mc_mgrp_create 1
mc_node_create 0 2
mc_node_create 1 3
mc_node_associate 1 0
mc_node_associate 1 1
[cmd_add.py]
import os
# Feed the table and multicast commands in cmd.txt to the running switch
# through simple_switch_CLI on Thrift port 9090.
os.system('sudo /home/vagrant/behavioral-model/targets/simple_switch/simple_switch_CLI --thrift-port=9090 < cmd.txt')
[start_test_topo.py]
import os
# Launch the topology script as root with the BMv2 simple_switch binary and
# the compiled P4 program basic.json.
os.system("sudo python test_topo.py --behavioral-exe /home/vagrant/behavioral-model/targets/simple_switch/simple_switch --json basic.json")
[p4_mininet.py]
# Copyright 2013-present Barefoot Networks, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #
from mininet.net import Mininet from mininet.node import Switch, Host from mininet.log import setLogLevel, info, error, debug from mininet.moduledeps import pathCheck from sys import exit import os import tempfile import socket
class P4Host(Host):
    """Mininet host whose default interface is renamed to eth0, with
    rx/tx/sg offloading and IPv6 switched off."""

    def config(self, **params):
        """Apply the parent configuration, then adjust the interface.

        Returns whatever the parent ``config`` call returned.
        """
        r = super(Host, self).config(**params)

        self.defaultIntf().rename("eth0")

        # Turn off rx/tx/scatter-gather offloading on eth0.
        for off in ["rx", "tx", "sg"]:
            cmd = "/sbin/ethtool --offload eth0 %s off" % off
            self.cmd(cmd)

        # disable IPv6
        self.cmd("sysctl -w net.ipv6.conf.all.disable_ipv6=1")
        self.cmd("sysctl -w net.ipv6.conf.default.disable_ipv6=1")
        self.cmd("sysctl -w net.ipv6.conf.lo.disable_ipv6=1")

        return r

    def describe(self):
        """Print the host name and its default interface's name, IP and MAC."""
        # Single-argument print() produces the same output on Python 2 as
        # the original print statements and also parses on Python 3.
        print("**********")
        print(self.name)
        print("default interface: %s\t%s\t%s" % (
            self.defaultIntf().name,
            self.defaultIntf().IP(),
            self.defaultIntf().MAC()
        ))
        print("**********")
class P4Switch(Switch):
    """P4 virtual switch"""
    # Next device id to hand out when the caller does not supply one.
    device_id = 0

    def __init__(self, name, sw_path = None, json_path = None,
                 thrift_port = None,
                 pcap_dump = False,
                 log_console = True,
                 verbose = False,
                 device_id = None,
                 enable_debugger = False,
                 **kwargs):
        """Record the switch settings; the switch process is launched in start().

        sw_path and json_path are mandatory. The process exits with status 1
        when json_path is not an existing file.
        """
        Switch.__init__(self, name, **kwargs)
        assert(sw_path)
        assert(json_path)
        # make sure that the provided sw_path is valid
        pathCheck(sw_path)
        # make sure that the provided JSON file exists
        if not os.path.isfile(json_path):
            error("Invalid JSON file.\n")
            exit(1)
        self.sw_path = sw_path
        self.json_path = json_path
        self.verbose = verbose
        logfile = "/tmp/p4s.{}.log".format(self.name)
        self.output = open(logfile, 'w')
        self.thrift_port = thrift_port
        self.pcap_dump = pcap_dump
        self.enable_debugger = enable_debugger
        self.log_console = log_console
        if device_id is not None:
            self.device_id = device_id
            P4Switch.device_id = max(P4Switch.device_id, device_id)
        else:
            self.device_id = P4Switch.device_id
            P4Switch.device_id += 1
        self.nanomsg = "ipc:///tmp/bm-{}-log.ipc".format(self.device_id)

    @classmethod
    def setup(cls):
        pass

    def check_switch_started(self, pid):
        """While the process is running (pid exists), we check if the Thrift
        server has been started. If the Thrift server is ready, we assume that
        the switch was started successfully. This is only reliable if the Thrift
        server is started at the end of the init process"""
        while True:
            if not os.path.exists(os.path.join("/proc", str(pid))):
                return False
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                sock.settimeout(0.5)
                result = sock.connect_ex(("localhost", self.thrift_port))
            finally:
                # Close every probe socket; the loop used to leak one
                # descriptor per iteration.
                sock.close()
            if result == 0:
                return True

    def start(self, controllers):
        "Start up a new P4 switch"
        info("Starting P4 switch {}.\n".format(self.name))
        args = [self.sw_path]
        # Every interface without an IP address becomes a data port.
        for port, intf in self.intfs.items():
            if not intf.IP():
                args.extend(['-i', str(port) + "@" + intf.name])

        #wuwzhs edit in 2017/11/10
        #args.extend(['-i 3@veth1'])

        if self.pcap_dump:
            args.append("--pcap")
            # args.append("--useFiles")
        if self.thrift_port:
            args.extend(['--thrift-port', str(self.thrift_port)])
        if self.nanomsg:
            args.extend(['--nanolog', self.nanomsg])
        args.extend(['--device-id', str(self.device_id)])
        P4Switch.device_id += 1
        args.append(self.json_path)
        if self.enable_debugger:
            args.append("--debugger")
        if self.log_console:
            args.append("--log-console")
        logfile = "/tmp/p4s.{}.log".format(self.name)
        info(' '.join(args) + "\n")

        pid = None
        # The shell appends the background process id to a temporary file,
        # which is then read back to learn the switch's PID.
        with tempfile.NamedTemporaryFile() as f:
            # self.cmd(' '.join(args) + ' > /dev/null 2>&1 &')
            self.cmd(' '.join(args) + ' >' + logfile + ' 2>&1 & echo $! >> ' + f.name)
            pid = int(f.read())
        debug("P4 switch {} PID is {}.\n".format(self.name, pid))
        if not self.check_switch_started(pid):
            error("P4 switch {} did not start correctly.\n".format(self.name))
            exit(1)
        info("P4 switch {} has been started.\n".format(self.name))

    def stop(self):
        "Terminate P4 switch."
        self.output.flush()
        self.cmd('kill %' + self.sw_path)
        self.cmd('wait')
        self.deleteIntfs()

    def attach(self, intf):
        "Connect a data port"
        assert(0)

    def detach(self, intf):
        "Disconnect a data port"
        assert(0)
[execution]
Open one terminal for h1 and three terminals for s1 (we use Wireshark to monitor the traffic between h1 and s1, between s1 and h2, and between s1 and h3).
H1 sends a request for hi.php. Note: s1-eth1 is the h1-s1 link, s1-eth2 is the s1-h2 (Web1) link, and s1-eth3 is the s1-h3 (Web2) link.
We can also see that there is another connection between s1 and h3.
Get a.jpg (the P4 LB forwards the request to Web2).
We can also see that there is another TCP connection between P4 LB and web1.
[Future work]
This version does not support multiple connections at the same time: a new request has to wait until the previous request is done. We therefore need to extend this work to support multiple simultaneous connections, and to port the code to the NetFPGA SUME.
Dr. Chih-Heng
Ke (smallko@gmail.com)
Department of
Computer Science and Information Engineering,
National Quemoy University, Kinmen, Taiwan.