Merge pull request #699 from Xilinx/feature/fmpadding_dynamic_integration

RTL FMPadding and dynamic padding support

Merge pull request #699 from Xilinx/feature/fmpadding_dynamic_integration
RTL FMPadding and dynamic padding support
e8fd28c6 · auphelia · GitHub · 56fecea7 · 5d65dd46 · e8fd28c6
Unverified Commit e8fd28c6 authored 2 years ago by auphelia Committed by GitHub 2 years ago
--- a/finn-rtllib/fmpadding/hdl/axi2we.sv
+++ b/finn-rtllib/fmpadding/hdl/axi2we.sv
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	AXI-Lite adapter for trivial write enable interface.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module axi2we #(
+	int unsigned  ADDR_BITS
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	//- AXI Lite ------------------------
+	// Write address channel
+	input	                 s_axilite_AWVALID,
+	output	                 s_axilite_AWREADY,
+	input	[ADDR_BITS-1:0]  s_axilite_AWADDR,
+
+	// Write data channel (WSTRB is accepted but never evaluated)
+	input	        s_axilite_WVALID,
+	output	        s_axilite_WREADY,
+	input	[31:0]  s_axilite_WDATA,
+	input	[ 3:0]  s_axilite_WSTRB,
+
+	// Write response channel
+	output	       s_axilite_BVALID,
+	input	       s_axilite_BREADY,
+	output	[1:0]  s_axilite_BRESP,
+
+	// Read address channel (ARADDR is accepted but never evaluated)
+	input	       s_axilite_ARVALID,
+	output	       s_axilite_ARREADY,
+	input	[ADDR_BITS-1:0]  s_axilite_ARADDR,
+
+	// Read data channel, every read is answered with all-ones
+	output	        s_axilite_RVALID,
+	input	        s_axilite_RREADY,
+	output	[31:0]  s_axilite_RDATA,
+	output	[ 1:0]  s_axilite_RRESP,
+
+	//- Write Enable Interface ----------
+	output	logic                  we,
+	output	logic [ADDR_BITS-1:0]  wa,
+	output	logic [         31:0]  wd
+);
+
+	// Internal clock and active-high synchronous reset
+	uwire  clk = ap_clk;
+	uwire  rst = !ap_rst_n;
+
+	//-----------------------------------------------------------------------
+	// Write path: one holding register per channel
+	logic                  AddrVld = 0;
+	logic [ADDR_BITS-1:0]  AddrReg = 'x;
+	logic                  DataVld = 0;
+	logic [         31:0]  DataReg = 'x;
+
+	// A write is committed in the cycle in which address and data are both
+	// held and the master accepts the response.
+	uwire  both_held = AddrVld && DataVld;
+	uwire  commit    = both_held && s_axilite_BREADY;
+	uwire  flush     = rst || commit;
+
+	// Address holding register
+	always_ff @(posedge clk) begin
+		if(flush) begin
+			AddrVld <= 0;
+			AddrReg <= 'x;
+		end
+		else if(!AddrVld) begin
+			AddrVld <= s_axilite_AWVALID;
+			AddrReg <= s_axilite_AWADDR;
+		end
+	end
+
+	// Data holding register
+	always_ff @(posedge clk) begin
+		if(flush) begin
+			DataVld <= 0;
+			DataReg <= 'x;
+		end
+		else if(!DataVld) begin
+			DataVld <= s_axilite_WVALID;
+			DataReg <= s_axilite_WDATA;
+		end
+	end
+
+	// Each channel is ready exactly while its holding register is empty
+	assign	s_axilite_AWREADY = !AddrVld;
+	assign	s_axilite_WREADY  = !DataVld;
+	assign	s_axilite_BVALID  = both_held;
+	assign	s_axilite_BRESP   = '0; // OK
+
+	assign	we = commit;
+	assign	wa = AddrReg;
+	assign	wd = DataReg;
+
+	//-----------------------------------------------------------------------
+	// Read path: a single pending flag, the data is constant '1
+	logic  RdPending = 0;
+	uwire  rd_done = RdPending && s_axilite_RREADY;
+	always_ff @(posedge clk) begin
+		if(rst || rd_done)   RdPending <= 0;
+		else if(!RdPending)  RdPending <= s_axilite_ARVALID;
+	end
+	assign	s_axilite_ARREADY = !RdPending;
+	assign	s_axilite_RVALID  = RdPending;
+	assign	s_axilite_RDATA   = '1;
+	assign	s_axilite_RRESP   = '0; // OK
+
+endmodule : axi2we
--- a/finn-rtllib/fmpadding/hdl/fmpadding.sv
+++ b/finn-rtllib/fmpadding/hdl/fmpadding.sv
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Feature map padding.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+/*
+ * Feature map padding core.
+ *
+ * Passes the input stream through to the output and substitutes all-zero
+ * words for every position whose X or Y count lies outside the half-open
+ * forwarding windows [XOn, XOff) and [YOn, YOff). Three nested counters
+ * track the position: SCount (SF = NUM_CHANNELS/SIMD transfers per pixel),
+ * XCount (0..XEnd) and YCount (0..YEnd).
+ *
+ * The six window registers start at the INIT_* parameter values and can be
+ * rewritten at runtime through the we/wa/wd port using the offsets
+ * 0, 4, 8, 12, 16, 20 for XOn, XOff, XEnd, YOn, YOff, YEnd. They are not
+ * touched by reset.
+ *
+ * STREAM_BITS is SIMD*ELEM_BITS rounded up to a whole number of bytes.
+ */
+module fmpadding #(
+	int unsigned  XCOUNTER_BITS,  // width of XCount and of XOn/XOff/XEnd
+	int unsigned  YCOUNTER_BITS,  // width of YCount and of YOn/YOff/YEnd
+	int unsigned  NUM_CHANNELS,  // must be a positive multiple of SIMD
+	int unsigned  SIMD,  // elements per stream word (see STREAM_BITS)
+	int unsigned  ELEM_BITS,  // bits per element (see STREAM_BITS)
+	int unsigned  INIT_XON,  // initial value of XOn
+	int unsigned  INIT_XOFF,  // initial value of XOff
+	int unsigned  INIT_XEND,  // initial value of XEnd
+	int unsigned  INIT_YON,  // initial value of YOn
+	int unsigned  INIT_YOFF,  // initial value of YOff
+	int unsigned  INIT_YEND,  // initial value of YEnd
+
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)  // SIMD*ELEM_BITS rounded up to a multiple of 8
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,  // active-low, sampled synchronously
+
+	// Parameter Configuration ----------
+	input	logic         we,  // write strobe for the window registers
+	input	logic [ 4:0]  wa,  // register select: 0, 4, 8, 12, 16 or 20
+	input	logic [31:0]  wd,  // new value, truncated to the counter width on assignment
+
+	//- AXI Stream - Input --------------
+	output	logic  s_axis_tready,
+	input	logic  s_axis_tvalid,
+	input	logic [STREAM_BITS-1:0]  s_axis_tdata,
+
+	//- AXI Stream - Output -------------
+	input	logic  m_axis_tready,
+	output	logic  m_axis_tvalid,
+	output	logic [STREAM_BITS-1:0]  m_axis_tdata
+);
+
+	uwire  clk = ap_clk;
+	uwire  rst = !ap_rst_n;  // internal active-high reset
+
+	//-----------------------------------------------------------------------
+	/* Parameter Sanity Checking
+	 * Every INIT_* value must be representable in its counter width; a
+	 * violation is reported with $error and ends the simulation. An initial
+	 * window that forwards nothing is only reported as a warning.
+	 */
+	initial begin
+		automatic bit  fail = 0;
+
+		if(XCOUNTER_BITS < $clog2(1+INIT_XEND)) begin
+			$error("XCounter size too small to accommodate end count.");
+			fail = 1;
+		end
+		if(XCOUNTER_BITS < $clog2(1+INIT_XON)) begin
+			$error("XCounter size too small to accommodate ON count.");
+			fail = 1;
+		end
+		if(XCOUNTER_BITS < $clog2(1+INIT_XOFF)) begin
+			$error("XCounter size too small to accommodate OFF count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YEND)) begin
+			$error("YCounter size too small to accommodate end count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YON)) begin
+			$error("YCounter size too small to accommodate ON count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YOFF)) begin
+			$error("YCounter size too small to accommodate OFF count.");
+			fail = 1;
+		end
+
+		if((INIT_XEND < INIT_XON) || (INIT_XOFF <= INIT_XON)) begin
+			$warning("Initial empty X output range.");
+		end
+		if((INIT_YEND < INIT_YON) || (INIT_YOFF <= INIT_YON)) begin
+			$warning("Initial empty Y output range.");
+		end
+
+		if(fail)  $finish();  // only the width violations above set fail
+	end
+
+	//-----------------------------------------------------------------------
+	/* Dynamically configurable state
+	 * XOn/XOff and YOn/YOff bound the forwarding window, XEnd/YEnd are the
+	 * last count values of the X and Y counters before they wrap to zero.
+	 */
+	typedef logic [XCOUNTER_BITS-1:0]  xcount_t;
+	xcount_t  XEnd = INIT_XEND;
+	xcount_t  XOn  = INIT_XON;
+	xcount_t  XOff = INIT_XOFF;
+
+	typedef logic [YCOUNTER_BITS-1:0]  ycount_t;
+	ycount_t  YEnd = INIT_YEND;
+	ycount_t  YOn  = INIT_YON;
+	ycount_t  YOff = INIT_YOFF;
+
+	always_ff @(posedge clk) begin  // no reset branch: the configuration survives rst
+		if(we) begin
+			unique case(wa)
+			0*4:  XOn  <= wd;
+			1*4:  XOff <= wd;
+			2*4:  XEnd <= wd;
+			3*4:  YOn  <= wd;
+			4*4:  YOff <= wd;
+			5*4:  YEnd <= wd;
+
+			default:  assert(0) else begin  // any other address is reported and stops the simulation
+				$error("Illegal write address.");
+				$stop;
+			end
+			endcase
+		end
+	end
+
+	//-----------------------------------------------------------------------
+	// Cascaded enables for the nested counters: SCount, XCount, YCount
+	uwire  sen;  // one output word (data or padding) is produced this cycle
+	uwire  xen;  // sen on the last SIMD fold of a pixel
+	uwire  yen;  // xen on the last column (XCount == XEnd)
+
+	//- S-Counter: SIMD fold ------------
+	initial begin
+		if((NUM_CHANNELS < 1) || (NUM_CHANNELS % SIMD != 0)) begin
+			$error("Channel count must be SIMD multiple.");
+			$finish;
+		end
+	end
+	// Count SF-2, SF-3, ..., 1, 0, -1; only the final -1 has the top bit set
+	localparam int unsigned  SF = NUM_CHANNELS/SIMD;
+	typedef logic [$clog2(SF-1):0]  scount_t;
+	scount_t  SCount = SF-2;
+
+	assign	xen = sen && SCount[$left(SCount)];  // top bit of SCount marks the last fold
+	uwire  sclr = rst || xen;
+	always_ff @(posedge clk) begin
+		if(sclr)      SCount <= SF-2;
+		else if(sen)  SCount <= SCount - 1;
+	end
+
+	//- X-Counter: image width ----------
+	xcount_t  XCount = 0;
+
+	assign	yen = xen && (XCount == XEnd);
+	uwire  xclr = rst || yen;
+	always_ff @(posedge clk) begin
+		if(xclr)      XCount <= 0;
+		else if(xen)  XCount <= XCount + 1;
+	end
+	uwire  xfwd = (XOn <= XCount) && (XCount < XOff);  // column inside [XOn, XOff)
+
+	//- Y-Counter: image height ---------
+	ycount_t  YCount = 0;
+
+	uwire  yclr = rst || (yen && (YCount == YEnd));
+	always_ff @(posedge clk) begin
+		if(yclr)      YCount <= 0;
+		else if(yen)  YCount <= YCount + 1;
+	end
+	uwire  yfwd = (YOn <= YCount) && (YCount < YOff);  // row inside [YOn, YOff)
+
+	//-----------------------------------------------------------------------
+	/* Input forwarding and edge padding
+	 * B is the output register driving m_axis_*. A is a one-word holding
+	 * buffer: the input is ready exactly while A is empty, and an accepted
+	 * word stays in A for as long as B is stalled or the current position
+	 * is a padding position.
+	 */
+	typedef struct {
+		logic  vld;
+		logic [STREAM_BITS-1:0]  dat;
+	} buf_t;
+	buf_t  A = '{ vld: 0, dat: 'x };
+	buf_t  B = '{ vld: 0, dat: 'x };
+
+	uwire  fwd = xfwd && yfwd;  // current position takes input data rather than padding
+	assign	sen = (m_axis_tready || !B.vld) && (s_axis_tvalid || A.vld || !fwd);  // B can load and a word (input, held or padding) is available
+	assign	s_axis_tready = !A.vld;
+	assign	m_axis_tvalid =  B.vld;
+	assign	m_axis_tdata  =  B.dat;
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			B <= '{ vld: 0, dat: 'x };
+		end
+		else if(m_axis_tready || !B.vld) begin  // B is empty or being drained
+			B.vld <= s_axis_tvalid || A.vld || !fwd;
+			B.dat <= !fwd? '0 : A.vld? A.dat : s_axis_tdata;  // padding first, then the held word, then the live input
+		end
+	end
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			A <= '{ vld: 0, dat: 'x };
+		end
+		else begin
+			A.vld <= (A.vld || s_axis_tvalid) && ((B.vld && !m_axis_tready) || !fwd);  // keep or capture a word while it cannot move into B
+			if(!A.vld)  A.dat <= s_axis_tdata;  // data is only captured while A is empty
+		end
+	end
+
+endmodule : fmpadding
--- a/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Feature map padding with AXI-Lite configuration interface.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module fmpadding_axi #(
+	int unsigned  XCOUNTER_BITS,
+	int unsigned  YCOUNTER_BITS,
+	int unsigned  NUM_CHANNELS,
+	int unsigned  SIMD,
+	int unsigned  ELEM_BITS,
+	int unsigned  INIT_XON,
+	int unsigned  INIT_XOFF,
+	int unsigned  INIT_XEND,
+	int unsigned  INIT_YON,
+	int unsigned  INIT_YOFF,
+	int unsigned  INIT_YEND,
+
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	//- AXI Lite ------------------------
+	// Writing
+	input	       s_axilite_AWVALID,
+	output	       s_axilite_AWREADY,
+	input	[4:0]  s_axilite_AWADDR,
+
+	input	        s_axilite_WVALID,
+	output	        s_axilite_WREADY,
+	input	[31:0]  s_axilite_WDATA,
+	input	[ 3:0]  s_axilite_WSTRB,
+
+	output	       s_axilite_BVALID,
+	input	       s_axilite_BREADY,
+	output	[1:0]  s_axilite_BRESP,
+
+	// Reading
+	input	       s_axilite_ARVALID,
+	output	       s_axilite_ARREADY,
+	input	[4:0]  s_axilite_ARADDR,
+
+	output	        s_axilite_RVALID,
+	input	        s_axilite_RREADY,
+	output	[31:0]  s_axilite_RDATA,
+	output	[ 1:0]  s_axilite_RRESP,
+
+	//- AXI Stream - Input --------------
+	output	logic  s_axis_tready,
+	input	logic  s_axis_tvalid,
+	input	logic [STREAM_BITS-1:0]  s_axis_tdata,
+
+	//- AXI Stream - Output -------------
+	input	logic  m_axis_tready,
+	output	logic  m_axis_tvalid,
+	output	logic [STREAM_BITS-1:0]  m_axis_tdata
+);
+
+	// Width of the configuration address shared by the adapter and the core
+	localparam int unsigned  CFG_ADDR_BITS = 5;
+
+	// Write-enable bus produced by the adapter and consumed by the core
+	uwire                      we;
+	uwire [CFG_ADDR_BITS-1:0]  wa;
+	uwire [             31:0]  wd;
+
+	// AXI-Lite front end: turns AXI-Lite writes into we/wa/wd strobes
+	axi2we #(
+		.ADDR_BITS(CFG_ADDR_BITS)
+	) axilight_adapter (
+		.ap_clk  (ap_clk),
+		.ap_rst_n(ap_rst_n),
+
+		.s_axilite_AWVALID(s_axilite_AWVALID),
+		.s_axilite_AWREADY(s_axilite_AWREADY),
+		.s_axilite_AWADDR (s_axilite_AWADDR),
+
+		.s_axilite_WVALID(s_axilite_WVALID),
+		.s_axilite_WREADY(s_axilite_WREADY),
+		.s_axilite_WDATA (s_axilite_WDATA),
+		.s_axilite_WSTRB (s_axilite_WSTRB),
+
+		.s_axilite_BVALID(s_axilite_BVALID),
+		.s_axilite_BREADY(s_axilite_BREADY),
+		.s_axilite_BRESP (s_axilite_BRESP),
+
+		.s_axilite_ARVALID(s_axilite_ARVALID),
+		.s_axilite_ARREADY(s_axilite_ARREADY),
+		.s_axilite_ARADDR (s_axilite_ARADDR),
+
+		.s_axilite_RVALID(s_axilite_RVALID),
+		.s_axilite_RREADY(s_axilite_RREADY),
+		.s_axilite_RDATA (s_axilite_RDATA),
+		.s_axilite_RRESP (s_axilite_RRESP),
+
+		.we(we),
+		.wa(wa),
+		.wd(wd)
+	);
+
+	// Padding core: all parameters are handed through unchanged
+	fmpadding #(
+		.XCOUNTER_BITS(XCOUNTER_BITS),
+		.YCOUNTER_BITS(YCOUNTER_BITS),
+		.NUM_CHANNELS (NUM_CHANNELS),
+		.SIMD         (SIMD),
+		.ELEM_BITS    (ELEM_BITS),
+		.INIT_XON     (INIT_XON),
+		.INIT_XOFF    (INIT_XOFF),
+		.INIT_XEND    (INIT_XEND),
+		.INIT_YON     (INIT_YON),
+		.INIT_YOFF    (INIT_YOFF),
+		.INIT_YEND    (INIT_YEND)
+	) padding (
+		.ap_clk  (ap_clk),
+		.ap_rst_n(ap_rst_n),
+
+		.we(we),
+		.wa(wa),
+		.wd(wd),
+
+		.s_axis_tready(s_axis_tready),
+		.s_axis_tvalid(s_axis_tvalid),
+		.s_axis_tdata (s_axis_tdata),
+
+		.m_axis_tready(m_axis_tready),
+		.m_axis_tvalid(m_axis_tvalid),
+		.m_axis_tdata (m_axis_tdata)
+	);
+
+endmodule : fmpadding_axi
--- a/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv
+
+// Self-running testbench for fmpadding_axi.
+//
+// Sequence: program the six window registers over AXI-Lite, pulse the reset
+// for one cycle, feed IMAGES unpadded images with randomly inserted input
+// bubbles while the output side is randomly throttled, and print the padded
+// output row by row. The bench does no self-checking; the printed rows are
+// meant for visual inspection.
+module fmpadding_axi_tb #(
+	int unsigned  XCOUNTER_BITS = 8,
+	int unsigned  YCOUNTER_BITS = 8,
+	int unsigned  NUM_CHANNELS  = 4,
+	int unsigned  SIMD          = 2,
+	int unsigned  ELEM_BITS     = 4
+)();
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8);
+
+	//- Global Control ------------------
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	// Initialised at declaration so that no X->0 transition occurs at time 0;
+	// such a transition counts as a negedge and could release the output
+	// logger before the real reset pulse.
+	logic  rst = 0;
+
+	// AXI-Lite for Parameter Configuration
+	// The address is 5 bits wide to match the s_axilite_AWADDR port of the DUT.
+	logic	       s_axilite_AWVALID;
+	uwire	       s_axilite_AWREADY;
+	logic	[4:0]  s_axilite_AWADDR;
+
+	logic	        s_axilite_WVALID;
+	uwire	        s_axilite_WREADY;
+	logic	[31:0]  s_axilite_WDATA;
+
+	//- AXI Stream - Input --------------
+	uwire  s_axis_tready;
+	logic  s_axis_tvalid;
+	logic [STREAM_BITS-1:0]  s_axis_tdata;
+
+	//- AXI Stream - Output -------------
+	logic  m_axis_tready;
+	uwire  m_axis_tvalid;
+	uwire [STREAM_BITS-1:0]  m_axis_tdata;
+
+
+	// DUT
+	fmpadding_axi #(
+		.XCOUNTER_BITS(XCOUNTER_BITS),
+		.YCOUNTER_BITS(YCOUNTER_BITS),
+		.NUM_CHANNELS(NUM_CHANNELS),
+		.SIMD(SIMD),
+		.INIT_XON(0), .INIT_XOFF(0), .INIT_XEND(0),
+		.INIT_YON(0), .INIT_YOFF(0), .INIT_YEND(0),
+		.ELEM_BITS(ELEM_BITS)
+	) dut (
+		.ap_clk(clk), .ap_rst_n(!rst),
+
+		.s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+		.s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1),
+		.s_axilite_BVALID(), .s_axilite_BREADY('1),	.s_axilite_BRESP(),
+		.s_axilite_ARVALID('0), .s_axilite_ARREADY(), .s_axilite_ARADDR('x),
+		.s_axilite_RVALID(), .s_axilite_RREADY('0), .s_axilite_RDATA(), .s_axilite_RRESP(),
+
+		.s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+		.m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+	);
+
+	// Stimuli
+	localparam int unsigned  IMAGES = 2;
+	localparam int unsigned  XSIZE = 10;
+	localparam int unsigned  YSIZE =  7;
+	localparam int unsigned  PAD_LEFT   = 2;
+	localparam int unsigned  PAD_RIGHT  = 3;
+	localparam int unsigned  PAD_TOP    = 1;
+	localparam int unsigned  PAD_BOTTOM = 2;
+
+	// Issues one AXI-Lite write: the address phase first, the data phase after.
+	//   wa - byte address of the target register (0, 4, ..., 20)
+	//   wd - value to write
+	task axi_write(input logic [4:0]  wa, input logic [31:0]  wd);
+		s_axilite_AWVALID <= 1;
+		s_axilite_AWADDR <= wa;
+		@(posedge clk iff s_axilite_AWREADY);
+		s_axilite_AWVALID <= 0;
+		s_axilite_AWADDR <= 'x;
+
+		s_axilite_WVALID <= 1;
+		s_axilite_WDATA <= wd;
+		@(posedge clk iff s_axilite_WREADY);
+		s_axilite_WVALID <= 0;
+		s_axilite_WDATA <= 'x;
+	endtask : axi_write
+
+
+	initial begin
+		s_axilite_AWVALID = 0;
+		s_axilite_AWADDR = 'x;
+		s_axilite_WVALID = 0;
+		s_axilite_WDATA = 'x;
+
+		s_axis_tvalid =  0;
+		s_axis_tdata  = 'x;
+
+		// Configure Parameters
+		// The registers sit at consecutive 32-bit word offsets, i.e. byte
+		// addresses 0*4 .. 5*4, exactly as decoded inside fmpadding.
+		rst = 0;
+		@(posedge clk);
+		/* XOn  */	axi_write(0*4, PAD_LEFT);
+		/* XOff */	axi_write(1*4, XSIZE - PAD_RIGHT);
+		/* XEnd */	axi_write(2*4, XSIZE - 1);
+		/* YOn  */	axi_write(3*4, PAD_TOP);
+		/* YOff */	axi_write(4*4, YSIZE - PAD_BOTTOM);
+		/* YEnd */	axi_write(5*4, YSIZE - 1);
+		// One idle cycle lets the last write commit before the reset pulse;
+		// the window registers themselves are not cleared by the reset.
+		@(posedge clk);
+		rst <= 1;
+		@(posedge clk);
+		rst <= 0;
+		@(posedge clk);
+
+		// Feed data input: one word per unpadded position and SIMD fold,
+		// with a one-cycle bubble after roughly every fifth transfer.
+		s_axis_tvalid <= 1;
+		for(int unsigned  i = 0; i < IMAGES * (XSIZE-PAD_LEFT-PAD_RIGHT) * (YSIZE-PAD_TOP-PAD_BOTTOM) * (NUM_CHANNELS/SIMD); i++) begin
+			s_axis_tdata  <= i;
+			@(posedge clk iff s_axis_tready);
+			if($urandom()%5 == 0) begin
+				s_axis_tvalid <=  0;
+				s_axis_tdata  <= 'x;
+				@(posedge clk);
+				s_axis_tvalid <=  1;
+			end
+		end
+		s_axis_tvalid <=  0;
+		s_axis_tdata  <= 'x;
+	end
+
+	// Output Throttler: after each valid output cycle, deassert ready for a
+	// random number of cycles between 0 and 4.
+	initial begin
+		m_axis_tready =  0;
+		@(posedge clk iff !rst);
+		m_axis_tready <= 1;
+		forever @(posedge clk iff m_axis_tvalid) begin
+			m_axis_tready <= 0;
+			repeat(4-$clog2(1+$urandom()%15)) @(posedge clk);
+			m_axis_tready <= 1;
+		end
+	end
+
+	// Output logger: starts at the falling edge of the reset pulse and prints
+	// one text line per output row, pixels separated by spaces and the SIMD
+	// folds of a pixel separated by colons.
+	initial begin
+		@(negedge rst);
+		repeat(IMAGES) begin
+			for(int unsigned  y = 0; y < YSIZE; y++) begin
+				for(int unsigned  x = 0; x < XSIZE; x++) begin
+					automatic string  delim = " ";
+					for(int unsigned  s = 0; s < NUM_CHANNELS/SIMD; s++) begin
+						@(posedge clk iff m_axis_tvalid && m_axis_tready);
+						$write("%s%02X", delim, m_axis_tdata);
+						delim = ":";
+					end
+				end
+				$display();
+			end
+			$display("----");
+		end
+		$finish;
+	end
+
+endmodule : fmpadding_axi_tb
--- a/finn-rtllib/fmpadding/hdl/fmpadding_template.v
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/*
+ * Plain-Verilog wrapper template around fmpadding_axi.
+ *
+ * Identifiers enclosed in dollar signs (module name, stream width and the
+ * parameter values) are placeholders; presumably they are substituted by
+ * the code generator before the file is used (generator not visible here,
+ * to be confirmed).
+ *
+ * The wrapper only renames the stream ports: in0_V_* connects to the
+ * s_axis_* ports and out_V_* to the m_axis_* ports of the wrapped instance.
+ * Clock, reset and the AXI-Lite signals are passed through by name.
+ */
+module $TOP_MODULE_NAME$(
+//- Global Control ------------------
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_clk,
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_rst_n,
+
+//- AXI Lite ------------------------
+// Writing
+input	       s_axilite_AWVALID,
+output	       s_axilite_AWREADY,
+input	[4:0]  s_axilite_AWADDR,
+
+input	        s_axilite_WVALID,
+output	        s_axilite_WREADY,
+input	[31:0]  s_axilite_WDATA,
+input	[ 3:0]  s_axilite_WSTRB,
+
+output	       s_axilite_BVALID,
+input	       s_axilite_BREADY,
+output	[1:0]  s_axilite_BRESP,
+
+// Reading
+input	       s_axilite_ARVALID,
+output	       s_axilite_ARREADY,
+input	[4:0]  s_axilite_ARADDR,
+
+output	        s_axilite_RVALID,
+input	        s_axilite_RREADY,
+output	[31:0]  s_axilite_RDATA,
+output	[ 1:0]  s_axilite_RRESP,
+
+//- AXI Stream - Input --------------
+output	in0_V_TREADY,
+input	in0_V_TVALID,
+input	[$STREAM_BITS$-1:0]  in0_V_TDATA,
+
+//- AXI Stream - Output -------------
+input	out_V_TREADY,
+output	out_V_TVALID,
+output	[$STREAM_BITS$-1:0]  out_V_TDATA
+);
+
+
+fmpadding_axi #(
+.XCOUNTER_BITS($XCOUNTER_BITS$),
+.YCOUNTER_BITS($YCOUNTER_BITS$),
+.NUM_CHANNELS($NUM_CHANNELS$),
+.SIMD($SIMD$),
+.ELEM_BITS($ELEM_BITS$),
+.INIT_XON($INIT_XON$),
+.INIT_XOFF($INIT_XOFF$),
+.INIT_XEND($INIT_XEND$),
+.INIT_YON($INIT_YON$),
+.INIT_YOFF($INIT_YOFF$),
+.INIT_YEND($INIT_YEND$)
+)
+$TOP_MODULE_NAME$_impl
+(
+ .ap_clk(ap_clk),
+ .ap_rst_n(ap_rst_n),
+ .s_axilite_AWVALID(s_axilite_AWVALID),
+ .s_axilite_AWREADY(s_axilite_AWREADY),
+ .s_axilite_AWADDR(s_axilite_AWADDR),
+ .s_axilite_WVALID(s_axilite_WVALID),
+ .s_axilite_WREADY(s_axilite_WREADY),
+ .s_axilite_WDATA(s_axilite_WDATA),
+ .s_axilite_WSTRB(s_axilite_WSTRB),
+ .s_axilite_BVALID(s_axilite_BVALID),
+ .s_axilite_BREADY(s_axilite_BREADY),
+ .s_axilite_BRESP(s_axilite_BRESP),
+ .s_axilite_ARVALID(s_axilite_ARVALID),
+ .s_axilite_ARREADY(s_axilite_ARREADY),
+ .s_axilite_ARADDR(s_axilite_ARADDR),
+ .s_axilite_RVALID(s_axilite_RVALID),
+ .s_axilite_RREADY(s_axilite_RREADY),
+ .s_axilite_RDATA(s_axilite_RDATA),
+ .s_axilite_RRESP(s_axilite_RRESP),
+ .s_axis_tready(in0_V_TREADY),  // stream input is renamed from in0_V_*
+ .s_axis_tvalid(in0_V_TVALID),
+ .s_axis_tdata(in0_V_TDATA),
+ .m_axis_tready(out_V_TREADY),  // stream output is renamed to out_V_*
+ .m_axis_tvalid(out_V_TVALID),
+ .m_axis_tdata(out_V_TDATA)
+);
+
+endmodule
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -43,6 +43,7 @@ from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
 from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise
 from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
+from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
 from finn.custom_op.fpgadataflow.iodma import IODMA
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
@@ -91,3 +92,4 @@ custom_op["Lookup"] = Lookup
 custom_op["StreamingConcat"] = StreamingConcat
 custom_op["CheckSum"] = CheckSum
 custom_op["StreamingEltwise"] = StreamingEltwise
+custom_op["FMPadding_rtl"] = FMPadding_rtl
--- a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import shutil
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import roundup_to_integer_multiple
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class FMPadding_rtl(HLSCustomOp):
+    """CustomOp wrapper for the finn-rtllib fmpadding_axi component
+    Supports adjusting the padding amount and spatial feature sizes at
+    runtime."""
+
+    def __init__(self, onnx_node):
+        """Construct the op wrapper by delegating to the base-class
+        constructor with the given ONNX node."""
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        """Return the node attribute table of this op. The entries declared
+        here are merged with (and, for equal keys, overridden by) the
+        attributes declared by the base class."""
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
+            # padding to apply at the begin/end of each spatial dimension
+            "Padding": (
+                "ints",
+                True,
+                [1, 1, 1, 1],
+            ),  # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
+            # Enable reprogrammable implementation to change FM dimensions
+            # and padding amounts during runtime
+            "dynamic_mode": ("i", False, 0, {0, 1}),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_padded_odim(self):
+        """Return [height, width] of the output feature map, i.e. the input
+        spatial size plus the begin and end padding of each dimension."""
+        in_h, in_w = self.get_nodeattr("ImgDim")
+        # layout: [H_begin, W_begin, H_end, W_end]
+        pads = self.get_nodeattr("Padding")
+        return [in_h + (pads[0] + pads[2]), in_w + (pads[1] + pads[3])]
+
+    def get_exp_cycles(self):
+        """Return the expected cycle count: one cycle per SIMD-wide group of
+        channels for every pixel of the padded output, times the number of
+        input vectors."""
+        out_h, out_w = self.get_padded_odim()
+        ch_folds = self.get_nodeattr("NumChannels") / self.get_nodeattr("SIMD")
+        n_vectors = self.get_nodeattr("numInputVectors")
+        return int(ch_folds * n_vectors * out_h * out_w)
+
+    def get_normal_input_shape(self, ind=0):
+        """Return the unfolded input shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+        in_h, in_w = self.get_nodeattr("ImgDim")
+        return (1, in_h, in_w, self.get_nodeattr("NumChannels"))
+
+    def get_normal_output_shape(self, ind=0):
+        """Return the unfolded output shape
+        (1, padded_h, padded_w, NumChannels)."""
+        out_h, out_w = self.get_padded_odim()
+        return (1, out_h, out_w, self.get_nodeattr("NumChannels"))
+
+    def get_folded_input_shape(self, ind=0):
+        """Return the input shape with the channel axis split into
+        (NumChannels / SIMD, SIMD). SIMD must divide NumChannels."""
+        *leading_dims, channel_dim = self.get_normal_input_shape()
+        num_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert num_ch % simd == 0, "SIMD must divide input channels"
+        return tuple(leading_dims) + (int(channel_dim / simd), simd)
+
+    def get_folded_output_shape(self, ind=0):
+        """Return the output shape with the channel axis split into
+        (NumChannels / SIMD, SIMD). SIMD must divide NumChannels."""
+        *leading_dims, channel_dim = self.get_normal_output_shape()
+        num_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert num_ch % simd == 0, "SIMD must divide input channels"
+        return tuple(leading_dims) + (int(channel_dim / simd), simd)
+
+    def make_shape_compatible_op(self, model):
+        """Check that the input tensor shape in the model matches the shape
+        derived from the node attributes, then return a constant-shape op
+        producing the expected output shape."""
+        expected_in = self.get_normal_input_shape()
+        out_shape = self.get_normal_output_shape()
+        actual_in = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert actual_in == expected_in, "Unexpected input shape for FMPadding_rtl."
+        return super().make_const_shape_op(out_shape)
+
+    def infer_node_datatype(self, model):
+        """Adopt the datatype of the input tensor as inputDataType (warning
+        if it differs from the current attribute value) and annotate the
+        output tensor with the same datatype."""
+        onnx_node = self.onnx_node
+        upstream_dt = model.get_tensor_datatype(onnx_node.input[0])
+        if upstream_dt != self.get_input_datatype():
+            warnings.warn(
+                "inputDataType changing for %s: %s -> %s "
+                % (onnx_node.name, str(self.get_input_datatype()), str(upstream_dt))
+            )
+        self.set_nodeattr("inputDataType", upstream_dt.name)
+        model.set_tensor_datatype(onnx_node.output[0], upstream_dt)
+
+    def verify_node(self):
+        """No verification checks are implemented for this op."""
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input.
+
+        Raises AssertionError if the datatype cannot represent zero."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        # padding is expected to be zero-valued (as for the hlslib-based
+        # FMPadding_Batch op), so ensure that the DataType is able to
+        # represent zeros
+        assert ret.allowed(0), "FMPadding_rtl DataType must support zero"
+        return ret
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output, which is always identical to the
+        input datatype."""
+        out_dt = self.get_input_datatype()
+        return out_dt
+
+    def get_instream_width(self, ind=0):
+        """Return the input stream width in bits: element bitwidth times SIMD."""
+        return self.get_input_datatype().bitwidth() * self.get_nodeattr("SIMD")
+
+    def get_outstream_width(self, ind=0):
+        """Return the output stream width in bits: element bitwidth times SIMD."""
+        return self.get_output_datatype().bitwidth() * self.get_nodeattr("SIMD")
+
+    def get_number_output_values(self):
+        """Return the product of all folded output dimensions except the
+        innermost (SIMD) one."""
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_verilog_top_module_intf_names(self):
+        """Return the interface names of the base implementation, extended
+        by an "s_axilite" control interface when dynamic_mode is enabled."""
+        names = super().get_verilog_top_module_intf_names()
+        dynamic = self.get_nodeattr("dynamic_mode")
+        if dynamic:
+            names["axilite"] = ["s_axilite"]
+        return names
+
+    def execute_node(self, context, graph):
+        """Execute this node through RTL simulation and store the result in
+        context under the name of the node's first output.
+
+        context is indexed by tensor name and must hold a float32 array of
+        the normal input shape for the node's first input. graph is not
+        used by this implementation.
+
+        Raises Exception if exec_mode is "cppsim" (not available for this
+        op) or any value other than "rtlsim"."""
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim"
+            )
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        # fold the input and dump it to the code generation directory, from
+        # where it is read back and packed for the simulation
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        rtlsim_inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        # reset the simulation and toggle the clock once before streaming data
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+        # output datatype equals input datatype for this op
+        odt = export_idt
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+        context[node.output[0]] = output
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (1, OutputDim_H, OutputDim_W, NumChannels)."""
+
+    def get_template_values(self, ifm_dims, pads, chans, simd, idt):
+        """Compute the values substituted into the Verilog wrapper template.
+
+        ifm_dims is (dimY, dimX), pads is (padT, padL, padB, padR), chans is
+        the channel count, simd the SIMD parallelism and idt the input
+        datatype (only its bitwidth() is used).
+
+        Returns a dict keyed by placeholder name (without the surrounding
+        "$" characters)."""
+        dimY, dimX = ifm_dims
+        padT, padL, padB, padR = pads
+        # counter widths: ceil(log2(padded size + 1)) bits per dimension
+        y_counter_bits = int(math.ceil(math.log2(padT + dimY + padB + 1)))
+        x_counter_bits = int(math.ceil(math.log2(padL + dimX + padR + 1)))
+        topname = self.get_verilog_top_module_name()
+        # stream width of the wrapper ports, rounded up to a multiple of 8 bits
+        stream_bits = idt.bitwidth() * simd
+        stream_bits = int(roundup_to_integer_multiple(stream_bits, 8))
+        code_gen_dict = {
+            "XCOUNTER_BITS": int(x_counter_bits),
+            "YCOUNTER_BITS": int(y_counter_bits),
+            "NUM_CHANNELS": int(chans),
+            "SIMD": int(simd),
+            "ELEM_BITS": idt.bitwidth(),
+            "TOP_MODULE_NAME": topname,
+            # per dimension: ON = begin padding, OFF = begin padding + input
+            # size, END = padded size - 1
+            "INIT_XON": int(padL),
+            "INIT_XOFF": int(padL + dimX),
+            "INIT_XEND": int(padL + dimX + padR - 1),
+            "INIT_YON": int(padT),
+            "INIT_YOFF": int(padT + dimY),
+            "INIT_YEND": int(padT + dimY + padB - 1),
+            "STREAM_BITS": int(stream_bits),
+        }
+        return code_gen_dict
+
+    def get_dynamic_config(self, ifm_dims=None, pads=None):
+        """Build the runtime re-configuration for FM dimension and padding
+        amounts.
+
+        ifm_dims and pads default to the ImgDim and Padding node attributes
+        when None. Returns a dict mapping register name to an
+        (address, value) tuple."""
+        dims = self.get_nodeattr("ImgDim") if ifm_dims is None else ifm_dims
+        padding = self.get_nodeattr("Padding") if pads is None else pads
+        template_vals = self.get_template_values(
+            dims,
+            padding,
+            self.get_nodeattr("NumChannels"),
+            self.get_nodeattr("SIMD"),
+            self.get_input_datatype(),
+        )
+        # registers are placed consecutively in this order, addresses 4 apart
+        reg_order = ["XON", "XOFF", "XEND", "YON", "YOFF", "YEND"]
+        return {
+            reg: (idx * 4, template_vals["INIT_" + reg])
+            for idx, reg in enumerate(reg_order)
+        }
+
+    def generate_hdl(self):
+        """Generate the HDL sources for this node in code_gen_dir_ipgen.
+
+        Reads fmpadding_template.v from $FINN_ROOT/finn-rtllib/fmpadding/hdl,
+        replaces every $KEY$ placeholder with the matching value from
+        get_template_values(), writes the result as <top module name>.v and
+        copies the fixed SystemVerilog sources next to it. Also sets the
+        gen_top_module, ipgen_path and ip_path node attributes.
+
+        Raises KeyError if the FINN_ROOT environment variable is not set."""
+        rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl"
+        template_path = rtlsrc + "/fmpadding_template.v"
+        dims = self.get_nodeattr("ImgDim")
+        pads = self.get_nodeattr("Padding")
+        chans = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        idt = self.get_input_datatype()
+        code_gen_dict = self.get_template_values(dims, pads, chans, simd, idt)
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        for key_name in code_gen_dict:
+            key = "$%s$" % key_name
+            template = template.replace(key, str(code_gen_dict[key_name]))
+
+        with open(
+            os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"),
+            "w",
+        ) as f:
+            f.write(template)
+
+        # copy the template-independent sources next to the generated wrapper
+        sv_files = ["fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv"]
+        for sv_file in sv_files:
+            shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir)
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stitch_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it.
+
+        Raises ImportError if PyVerilator is not installed."""
+        # Modified to use generated (System-)Verilog instead of HLS output products
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # Use the module name recorded by generate_hdl() for both the wrapper
+        # file and the top module: the generated file and the module inside it
+        # keep that name even if the node has been renamed since generation,
+        # whereas get_verilog_top_module_name() follows the current node name.
+        top_module = self.get_nodeattr("gen_top_module")
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            "fmpadding_axi.sv",
+            "fmpadding.sv",
+            "axi2we.sv",
+            top_module + ".v",
+        ]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=top_module,
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def code_generation_ipi(self):
+        """Return the list of TCL commands that add this node's HDL sources
+        and instantiate its top module as a block design cell in Vivado IPI."""
+        gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        top_name = self.get_nodeattr("gen_top_module")
+        hdl_files = ["fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv", top_name + ".v"]
+
+        # one add_files command per source, followed by the cell instantiation
+        tcl = [
+            "add_files -norecurse %s" % (os.path.join(gen_dir, fname))
+            for fname in hdl_files
+        ]
+        tcl.append(
+            "create_bd_cell -type module -reference %s %s"
+            % (top_name, self.onnx_node.name)
+        )
+        return tcl
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: Generates (System-)Verilog code for IP generation.
+
+        The model, fpgapart and clk arguments are not used by this
+        implementation; all work is delegated to generate_hdl()."""
+        self.generate_hdl()
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation.
+        Here: no-op."""
+        pass
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim).
+        Here: no-op (execute_node rejects cppsim for this op)."""
+        pass
+
+    def compile_singlenode_code(self):
+        """No-op in this class."""
+        pass
+
+    # The remaining methods are no-ops in this class; presumably they are
+    # C++ code-generation hooks of the base class without an RTL
+    # counterpart - TODO confirm against HLSCustomOp.
+
+    def global_includes(self):
+        """No-op in this class."""
+        pass
+
+    def defines(self, var):
+        """No-op in this class; var is ignored."""
+        pass
+
+    def read_npy_data(self):
+        """No-op in this class."""
+        pass
+
+    def strm_decl(self):
+        """No-op in this class."""
+        pass
+
+    def docompute(self):
+        """No-op in this class."""
+        pass
+
+    def dataoutstrm(self):
+        """No-op in this class."""
+        pass
+
+    def save_as_npy(self):
+        """No-op in this class."""
+        pass
+
+    def blackboxfunction(self):
+        """No-op in this class."""
+        pass
+
+    def pragmas(self):
+        """No-op in this class."""
+        pass
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -117,8 +117,12 @@ class InferConvInpGen(Transformation):
                    ConvInpGen_idim_h = odim_padding_h
                    ConvInpGen_idim_w = odim_padding_w

+                    padding_optype = (
+                        "FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch"
+                    )
+
                    padding_node = helper.make_node(
-                        "FMPadding_Batch",
+                        padding_optype,
                        [i2c_input],
                        [padding_out],
                        domain="finn.custom_op.fpgadataflow",

--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
@@ -41,7 +41,10 @@ from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.transformation.lower_convs_to_matmul import (
+    LowerConvsToMatMul,
+    _auto_pad_to_explicit_padding,
+)
 from qonnx.util.basic import gen_finn_dt_tensor, get_by_name

 import finn.core.onnx_exec as oxe
@@ -54,25 +57,48 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 )
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.util.basic import pyverilate_get_liveness_threshold_cycles


-def create_conv_model(idim, ifm, k, stride, ofm, idt, wdt):
+def create_conv_model(
+    idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
+):
    np.random.seed(0)
-    ishp = (1, ifm, idim, idim)
-    int_dim = compute_conv_output_dim(idim, k, stride)
-    odim = compute_conv_output_dim(int_dim, k, stride)
-    oshp = (1, ofm, odim, odim)
-    wshp = (ofm, ifm, k, k)
-    wshp_1 = (ofm, ofm, k, k)
+    group = ifm if depthwise else 1
+    group_str = str(group)
+    ishp = (1, ifm, idim_h, idim_w)
+    pad_0 = _auto_pad_to_explicit_padding(
+        pad_mode, idim_h, idim_w, k, k, stride, stride, 2
+    )
+    int_dim_h = compute_conv_output_dim(
+        idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]
+    )
+    int_dim_w = compute_conv_output_dim(
+        idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]
+    )
+
+    pad_1 = _auto_pad_to_explicit_padding(
+        pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2
+    )
+    odim_h = compute_conv_output_dim(
+        int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]
+    )
+    odim_w = compute_conv_output_dim(
+        int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]
+    )
+    oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w)
+    wshp = (ifm, 1, k, k) if depthwise else (ofm, ifm, k, k)
+    wshp_1 = (ifm, 1, k, k) if depthwise else (ofm, ofm, k, k)
    ishp_str = str(list(ishp))
    oshp_str = str(list(oshp))
    wshp_str = str(list(wshp))
    wshp_1_str = str(list(wshp_1))
    kshp_str = str([k, k])
-    pad_str = str([0, 0, 0, 0])
+    pad_0_str = str(list(pad_0))
+    pad_1_str = str(list(pad_1))
    stride_str = str([stride, stride])
    dil_str = str([1, 1])

@@ -88,11 +114,11 @@ def create_conv_model(idim, ifm, k, stride, ofm, idt, wdt):
    >
    {{
        conv0 = Conv<
-                dilations={dil_str},group=1,kernel_shape={kshp_str},pads={pad_str},
+                dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_0_str},
                strides={stride_str}
            >(in0, param_c0_weight)
        out0 = Conv<
-                dilations={dil_str},group=1,kernel_shape={kshp_str},pads={pad_str},
+                dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_1_str},
                strides={stride_str}
            >(conv0, param_c1_weight)
    }}
@@ -109,17 +135,19 @@ def create_conv_model(idim, ifm, k, stride, ofm, idt, wdt):
    return model


-def update_conv_model_dims(model, idim_new):
+def update_conv_model_dims(model, idim_new_h, idim_new_w):
    cnode = model.get_nodes_by_op_type("Conv")[0]
    k, _ = get_by_name(cnode.attribute, "kernel_shape").ints
    stride, _ = get_by_name(cnode.attribute, "strides").ints
    ishp = model.get_tensor_shape("in0")
    n, ci, _, _ = ishp
    n, co, _, _ = model.get_tensor_shape("out0")
-    int_dim = compute_conv_output_dim(idim_new, k, stride)
-    odim = compute_conv_output_dim(int_dim, k, stride)
-    model.set_tensor_shape("in0", (n, ci, idim_new, idim_new))
-    model.set_tensor_shape("out0", (n, co, odim, odim))
+    int_dim_h = compute_conv_output_dim(idim_new_h, k, stride)
+    int_dim_w = compute_conv_output_dim(idim_new_w, k, stride)
+    odim_h = compute_conv_output_dim(int_dim_h, k, stride)
+    odim_w = compute_conv_output_dim(int_dim_w, k, stride)
+    model.set_tensor_shape("in0", (n, ci, idim_new_h, idim_new_w))
+    model.set_tensor_shape("out0", (n, co, odim_h, odim_w))
    # remove all existing shapes
    del model.graph.value_info[:]
    model = model.transform(InferShapes())
@@ -142,43 +170,87 @@ def config_hook(configs):
        return None

    def write_swg_config(sim):
+        reset_rtlsim(sim)
        for axi_name, config in configs:
-            # 1. Write config registers to the SWG, dict defines (addr, value) tuples
+            # Write config registers to the SWG/FMPadding dict
+            # defines (addr, value) tuples
            for config_entry in config.values():
                axilite_write(sim, config_entry[0], config_entry[1], basename=axi_name)
-            # 2. Set cfg_valid flag (>= 1 cycle)
-            axilite_write(sim, 0, 1, basename=axi_name)
-        # 3. Reset component (>= 1 cycle)
        reset_rtlsim(sim)

    return write_swg_config


+cfg0 = {
+    "idims": [(32, 32), (8, 8)],
+    "ifm": 64,
+    "k": 3,
+    "stride": 1,
+    "ofm": 64,
+    "depthwise": True,
+    "pad_mode": "SAME_UPPER",
+}
+cfg1 = {
+    "idims": [(32, 16), (16, 8)],
+    "ifm": 4,
+    "k": 4,
+    "stride": 1,
+    "ofm": 8,
+    "depthwise": False,
+    "pad_mode": "SAME_UPPER",
+}
+cfg2 = {
+    "idims": [(64, 128), (2, 4)],
+    "ifm": 64,
+    "k": 3,
+    "stride": 1,
+    "ofm": 64,
+    "depthwise": True,
+    "pad_mode": "SAME_UPPER",
+}
+
+
+@pytest.mark.parametrize("cfg", [cfg0, cfg1, cfg2])
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.fpgadataflow
-def test_fpgadataflow_conv_dynamic():
-    idims = [32, 16]
-    ifm = 4
-    k = 4
-    stride = 1
-    ofm = 8
-    idt = DataType["UINT8"]
+def test_fpgadataflow_conv_dynamic(cfg):
+    pad_mode = cfg["pad_mode"]
+    depthwise = cfg["depthwise"]
+    idims = cfg["idims"]
+    ifm = cfg["ifm"]
+    k = cfg["k"]
+    stride = cfg["stride"]
+    ofm = cfg["ofm"]
+    idt = DataType["UINT4"]
    wdt = DataType["INT2"]
    exp_cfgs = []
    largest_model = None
    for idim in idims:
-        ishp = (1, ifm, idim, idim)
+        idim_h, idim_w = idim
+        ishp = (1, ifm, idim_h, idim_w)
        np.random.seed(0)
        inp = gen_finn_dt_tensor(idt, ishp)
-        model = create_conv_model(idim, ifm, k, stride, ofm, idt, wdt)
-        _, _, int_dim, _ = model.get_tensor_shape("conv0")
-        _, _, odim, _ = model.get_tensor_shape("out0")
+        model = create_conv_model(
+            idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
+        )
+        _, _, int_dim_h, int_dim_w = model.get_tensor_shape("conv0")
+        _, _, odim_h, odim_w = model.get_tensor_shape("out0")
+        pad0 = get_by_name(model.graph.node[0].attribute, "pads").ints
+        pad1 = get_by_name(model.graph.node[1].attribute, "pads").ints
        if idim == max(idims):
            # use largest model for hardware conversion
            largest_model = copy.deepcopy(model)
        golden = execute_onnx(model, {"in0": inp})["out0"]
-        exp_cfg = (idim, int_dim, odim, inp, golden)
+        exp_cfg = (
+            (idim_h, idim_w),
+            (int_dim_h, int_dim_w),
+            (odim_h, odim_w),
+            pad0,
+            pad1,
+            inp,
+            golden,
+        )
        exp_cfgs.append(exp_cfg)

    # convert to hardware and prepare simulation
@@ -187,17 +259,34 @@ def test_fpgadataflow_conv_dynamic():
    model = model.transform(
        to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")
    )
+    model = model.transform(to_hls.InferVectorVectorActivation())
    model = model.transform(absorb.AbsorbConsecutiveTransposes())
    parent_model = model.transform(CreateDataflowPartition())
    sdp_inst = getCustomOp(
        parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
    )
    model = ModelWrapper(sdp_inst.get_nodeattr("model"))
-    for swg_node in model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl"):
-        getCustomOp(swg_node).set_nodeattr("SIMD", 1)
+    assert len(model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")) == 2
+    if pad_mode == "VALID":
+        assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 0
+    else:
+        assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 2
+    dyn_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
+    dyn_nodes += model.get_nodes_by_op_type("FMPadding_rtl")
+    for swg_node in dyn_nodes:
+        getCustomOp(swg_node).set_nodeattr("SIMD", 4)
        getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1)
        getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16])
        getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16])
+    comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation")
+    comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation")
+    for comp_node in comp_nodes:
+        if depthwise:
+            getCustomOp(comp_node).set_nodeattr("PE", 4)
+        else:
+            getCustomOp(comp_node).set_nodeattr("SIMD", 4)
+            getCustomOp(comp_node).set_nodeattr("PE", 4)
+    model = model.transform(InsertDWC())
    model = model.transform(InsertFIFO())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
@@ -208,31 +297,64 @@ def test_fpgadataflow_conv_dynamic():

    # loop through experiment configurations
    for exp_cfg in exp_cfgs:
-        idim, int_dim, odim, inp, golden = exp_cfg
+        (
+            (idim_h, idim_w),
+            (int_dim_h, int_dim_w),
+            (odim_h, odim_w),
+            pad0,
+            pad1,
+            inp,
+            golden,
+        ) = exp_cfg
+        conv0_idim_h = idim_h + pad0[0] + pad0[2]
+        conv0_idim_w = idim_w + pad0[1] + pad0[3]
+        conv1_idim_h = int_dim_h + pad1[0] + pad1[2]
+        conv1_idim_w = int_dim_w + pad1[1] + pad1[3]
        # get config for the new dimensions
        swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
        swg0 = getCustomOp(swg_nodes[0])
-        update_tensor_dim(model, swg0.onnx_node.input[0], (idim, idim))
-        update_tensor_dim(model, swg0.onnx_node.output[0], (int_dim, int_dim))
-        config0 = swg0.get_dynamic_config((idim, idim))
+        update_tensor_dim(model, swg0.onnx_node.input[0], (conv0_idim_h, conv0_idim_w))
+        update_tensor_dim(model, swg0.onnx_node.output[0], (int_dim_h, int_dim_w))
+        swg_config0 = swg0.get_dynamic_config((conv0_idim_h, conv0_idim_w))
        swg1 = getCustomOp(swg_nodes[1])
-        update_tensor_dim(model, swg1.onnx_node.input[0], (int_dim, int_dim))
-        update_tensor_dim(model, swg1.onnx_node.output[0], (odim, odim))
-        config1 = swg1.get_dynamic_config((int_dim, int_dim))
-        configs = [("s_axilite_0_", config0), ("s_axilite_1_", config1)]
+        update_tensor_dim(model, swg1.onnx_node.input[0], (conv1_idim_h, conv1_idim_w))
+        update_tensor_dim(model, swg1.onnx_node.output[0], (odim_h, odim_w))
+        swg_config1 = swg1.get_dynamic_config((conv1_idim_h, conv1_idim_w))
+        if pad_mode != "VALID":
+            pad_nodes = model.get_nodes_by_op_type("FMPadding_rtl")
+            padder0 = getCustomOp(pad_nodes[0])
+            update_tensor_dim(model, padder0.onnx_node.input[0], (idim_h, idim_w))
+            update_tensor_dim(
+                model, padder0.onnx_node.output[0], (conv0_idim_h, conv0_idim_w)
+            )
+            pad_config0 = padder0.get_dynamic_config((idim_h, idim_w), pad0)
+            padder1 = getCustomOp(pad_nodes[1])
+            update_tensor_dim(model, padder1.onnx_node.input[0], (int_dim_h, int_dim_w))
+            update_tensor_dim(
+                model, padder1.onnx_node.output[0], (conv1_idim_h, conv1_idim_w)
+            )
+            pad_config1 = padder1.get_dynamic_config((int_dim_h, int_dim_w), pad1)
+            configs = [
+                ("s_axilite_0_", pad_config0),
+                ("s_axilite_1_", swg_config0),
+                ("s_axilite_2_", pad_config1),
+                ("s_axilite_3_", swg_config1),
+            ]
+        else:
+            configs = [("s_axilite_0_", swg_config0), ("s_axilite_1_", swg_config1)]
        # adjust folded shapes for I/O FIFOs
        # (since rtlsim_exec uses folded shape info to fold global i/o tensors)
        first_node = getCustomOp(model.graph.node[0])
        first_node_shp = list(first_node.get_folded_input_shape())
-        first_node_shp[1] = idim
-        first_node_shp[2] = idim
+        first_node_shp[1] = idim_h
+        first_node_shp[2] = idim_w
        first_node.set_nodeattr("folded_shape", first_node_shp)
-        update_tensor_dim(model, first_node.onnx_node.input[0], (idim, idim))
+        update_tensor_dim(model, first_node.onnx_node.input[0], (idim_h, idim_w))
        last_node = getCustomOp(model.graph.node[-1])
        last_node_shp = list(last_node.get_folded_output_shape())
-        last_node_shp[1] = odim
-        last_node_shp[2] = odim
-        update_tensor_dim(model, last_node.onnx_node.output[0], (odim, odim))
+        last_node_shp[1] = odim_h
+        last_node_shp[2] = odim_w
+        update_tensor_dim(model, last_node.onnx_node.output[0], (odim_h, odim_w))
        last_node.set_nodeattr("folded_shape", last_node_shp)
        ctx = {"global_in": inp.transpose(0, 2, 3, 1)}
        liveness_prev = pyverilate_get_liveness_threshold_cycles()

--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -53,7 +53,7 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10


-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
+def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt):
    pad_h = padding[0] + padding[2]
    pad_w = padding[1] + padding[3]
    idim_h, idim_w = idim
@@ -70,7 +70,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
    )

    FMPadding = helper.make_node(
-        "FMPadding_Batch",
+        optype,
        ["inp"],
        ["outp"],
        domain="finn.custom_op.fpgadataflow",
@@ -110,10 +110,14 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
 @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
 # execution mode
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
+# implementation style
+@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style):
+    if impl_style == "rtl" and mode == "cppsim":
+        pytest.skip("rtl implstyle has no cppsim, skipping")
    if num_ch % simd != 0:
        pytest.skip(" num_ch % simd != 0, skipping")

@@ -127,7 +131,9 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
    odim_h = idim_h + pad_h
    odim_w = idim_w + pad_w

-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt)
+    optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style]
+
+    model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt)
    model = model.transform(InferShapes())
    model = model.transform(SetExecMode(mode))
    model = model.transform(GiveUniqueNodeNames())
@@ -138,6 +144,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
+
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    expected_oshape = (1, odim_h, odim_w, num_ch)
    assert y_produced.shape == expected_oshape
@@ -149,7 +156,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
    assert (y_produced == y_expected).all()

    if mode == "rtlsim":
-        node = model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        node = model.get_nodes_by_op_type(optype)[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)