diff --git a/fetch-repos.sh b/fetch-repos.sh
index 16960c71e31671b042dcfb4c31208aaaf8e29906..7078b284a9bbfdebc6bfe5bd8f7d577bdfcacabc 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,7 +27,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="f14d7dc92a6baeffa2bef811e902abb121a6f696"
+QONNX_COMMIT="ce321742d98f23909a890ed680a9c99640d7aaab"
 FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366"
 BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
diff --git a/finn-rtllib/fmpadding/hdl/axi2we.sv b/finn-rtllib/fmpadding/hdl/axi2we.sv
new file mode 100644
index 0000000000000000000000000000000000000000..842ba3632c4224d58f87c66e1affc4c028b60ef3
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/axi2we.sv
@@ -0,0 +1,122 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	AXI-Lite adapter for a trivial write-enable interface.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module axi2we #(
+	int unsigned  ADDR_BITS
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	//- AXI Lite ------------------------
+	// Writing
+	input	                 s_axilite_AWVALID,
+	output	                 s_axilite_AWREADY,
+	input	[ADDR_BITS-1:0]  s_axilite_AWADDR,
+
+	input	        s_axilite_WVALID,
+	output	        s_axilite_WREADY,
+	input	[31:0]  s_axilite_WDATA,
+	input	[ 3:0]  s_axilite_WSTRB,
+
+	output	       s_axilite_BVALID,
+	input	       s_axilite_BREADY,
+	output	[1:0]  s_axilite_BRESP,
+
+	// Reading tied to all-ones
+	input	       s_axilite_ARVALID,
+	output	       s_axilite_ARREADY,
+	input	[ADDR_BITS-1:0]  s_axilite_ARADDR,
+
+	output	        s_axilite_RVALID,
+	input	        s_axilite_RREADY,
+	output	[31:0]  s_axilite_RDATA,
+	output	[ 1:0]  s_axilite_RRESP,
+
+	// Write Enable Interface
+	output	logic                  we,
+	output	logic [ADDR_BITS-1:0]  wa,
+	output	logic [         31:0]  wd
+);
+
+	uwire  clk = ap_clk;
+	uwire  rst = !ap_rst_n;
+
+
+	logic  WABusy = 0;
+	logic  WDBusy = 0;
+	logic [ADDR_BITS-1:0]  Addr = 'x;
+	logic [         31:0]  Data = 'x;
+
+	assign	we = WABusy && WDBusy && s_axilite_BREADY;
+	assign	wa = Addr;
+	assign	wd = Data;
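+
+	// Note: a write commits only once both the address and data phases have
+	// been captured (WABusy && WDBusy) and the master accepts the immediate
+	// BRESP (s_axilite_BREADY). The WSTRB input is ignored, so every write
+	// updates the full 32-bit word.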
+
+	uwire  clr_wr = rst || we;
+	always_ff @(posedge clk) begin
+		if(clr_wr) begin
+			WABusy <= 0;
+			Addr <= 'x;
+			WDBusy <= 0;
+			Data <= 'x;
+		end
+		else begin
+			if(!WABusy) begin
+				WABusy <= s_axilite_AWVALID;
+				Addr   <= s_axilite_AWADDR;
+			end
+			if(!WDBusy) begin
+				WDBusy <= s_axilite_WVALID;
+				Data   <= s_axilite_WDATA;
+			end
+		end
+	end
+	assign	s_axilite_AWREADY = !WABusy;
+	assign	s_axilite_WREADY  = !WDBusy;
+	assign	s_axilite_BVALID  = WABusy && WDBusy;
+	assign	s_axilite_BRESP   = '0; // OK
+
+	// Answer all reads with '1
+	logic  RValid =  0;
+	uwire  clr_rd = rst || (RValid && s_axilite_RREADY);
+	always_ff @(posedge clk) begin
+		if(clr_rd)        RValid <=  0;
+		else if(!RValid)  RValid <= s_axilite_ARVALID;
+	end
+	assign	s_axilite_ARREADY = !RValid;
+	assign	s_axilite_RVALID  = RValid;
+	assign	s_axilite_RDATA   = '1;
+	assign	s_axilite_RRESP   = '0; // OK
+
+endmodule : axi2we
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding.sv b/finn-rtllib/fmpadding/hdl/fmpadding.sv
new file mode 100644
index 0000000000000000000000000000000000000000..904c7c381f7b2499fc354ebf798e86edab262866
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding.sv
@@ -0,0 +1,224 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Feature map padding.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module fmpadding #(
+	int unsigned  XCOUNTER_BITS,
+	int unsigned  YCOUNTER_BITS,
+	int unsigned  NUM_CHANNELS,
+	int unsigned  SIMD,
+	int unsigned  ELEM_BITS,
+	int unsigned  INIT_XON,
+	int unsigned  INIT_XOFF,
+	int unsigned  INIT_XEND,
+	int unsigned  INIT_YON,
+	int unsigned  INIT_YOFF,
+	int unsigned  INIT_YEND,
+
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	// Parameter Configuration ----------
+	input	logic         we,
+	input	logic [ 4:0]  wa,
+	input	logic [31:0]  wd,
+
+	//- AXI Stream - Input --------------
+	output	logic  s_axis_tready,
+	input	logic  s_axis_tvalid,
+	input	logic [STREAM_BITS-1:0]  s_axis_tdata,
+
+	//- AXI Stream - Output -------------
+	input	logic  m_axis_tready,
+	output	logic  m_axis_tvalid,
+	output	logic [STREAM_BITS-1:0]  m_axis_tdata
+);
+
+	uwire  clk = ap_clk;
+	uwire  rst = !ap_rst_n;
+
+	//-----------------------------------------------------------------------
+	// Parameter Sanity Checking
+	initial begin
+		automatic bit  fail = 0;
+
+		if(XCOUNTER_BITS < $clog2(1+INIT_XEND)) begin
+			$error("XCounter size too small to accommodate end count.");
+			fail = 1;
+		end
+		if(XCOUNTER_BITS < $clog2(1+INIT_XON)) begin
+			$error("XCounter size too small to accommodate ON count.");
+			fail = 1;
+		end
+		if(XCOUNTER_BITS < $clog2(1+INIT_XOFF)) begin
+			$error("XCounter size too small to accommodate OFF count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YEND)) begin
+			$error("YCounter size too small to accommodate end count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YON)) begin
+			$error("YCounter size too small to accommodate ON count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YOFF)) begin
+			$error("YCounter size too small to accommodate OFF count.");
+			fail = 1;
+		end
+
+		if((INIT_XEND < INIT_XON) || (INIT_XOFF <= INIT_XON)) begin
+			$warning("Initial empty X output range.");
+		end
+		if((INIT_YEND < INIT_YON) || (INIT_YOFF <= INIT_YON)) begin
+			$warning("Initial empty Y output range.");
+		end
+
+		if(fail)  $finish();
+	end
+
+	//-----------------------------------------------------------------------
+	// Dynamically configurable state
+	typedef logic [XCOUNTER_BITS-1:0]  xcount_t;
+	xcount_t  XEnd = INIT_XEND;
+	xcount_t  XOn  = INIT_XON;
+	xcount_t  XOff = INIT_XOFF;
+
+	typedef logic [YCOUNTER_BITS-1:0]  ycount_t;
+	ycount_t  YEnd = INIT_YEND;
+	ycount_t  YOn  = INIT_YON;
+	ycount_t  YOff = INIT_YOFF;
+
+	always_ff @(posedge clk) begin
+		if(we) begin
+			unique case(wa)
+			0*4:  XOn  <= wd;
+			1*4:  XOff <= wd;
+			2*4:  XEnd <= wd;
+			3*4:  YOn  <= wd;
+			4*4:  YOff <= wd;
+			5*4:  YEnd <= wd;
+
+			default:  assert(0) else begin
+				$error("Illegal write address.");
+				$stop;
+			end
+			endcase
+		end
+	end
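+	// Resulting write-register map (byte addresses):
+	//   0x00 XOn   0x04 XOff  0x08 XEnd
+	//   0x0C YOn   0x10 YOff  0x14 YEnd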
+
+	//-----------------------------------------------------------------------
+	// Cascaded enables for the nested counters: SCount, XCount, YCount
+	uwire  sen;
+	uwire  xen;
+	uwire  yen;
+
+	//- S-Counter: SIMD fold ------------
+	initial begin
+		if((NUM_CHANNELS < 1) || (NUM_CHANNELS % SIMD != 0)) begin
+			$error("Channel count must be SIMD multiple.");
+			$finish;
+		end
+	end
+	// Count SF-2, SF-3, ..., 1, 0, -1
+	localparam int unsigned  SF = NUM_CHANNELS/SIMD;
+	typedef logic [$clog2(SF-1):0]  scount_t;
+	scount_t  SCount = SF-2;
+
+	assign	xen = sen && SCount[$left(SCount)];
+	uwire  sclr = rst || xen;
+	always_ff @(posedge clk) begin
+		if(sclr)      SCount <= SF-2;
+		else if(sen)  SCount <= SCount - 1;
+	end
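+	// xen pulses for one sen cycle when SCount underflows to -1: the sign
+	// bit (leftmost bit) of the down-counter doubles as the wrap flag.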
+
+	//- X-Counter: image width ----------
+	xcount_t  XCount = 0;
+
+	assign	yen = xen && (XCount == XEnd);
+	uwire  xclr = rst || yen;
+	always_ff @(posedge clk) begin
+		if(xclr)      XCount <= 0;
+		else if(xen)  XCount <= XCount + 1;
+	end
+	uwire  xfwd = (XOn <= XCount) && (XCount < XOff);
+
+	//- Y-Counter: image height ---------
+	ycount_t  YCount = 0;
+
+	uwire  yclr = rst || (yen && (YCount == YEnd));
+	always_ff @(posedge clk) begin
+		if(yclr)      YCount <= 0;
+		else if(yen)  YCount <= YCount + 1;
+	end
+	uwire  yfwd = (YOn <= YCount) && (YCount < YOff);
+
+	//-----------------------------------------------------------------------
+	// Input forwarding and edge padding
+	typedef struct {
+		logic  vld;
+		logic [STREAM_BITS-1:0]  dat;
+	} buf_t;
+	buf_t  A = '{ vld: 0, dat: 'x };
+	buf_t  B = '{ vld: 0, dat: 'x };
+
+	uwire  fwd = xfwd && yfwd;
+	assign	sen = (m_axis_tready || !B.vld) && (s_axis_tvalid || A.vld || !fwd);
+	assign	s_axis_tready = !A.vld;
+	assign	m_axis_tvalid =  B.vld;
+	assign	m_axis_tdata  =  B.dat;
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			B <= '{ vld: 0, dat: 'x };
+		end
+		else if(m_axis_tready || !B.vld) begin
+			B.vld <= s_axis_tvalid || A.vld || !fwd;
+			B.dat <= !fwd? '0 : A.vld? A.dat : s_axis_tdata;
+		end
+	end
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			A <= '{ vld: 0, dat: 'x };
+		end
+		else begin
+			A.vld <= (A.vld || s_axis_tvalid) && ((B.vld && !m_axis_tready) || !fwd);
+			if(!A.vld)  A.dat <= s_axis_tdata;
+		end
+	end
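+
+	// A/B form a two-deep skid buffer: B is the registered output stage,
+	// while A captures an input beat whenever B stalls or the current pixel
+	// position is padding (!fwd), in which case a zero beat is emitted
+	// without consuming input.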
+
+endmodule : fmpadding
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv b/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv
new file mode 100644
index 0000000000000000000000000000000000000000..5948341d000a1dd82ff363b36557f897d3a064c7
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv
@@ -0,0 +1,123 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Feature map padding.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module fmpadding_axi #(
+	int unsigned  XCOUNTER_BITS,
+	int unsigned  YCOUNTER_BITS,
+	int unsigned  NUM_CHANNELS,
+	int unsigned  SIMD,
+	int unsigned  ELEM_BITS,
+	int unsigned  INIT_XON,
+	int unsigned  INIT_XOFF,
+	int unsigned  INIT_XEND,
+	int unsigned  INIT_YON,
+	int unsigned  INIT_YOFF,
+	int unsigned  INIT_YEND,
+
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	//- AXI Lite ------------------------
+	// Writing
+	input	       s_axilite_AWVALID,
+	output	       s_axilite_AWREADY,
+	input	[4:0]  s_axilite_AWADDR,
+
+	input	        s_axilite_WVALID,
+	output	        s_axilite_WREADY,
+	input	[31:0]  s_axilite_WDATA,
+	input	[ 3:0]  s_axilite_WSTRB,
+
+	output	       s_axilite_BVALID,
+	input	       s_axilite_BREADY,
+	output	[1:0]  s_axilite_BRESP,
+
+	// Reading
+	input	       s_axilite_ARVALID,
+	output	       s_axilite_ARREADY,
+	input	[4:0]  s_axilite_ARADDR,
+
+	output	        s_axilite_RVALID,
+	input	        s_axilite_RREADY,
+	output	[31:0]  s_axilite_RDATA,
+	output	[ 1:0]  s_axilite_RRESP,
+
+	//- AXI Stream - Input --------------
+	output	logic  s_axis_tready,
+	input	logic  s_axis_tvalid,
+	input	logic [STREAM_BITS-1:0]  s_axis_tdata,
+
+	//- AXI Stream - Output -------------
+	input	logic  m_axis_tready,
+	output	logic  m_axis_tvalid,
+	output	logic [STREAM_BITS-1:0]  m_axis_tdata
+);
+
+	// AXI-Lite Adapter
+	uwire         we;
+	uwire [ 4:0]  wa;
+	uwire [31:0]  wd;
+	axi2we #(.ADDR_BITS(5)) axilight_adapter (
+		.ap_clk, .ap_rst_n,
+
+		.s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+		.s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB,
+		.s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP,
+
+		.s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR,
+		.s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP,
+
+		.we, .wa, .wd
+	);
+
+	// Actual Padding
+	fmpadding #(
+		.XCOUNTER_BITS(XCOUNTER_BITS), .YCOUNTER_BITS(YCOUNTER_BITS),
+		.NUM_CHANNELS(NUM_CHANNELS), .SIMD(SIMD),
+		.INIT_XON(INIT_XON), .INIT_XOFF(INIT_XOFF), .INIT_XEND(INIT_XEND),
+		.INIT_YON(INIT_YON), .INIT_YOFF(INIT_YOFF), .INIT_YEND(INIT_YEND),
+		.ELEM_BITS(ELEM_BITS)
+	) padding (
+		.ap_clk, .ap_rst_n,
+
+		.we, .wa, .wd,
+
+		.s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+		.m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+	);
+
+endmodule : fmpadding_axi
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv b/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv
new file mode 100644
index 0000000000000000000000000000000000000000..741689b3a7af7ad4d07f2af569f71135c1d35c7b
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv
@@ -0,0 +1,154 @@
+
+module fmpadding_axi_tb #(
+	int unsigned  XCOUNTER_BITS = 8,
+	int unsigned  YCOUNTER_BITS = 8,
+	int unsigned  NUM_CHANNELS  = 4,
+	int unsigned  SIMD          = 2,
+	int unsigned  ELEM_BITS     = 4
+)();
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8);
+
+	//- Global Control ------------------
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	logic  rst;
+
+	// AXI-Lite for Parameter Configuration
+	logic	       s_axilite_AWVALID;
+	uwire	       s_axilite_AWREADY;
+	logic	[4:0]  s_axilite_AWADDR;
+
+	logic	        s_axilite_WVALID;
+	uwire	        s_axilite_WREADY;
+	logic	[31:0]  s_axilite_WDATA;
+
+	//- AXI Stream - Input --------------
+	uwire  s_axis_tready;
+	logic  s_axis_tvalid;
+	logic [STREAM_BITS-1:0]  s_axis_tdata;
+
+	//- AXI Stream - Output -------------
+	logic  m_axis_tready;
+	uwire  m_axis_tvalid;
+	uwire [STREAM_BITS-1:0]  m_axis_tdata;
+
+
+	// DUT
+	fmpadding_axi #(
+		.XCOUNTER_BITS(XCOUNTER_BITS),
+		.YCOUNTER_BITS(YCOUNTER_BITS),
+		.NUM_CHANNELS(NUM_CHANNELS),
+		.SIMD(SIMD),
+		.INIT_XON(0), .INIT_XOFF(0), .INIT_XEND(0),
+		.INIT_YON(0), .INIT_YOFF(0), .INIT_YEND(0),
+		.ELEM_BITS(ELEM_BITS)
+	) dut (
+		.ap_clk(clk), .ap_rst_n(!rst),
+
+		.s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+		.s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1),
+		.s_axilite_BVALID(), .s_axilite_BREADY('1),	.s_axilite_BRESP(),
+		.s_axilite_ARVALID('0), .s_axilite_ARREADY(), .s_axilite_ARADDR('x),
+		.s_axilite_RVALID(), .s_axilite_RREADY('0), .s_axilite_RDATA(), .s_axilite_RRESP(),
+
+		.s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+		.m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+	);
+
+	// Stimuli
+	localparam int unsigned  IMAGES = 2;
+	localparam int unsigned  XSIZE = 10;
+	localparam int unsigned  YSIZE =  7;
+	localparam int unsigned  PAD_LEFT   = 2;
+	localparam int unsigned  PAD_RIGHT  = 3;
+	localparam int unsigned  PAD_TOP    = 1;
+	localparam int unsigned  PAD_BOTTOM = 2;
+
+	task axi_write(input logic [4:0]  wa, input logic [31:0]  wd);
+		s_axilite_AWVALID <= 1;
+		s_axilite_AWADDR <= wa;
+		@(posedge clk iff s_axilite_AWREADY);
+		s_axilite_AWVALID <= 0;
+		s_axilite_AWADDR <= 'x;
+
+		s_axilite_WVALID <= 1;
+		s_axilite_WDATA <= wd;
+		@(posedge clk iff s_axilite_WREADY);
+		s_axilite_WVALID <= 0;
+		s_axilite_WDATA <= 'x;
+	endtask : axi_write
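+
+	// The AW and W phases are driven sequentially rather than in parallel;
+	// axi2we latches each phase independently, so this is sufficient.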
+
+
+	initial begin
+		s_axilite_AWVALID = 0;
+		s_axilite_AWADDR = 'x;
+		s_axilite_WVALID = 0;
+		s_axilite_WDATA = 'x;
+
+		s_axis_tvalid =  0;
+		s_axis_tdata  = 'x;
+
+		// Configure Parameters
+		rst = 0;
+		@(posedge clk);
+		/* XOn  */	axi_write(0*4, PAD_LEFT);
+		/* XOff */	axi_write(1*4, XSIZE - PAD_RIGHT);
+		/* XEnd */	axi_write(2*4, XSIZE - 1);
+		/* YOn  */	axi_write(3*4, PAD_TOP);
+		/* YOff */	axi_write(4*4, YSIZE - PAD_BOTTOM);
+		/* YEnd */	axi_write(5*4, YSIZE - 1);
+		@(posedge clk);
+		rst <= 1;
+		@(posedge clk);
+		rst <= 0;
+		@(posedge clk);
+
+		// Feed data input
+		s_axis_tvalid <= 1;
+		for(int unsigned  i = 0; i < IMAGES * (XSIZE-PAD_LEFT-PAD_RIGHT) * (YSIZE-PAD_TOP-PAD_BOTTOM) * (NUM_CHANNELS/SIMD); i++) begin
+			s_axis_tdata  <= i;
+			@(posedge clk iff s_axis_tready);
+			if($urandom()%5 == 0) begin
+				s_axis_tvalid <=  0;
+				s_axis_tdata  <= 'x;
+				@(posedge clk);
+				s_axis_tvalid <=  1;
+			end
+		end
+		s_axis_tvalid <=  0;
+		s_axis_tdata  <= 'x;
+	end
+
+	// Output Throttler
+	initial begin
+		m_axis_tready =  0;
+		@(posedge clk iff !rst);
+		m_axis_tready <= 1;
+		forever @(posedge clk iff m_axis_tvalid) begin
+			m_axis_tready <= 0;
+			repeat(4-$clog2(1+$urandom()%15)) @(posedge clk);
+			m_axis_tready <= 1;
+		end
+	end
+
+	// Output logger
+	initial begin
+		@(negedge rst);
+		repeat(IMAGES) begin
+			for(int unsigned  y = 0; y < YSIZE; y++) begin
+				for(int unsigned  x = 0; x < XSIZE; x++) begin
+					automatic string  delim = " ";
+					for(int unsigned  s = 0; s < NUM_CHANNELS/SIMD; s++) begin
+						@(posedge clk iff m_axis_tvalid && m_axis_tready);
+						$write("%s%02X", delim, m_axis_tdata);
+						delim = ":";
+					end
+				end
+				$display();
+			end
+			$display("----");
+		end
+		$finish;
+	end
+
+endmodule : fmpadding_axi_tb
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_template.v b/finn-rtllib/fmpadding/hdl/fmpadding_template.v
new file mode 100644
index 0000000000000000000000000000000000000000..0b0f40f86a44ac1d905c89bed5328d6d1ea48876
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v
@@ -0,0 +1,118 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$(
+//- Global Control ------------------
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_clk,
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_rst_n,
+
+//- AXI Lite ------------------------
+// Writing
+input	       s_axilite_AWVALID,
+output	       s_axilite_AWREADY,
+input	[4:0]  s_axilite_AWADDR,
+
+input	        s_axilite_WVALID,
+output	        s_axilite_WREADY,
+input	[31:0]  s_axilite_WDATA,
+input	[ 3:0]  s_axilite_WSTRB,
+
+output	       s_axilite_BVALID,
+input	       s_axilite_BREADY,
+output	[1:0]  s_axilite_BRESP,
+
+// Reading
+input	       s_axilite_ARVALID,
+output	       s_axilite_ARREADY,
+input	[4:0]  s_axilite_ARADDR,
+
+output	        s_axilite_RVALID,
+input	        s_axilite_RREADY,
+output	[31:0]  s_axilite_RDATA,
+output	[ 1:0]  s_axilite_RRESP,
+
+//- AXI Stream - Input --------------
+output	in0_V_TREADY,
+input	in0_V_TVALID,
+input	[$STREAM_BITS$-1:0]  in0_V_TDATA,
+
+//- AXI Stream - Output -------------
+input	out_V_TREADY,
+output	out_V_TVALID,
+output	[$STREAM_BITS$-1:0]  out_V_TDATA
+);
+
+
+fmpadding_axi #(
+.XCOUNTER_BITS($XCOUNTER_BITS$),
+.YCOUNTER_BITS($YCOUNTER_BITS$),
+.NUM_CHANNELS($NUM_CHANNELS$),
+.SIMD($SIMD$),
+.ELEM_BITS($ELEM_BITS$),
+.INIT_XON($INIT_XON$),
+.INIT_XOFF($INIT_XOFF$),
+.INIT_XEND($INIT_XEND$),
+.INIT_YON($INIT_YON$),
+.INIT_YOFF($INIT_YOFF$),
+.INIT_YEND($INIT_YEND$)
+)
+$TOP_MODULE_NAME$_impl
+(
+ .ap_clk(ap_clk),
+ .ap_rst_n(ap_rst_n),
+ .s_axilite_AWVALID(s_axilite_AWVALID),
+ .s_axilite_AWREADY(s_axilite_AWREADY),
+ .s_axilite_AWADDR(s_axilite_AWADDR),
+ .s_axilite_WVALID(s_axilite_WVALID),
+ .s_axilite_WREADY(s_axilite_WREADY),
+ .s_axilite_WDATA(s_axilite_WDATA),
+ .s_axilite_WSTRB(s_axilite_WSTRB),
+ .s_axilite_BVALID(s_axilite_BVALID),
+ .s_axilite_BREADY(s_axilite_BREADY),
+ .s_axilite_BRESP(s_axilite_BRESP),
+ .s_axilite_ARVALID(s_axilite_ARVALID),
+ .s_axilite_ARREADY(s_axilite_ARREADY),
+ .s_axilite_ARADDR(s_axilite_ARADDR),
+ .s_axilite_RVALID(s_axilite_RVALID),
+ .s_axilite_RREADY(s_axilite_RREADY),
+ .s_axilite_RDATA(s_axilite_RDATA),
+ .s_axilite_RRESP(s_axilite_RRESP),
+ .s_axis_tready(in0_V_TREADY),
+ .s_axis_tvalid(in0_V_TVALID),
+ .s_axis_tdata(in0_V_TDATA),
+ .m_axis_tready(out_V_TREADY),
+ .m_axis_tvalid(out_V_TVALID),
+ .m_axis_tdata(out_V_TDATA)
+);
+
+endmodule
diff --git a/finn-rtllib/swg/swg_template_axilite.v b/finn-rtllib/swg/swg_template_axilite.v
new file mode 100644
index 0000000000000000000000000000000000000000..9479c7f80d7d82b27141dbe5abcce442049237bd
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_axilite.v
@@ -0,0 +1,567 @@
+
+`timescale 1 ns / 1 ps
+
+module $TOP_MODULE_NAME$_axilite #
+(
+    // Users to add parameters here
+
+    // User parameters ends
+    // Do not modify the parameters beyond this line
+
+    // Width of S_AXI data bus
+    parameter integer C_S_AXI_DATA_WIDTH	= 32,
+    // Width of S_AXI address bus
+    parameter integer C_S_AXI_ADDR_WIDTH	= 6
+)
+(
+    // Users to add ports here
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg0,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg1,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg2,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg3,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg4,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg5,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg6,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg7,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg8,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg9,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg10,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg11,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg12,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg13,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg14,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg15,
+
+    // User ports ends
+    // Do not modify the ports beyond this line
+
+    // Global Clock Signal
+    input wire  S_AXI_ACLK,
+    // Global Reset Signal. This Signal is Active LOW
+    input wire  S_AXI_ARESETN,
+    // Write address (issued by the master, accepted by the slave)
+    input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,
+    // Write channel Protection type. This signal indicates the
+        // privilege and security level of the transaction, and whether
+        // the transaction is a data access or an instruction access.
+    input wire [2 : 0] S_AXI_AWPROT,
+    // Write address valid. This signal indicates that the master is
+        // signaling valid write address and control information.
+    input wire  S_AXI_AWVALID,
+    // Write address ready. This signal indicates that the slave is ready
+        // to accept an address and associated control signals.
+    output wire  S_AXI_AWREADY,
+    // Write data (issued by the master, accepted by the slave)
+    input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,
+    // Write strobes. This signal indicates which byte lanes hold
+        // valid data. There is one write strobe bit for each eight
+        // bits of the write data bus.
+    input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,
+    // Write valid. This signal indicates that valid write
+        // data and strobes are available.
+    input wire  S_AXI_WVALID,
+    // Write ready. This signal indicates that the slave
+        // can accept the write data.
+    output wire  S_AXI_WREADY,
+    // Write response. This signal indicates the status
+        // of the write transaction.
+    output wire [1 : 0] S_AXI_BRESP,
+    // Write response valid. This signal indicates that the channel
+        // is signaling a valid write response.
+    output wire  S_AXI_BVALID,
+    // Response ready. This signal indicates that the master
+        // can accept a write response.
+    input wire  S_AXI_BREADY,
+    // Read address (issued by the master, accepted by the slave)
+    input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,
+    // Protection type. This signal indicates the privilege
+        // and security level of the transaction, and whether the
+        // transaction is a data access or an instruction access.
+    input wire [2 : 0] S_AXI_ARPROT,
+    // Read address valid. This signal indicates that the channel
+        // is signaling valid read address and control information.
+    input wire  S_AXI_ARVALID,
+    // Read address ready. This signal indicates that the slave is
+        // ready to accept an address and associated control signals.
+    output wire  S_AXI_ARREADY,
+    // Read data (issued by slave)
+    output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,
+    // Read response. This signal indicates the status of the
+        // read transfer.
+    output wire [1 : 0] S_AXI_RRESP,
+    // Read valid. This signal indicates that the channel is
+        // signaling the required read data.
+    output wire  S_AXI_RVALID,
+    // Read ready. This signal indicates that the master can
+        // accept the read data and response information.
+    input wire  S_AXI_RREADY
+);
+
+// AXI4LITE signals
+reg [C_S_AXI_ADDR_WIDTH-1 : 0] 	axi_awaddr;
+reg  	axi_awready;
+reg  	axi_wready;
+reg [1 : 0] 	axi_bresp;
+reg  	axi_bvalid;
+reg [C_S_AXI_ADDR_WIDTH-1 : 0] 	axi_araddr;
+reg  	axi_arready;
+reg [C_S_AXI_DATA_WIDTH-1 : 0] 	axi_rdata;
+reg [1 : 0] 	axi_rresp;
+reg  	axi_rvalid;
+
+// Example-specific design signals
+// local parameter for addressing 32 bit / 64 bit C_S_AXI_DATA_WIDTH
+// ADDR_LSB is used for addressing 32/64 bit registers/memories
+// ADDR_LSB = 2 for 32 bits (n downto 2)
+// ADDR_LSB = 3 for 64 bits (n downto 3)
+localparam integer ADDR_LSB = (C_S_AXI_DATA_WIDTH/32) + 1;
+localparam integer OPT_MEM_ADDR_BITS = 3;
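+// With the default C_S_AXI_DATA_WIDTH = 32, ADDR_LSB = 2 and slave register k
+// is addressed at byte offset k*4 (slv_reg0 at 0x00 ... slv_reg15 at 0x3C).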
+//----------------------------------------------
+//-- Signals for user logic register space example
+//------------------------------------------------
+//-- Number of Slave Registers 16
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg0;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg1;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg2;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg3;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg4;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg5;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg6;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg7;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg8;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg9;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg10;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg11;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg12;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg13;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg14;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg15;
+wire	 slv_reg_rden;
+wire	 slv_reg_wren;
+reg [C_S_AXI_DATA_WIDTH-1:0]	 reg_data_out;
+integer	 byte_index;
+reg	 aw_en;
+
+// I/O Connections assignments
+
+assign S_AXI_AWREADY	= axi_awready;
+assign S_AXI_WREADY	= axi_wready;
+assign S_AXI_BRESP	= axi_bresp;
+assign S_AXI_BVALID	= axi_bvalid;
+assign S_AXI_ARREADY	= axi_arready;
+assign S_AXI_RDATA	= axi_rdata;
+assign S_AXI_RRESP	= axi_rresp;
+assign S_AXI_RVALID	= axi_rvalid;
+// Implement axi_awready generation
+// axi_awready is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_AWVALID and S_AXI_WVALID are asserted. axi_awready is
+// de-asserted when reset is low.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_awready <= 1'b0;
+        aw_en <= 1'b1;
+    end
+    else
+    begin
+        if (~axi_awready && S_AXI_AWVALID && S_AXI_WVALID && aw_en)
+        begin
+            // slave is ready to accept write address when
+            // there is a valid write address and write data
+            // on the write address and data bus. This design
+            // expects no outstanding transactions.
+            axi_awready <= 1'b1;
+            aw_en <= 1'b0;
+        end
+        else if (S_AXI_BREADY && axi_bvalid)
+            begin
+                aw_en <= 1'b1;
+                axi_awready <= 1'b0;
+            end
+        else
+        begin
+            axi_awready <= 1'b0;
+        end
+    end
+end
+
+// Implement axi_awaddr latching
+// This process is used to latch the address when both
+// S_AXI_AWVALID and S_AXI_WVALID are valid.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_awaddr <= 0;
+    end
+    else
+    begin
+        if (~axi_awready && S_AXI_AWVALID && S_AXI_WVALID && aw_en)
+        begin
+            // Write Address latching
+            axi_awaddr <= S_AXI_AWADDR;
+        end
+    end
+end
+
+// Implement axi_wready generation
+// axi_wready is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_AWVALID and S_AXI_WVALID are asserted. axi_wready is
+// de-asserted when reset is low.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_wready <= 1'b0;
+    end
+    else
+    begin
+        if (~axi_wready && S_AXI_WVALID && S_AXI_AWVALID && aw_en )
+        begin
+            // slave is ready to accept write data when
+            // there is a valid write address and write data
+            // on the write address and data bus. This design
+            // expects no outstanding transactions.
+            axi_wready <= 1'b1;
+        end
+        else
+        begin
+            axi_wready <= 1'b0;
+        end
+    end
+end
+
+// Implement memory mapped register select and write logic generation
+// The write data is accepted and written to memory mapped registers when
+// axi_awready, S_AXI_AWVALID, axi_wready and S_AXI_WVALID are asserted. Write strobes are used to
+// select byte enables of slave registers while writing.
+// These registers are cleared when reset (active low) is applied.
+// Slave register write enable is asserted when valid address and data are available
+// and the slave is ready to accept the write address and write data.
+assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        slv_reg0 <= 0;
+        slv_reg1 <= 0;
+        slv_reg2 <= 0;
+        slv_reg3 <= 0;
+        slv_reg4 <= 0;
+        slv_reg5 <= 0;
+        slv_reg6 <= 0;
+        slv_reg7 <= 0;
+        slv_reg8 <= 0;
+        slv_reg9 <= 0;
+        slv_reg10 <= 0;
+        slv_reg11 <= 0;
+        slv_reg12 <= 0;
+        slv_reg13 <= 0;
+        slv_reg14 <= 0;
+        slv_reg15 <= 0;
+    end
+    else begin
+    if (slv_reg_wren)
+        begin
+        case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
+            4'h0:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 0
+                slv_reg0[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h1:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 1
+                slv_reg1[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h2:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 2
+                slv_reg2[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h3:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 3
+                slv_reg3[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h4:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 4
+                slv_reg4[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h5:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 5
+                slv_reg5[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h6:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 6
+                slv_reg6[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h7:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 7
+                slv_reg7[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h8:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 8
+                slv_reg8[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h9:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 9
+                slv_reg9[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hA:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 10
+                slv_reg10[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hB:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 11
+                slv_reg11[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hC:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 12
+                slv_reg12[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hD:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 13
+                slv_reg13[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hE:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 14
+                slv_reg14[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hF:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 15
+                slv_reg15[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            default : begin
+                        slv_reg0 <= slv_reg0;
+                        slv_reg1 <= slv_reg1;
+                        slv_reg2 <= slv_reg2;
+                        slv_reg3 <= slv_reg3;
+                        slv_reg4 <= slv_reg4;
+                        slv_reg5 <= slv_reg5;
+                        slv_reg6 <= slv_reg6;
+                        slv_reg7 <= slv_reg7;
+                        slv_reg8 <= slv_reg8;
+                        slv_reg9 <= slv_reg9;
+                        slv_reg10 <= slv_reg10;
+                        slv_reg11 <= slv_reg11;
+                        slv_reg12 <= slv_reg12;
+                        slv_reg13 <= slv_reg13;
+                        slv_reg14 <= slv_reg14;
+                        slv_reg15 <= slv_reg15;
+                    end
+        endcase
+        end
+    end
+end
+
+// Implement write response logic generation
+// The write response and response valid signals are asserted by the slave
+// when axi_awready, S_AXI_AWVALID, axi_wready and S_AXI_WVALID are asserted.
+// This marks the acceptance of address and indicates the status of
+// write transaction.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_bvalid  <= 0;
+        axi_bresp   <= 2'b0;
+    end
+    else
+    begin
+        if (axi_awready && S_AXI_AWVALID && ~axi_bvalid && axi_wready && S_AXI_WVALID)
+        begin
+            // indicates a valid write response is available
+            axi_bvalid <= 1'b1;
+            axi_bresp  <= 2'b0; // 'OKAY' response
+        end                   // TODO: generate error responses in the future
+        else
+        begin
+            if (S_AXI_BREADY && axi_bvalid)
+            //check if bready is asserted while bvalid is high)
+            //(there is a possibility that bready is always asserted high)
+            begin
+                axi_bvalid <= 1'b0;
+            end
+        end
+    end
+end
+
+// Implement axi_arready generation
+// axi_arready is asserted for one S_AXI_ACLK clock cycle when
+// S_AXI_ARVALID is asserted. axi_arready is
+// de-asserted when reset (active low) is asserted.
+// The read address is also latched when S_AXI_ARVALID is
+// asserted. axi_araddr is reset to zero on reset assertion.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_arready <= 1'b0;
+        axi_araddr  <= 32'b0;
+    end
+    else
+    begin
+        if (~axi_arready && S_AXI_ARVALID)
+        begin
+            // indicates that the slave has accepted the valid read address
+            axi_arready <= 1'b1;
+            // Read address latching
+            axi_araddr  <= S_AXI_ARADDR;
+        end
+        else
+        begin
+            axi_arready <= 1'b0;
+        end
+    end
+end
+
+// Implement axi_rvalid generation
+// axi_rvalid is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_ARVALID and axi_arready are asserted. The slave registers
+// data are available on the axi_rdata bus at this instance. The
+// assertion of axi_rvalid marks the validity of read data on the
+// bus and axi_rresp indicates the status of the read transaction. axi_rvalid
+// is deasserted on reset (active low). axi_rresp and axi_rdata are
+// cleared to zero on reset (active low).
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_rvalid <= 0;
+        axi_rresp  <= 0;
+    end
+    else
+    begin
+        if (axi_arready && S_AXI_ARVALID && ~axi_rvalid)
+        begin
+            // Valid read data is available at the read data bus
+            axi_rvalid <= 1'b1;
+            axi_rresp  <= 2'b0; // 'OKAY' response
+        end
+        else if (axi_rvalid && S_AXI_RREADY)
+        begin
+            // Read data is accepted by the master
+            axi_rvalid <= 1'b0;
+        end
+    end
+end
+
+// Implement memory mapped register select and read logic generation
+// Slave register read enable is asserted when valid address is available
+// and the slave is ready to accept the read address.
+assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;
+always @(*)
+begin
+        // Address decoding for reading registers
+        case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
+        4'h0   : reg_data_out <= slv_reg0;
+        4'h1   : reg_data_out <= slv_reg1;
+        4'h2   : reg_data_out <= slv_reg2;
+        4'h3   : reg_data_out <= slv_reg3;
+        4'h4   : reg_data_out <= slv_reg4;
+        4'h5   : reg_data_out <= slv_reg5;
+        4'h6   : reg_data_out <= slv_reg6;
+        4'h7   : reg_data_out <= slv_reg7;
+        4'h8   : reg_data_out <= slv_reg8;
+        4'h9   : reg_data_out <= slv_reg9;
+        4'hA   : reg_data_out <= slv_reg10;
+        4'hB   : reg_data_out <= slv_reg11;
+        4'hC   : reg_data_out <= slv_reg12;
+        4'hD   : reg_data_out <= slv_reg13;
+        4'hE   : reg_data_out <= slv_reg14;
+        4'hF   : reg_data_out <= slv_reg15;
+        default : reg_data_out <= 0;
+        endcase
+end
+
+// Output register or memory read data
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_rdata  <= 0;
+    end
+    else
+    begin
+        // When there is a valid read address (S_AXI_ARVALID) with
+        // acceptance of read address by the slave (axi_arready),
+        // output the read data
+        if (slv_reg_rden)
+        begin
+            axi_rdata <= reg_data_out;     // register read data
+        end
+    end
+end
+
+// Add user logic here
+assign	cfg_reg0 = slv_reg0;
+assign	cfg_reg1 = slv_reg1;
+assign	cfg_reg2 = slv_reg2;
+assign	cfg_reg3 = slv_reg3;
+assign	cfg_reg4 = slv_reg4;
+assign	cfg_reg5 = slv_reg5;
+assign	cfg_reg6 = slv_reg6;
+assign	cfg_reg7 = slv_reg7;
+assign	cfg_reg8 = slv_reg8;
+assign	cfg_reg9 = slv_reg9;
+assign	cfg_reg10 = slv_reg10;
+assign	cfg_reg11 = slv_reg11;
+assign	cfg_reg12 = slv_reg12;
+assign	cfg_reg13 = slv_reg13;
+assign	cfg_reg14 = slv_reg14;
+assign	cfg_reg15 = slv_reg15;
+// User logic ends
+
+endmodule
diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv
index 97517438a0c261e4488b74a677a352f9dc51743b..06e65e911100dd7d3d8879b014a6d59713eb9bbd 100644
--- a/finn-rtllib/swg/swg_template_default.sv
+++ b/finn-rtllib/swg/swg_template_default.sv
@@ -36,7 +36,6 @@ module $TOP_MODULE_NAME$_controller #(
     int unsigned  LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$,
 
     int unsigned  INCR_BITWIDTH = $INCR_BITWIDTH$,
-    bit [INCR_BITWIDTH-1:0]  ADDR_INCREMENT_MAP[6] = $ADDR_INCREMENT_MAP$,
 
     bit IS_DEPTHWISE = $IS_DEPTHWISE$
 )(
@@ -60,26 +59,31 @@ module $TOP_MODULE_NAME$_controller #(
     state_e  State = $INNERMOST_STATE$;
     state_e  state_next;
 
-    logic signed [$clog2(LOOP_H_ITERATIONS   +2)+1-1:0]  Counter_loop_h    = LOOP_H_ITERATIONS-1;
-    logic signed [$clog2(LOOP_W_ITERATIONS   +2)+1-1:0]  Counter_loop_w    = LOOP_W_ITERATIONS-1;
-    logic signed [$clog2(LOOP_KH_ITERATIONS  +2)+1-1:0]  Counter_loop_kh   = LOOP_KH_ITERATIONS-1;
-    logic signed [$clog2(LOOP_KW_ITERATIONS  +2)+1-1:0]  Counter_loop_kw   = LOOP_KW_ITERATIONS-1;
-    logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0]  Counter_loop_simd = LOOP_SIMD_ITERATIONS-1;
-
-    assign  addr_incr = ADDR_INCREMENT_MAP[State];
+    logic signed [$clog2(LOOP_H_ITERATIONS   +2)+1-1:0]  Counter_loop_h    = LOOP_H_ITERATIONS;
+    logic signed [$clog2(LOOP_W_ITERATIONS   +2)+1-1:0]  Counter_loop_w    = LOOP_W_ITERATIONS;
+    logic signed [$clog2(LOOP_KH_ITERATIONS  +2)+1-1:0]  Counter_loop_kh   = LOOP_KH_ITERATIONS;
+    logic signed [$clog2(LOOP_KW_ITERATIONS  +2)+1-1:0]  Counter_loop_kw   = LOOP_KW_ITERATIONS;
+    logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0]  Counter_loop_simd = LOOP_SIMD_ITERATIONS;
+
+    // combinational logic for addr_incr generation
+    always_comb begin : blkHead
+        unique case (State)
+            0 : addr_incr = 0;
+            1 : addr_incr = $HEAD_INCR_SIMD$;
+            2 : addr_incr = $HEAD_INCR_KW$;
+            3 : addr_incr = $HEAD_INCR_KH$;
+            4 : addr_incr = $HEAD_INCR_W$;
+            5 : addr_incr = $HEAD_INCR_H$;
+        endcase
+    end
 
     // combinational logic for tail_incr generation
     uwire  tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
-    always_comb begin : blkTail
-        if (tail_incr_inner_condition)
-            tail_incr = 1;
-        else if (Counter_loop_w >= 0)
-            tail_incr = $TAIL_INCR_W$;
-        else if (Counter_loop_h >= 0)
-            tail_incr = $TAIL_INCR_H$;
-        else
-            tail_incr = $TAIL_INCR_LAST$;
-    end
+    assign tail_incr =
+        tail_incr_inner_condition? 1 :
+        Counter_loop_w >= 0?       $TAIL_INCR_W$ :
+        Counter_loop_h >= 0?       $TAIL_INCR_H$ :
+        /* else */                 $TAIL_INCR_LAST$;
 
     // combinational next state logic
     always_comb begin : blkState
@@ -101,29 +105,29 @@ module $TOP_MODULE_NAME$_controller #(
     always_ff @ (posedge clk) begin
         if(!rst_n) begin
             State <= $INNERMOST_STATE$;
-            Counter_loop_h    <= LOOP_H_ITERATIONS-1;
-            Counter_loop_w    <= LOOP_W_ITERATIONS-1;
-            Counter_loop_kh   <= LOOP_KH_ITERATIONS-1;
-            Counter_loop_kw   <= LOOP_KW_ITERATIONS-1;
-            Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1;
+            Counter_loop_h    <= LOOP_H_ITERATIONS;
+            Counter_loop_w    <= LOOP_W_ITERATIONS;
+            Counter_loop_kh   <= LOOP_KH_ITERATIONS;
+            Counter_loop_kw   <= LOOP_KW_ITERATIONS;
+            Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
         end
         else if(advance) begin
             State <= state_next;
             if (State == $INNERMOST_STATE$) begin
                 if(Counter_loop_simd >= 0)  Counter_loop_simd <= Counter_loop_simd-1;
                 else begin
-                    Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1;
+                    Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
                     if(Counter_loop_kw >= 0)  Counter_loop_kw <= Counter_loop_kw-1;
                     else begin
-                        Counter_loop_kw <= LOOP_KW_ITERATIONS-1;
+                        Counter_loop_kw <= LOOP_KW_ITERATIONS;
                         if(Counter_loop_kh >= 0)  Counter_loop_kh <= Counter_loop_kh-1;
                         else begin
-                            Counter_loop_kh <= LOOP_KH_ITERATIONS-1;
+                            Counter_loop_kh <= LOOP_KH_ITERATIONS;
                             if(Counter_loop_w >= 0)  Counter_loop_w <= Counter_loop_w-1;
                             else begin
-                                Counter_loop_w <= LOOP_W_ITERATIONS-1;
+                                Counter_loop_w <= LOOP_W_ITERATIONS;
                                 if(Counter_loop_h >= 0)  Counter_loop_h <= Counter_loop_h-1;
-                                else  Counter_loop_h <= LOOP_H_ITERATIONS-1;
+                                else  Counter_loop_h <= LOOP_H_ITERATIONS;
                             end
                         end
                     end
@@ -139,7 +143,6 @@ module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
     int unsigned  DEPTH
 )(
     input   logic  clk,
-    input   logic  rst_n,
 
     input   logic  write_enable,
     input   logic [$clog2(DEPTH)-1:0] write_addr,
@@ -182,7 +185,7 @@ module $TOP_MODULE_NAME$_impl #(
     input   logic  out_V_V_TREADY,
     output  logic [BIT_WIDTH * SIMD * MMV_OUT-1:0]  out_V_V_TDATA
 );
-    // derived Constants
+    // derived constants
     localparam int unsigned  BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
     localparam int unsigned  BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
     localparam int unsigned  BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
@@ -199,7 +202,6 @@ module $TOP_MODULE_NAME$_impl #(
         .DEPTH(BUF_ELEM_TOTAL)
     ) window_buffer_inst (
         .clk(ap_clk),
-        .rst_n(ap_rst_n),
 
         .write_enable(window_buffer_write_enable),
         .write_addr(window_buffer_write_addr),
@@ -234,6 +236,15 @@ module $TOP_MODULE_NAME$_impl #(
     logic        [$clog2(BUF_ELEM_TOTAL)-1:0]      Window_buffer_write_addr_reg = 0;
 
     // Control signals/registers
+    logic  Write_cmd    = 0;
+    logic  Writing_done = 0;
+    uwire  write_ok      = Write_cmd &&  out_V_V_TREADY;
+    uwire  write_blocked = Write_cmd && !out_V_V_TREADY;
+
+    logic  Fetching_done = 0;
+    uwire  fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done;
+
+    uwire  reading_done = Newest_buffered_elem == LAST_READ_ELEM;
     uwire  read_cmd =
         !reading_done && ( // if there is still an input element left to read
             Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride)
@@ -242,15 +253,6 @@ module $TOP_MODULE_NAME$_impl #(
             ) // (over-)write to buffer if oldest buffered element will no longer be needed
         );
     uwire  read_ok      = read_cmd && in0_V_V_TVALID;
-    uwire  reading_done = Newest_buffered_elem == LAST_READ_ELEM;
-
-    uwire  fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done;
-    logic  Fetching_done = 0;
-
-    logic  Write_cmd    = 0;
-    logic  Writing_done = 0;
-    uwire  write_ok      = Write_cmd &&  out_V_V_TREADY;
-    uwire  write_blocked = Write_cmd && !out_V_V_TREADY;;
 
     //assign buffer control
     assign  window_buffer_write_addr = Window_buffer_write_addr_reg;
diff --git a/finn-rtllib/swg/swg_template_default_dynamic.sv b/finn-rtllib/swg/swg_template_default_dynamic.sv
new file mode 100644
index 0000000000000000000000000000000000000000..eb53978b580a4753bbea6c8478f35912deb812b4
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_default_dynamic.sv
@@ -0,0 +1,416 @@
+module $TOP_MODULE_NAME$_controller #(
+    int unsigned  CNTR_BITWIDTH,
+    int unsigned  INCR_BITWIDTH,
+
+    bit IS_DEPTHWISE = $IS_DEPTHWISE$
+)(
+    input   logic  clk,
+    input   logic  rst_n,
+
+    input   logic  advance,
+    output  logic [INCR_BITWIDTH-1:0]  addr_incr,
+    output  logic [INCR_BITWIDTH-1:0]  tail_incr,
+
+    input logic                     cfg_valid,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_simd,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kw,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kh,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_w,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_simd,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kw,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kh,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last
+);
+
+    // (dynamic) configuration registers
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_simd      = $LOOP_SIMD_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kw        = $LOOP_KW_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kh        = $LOOP_KH_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_w         = $LOOP_W_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_h         = $LOOP_H_ITERATIONS$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_simd = $HEAD_INCR_SIMD$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_kw   = $HEAD_INCR_KW$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_kh   = $HEAD_INCR_KH$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_w    = $HEAD_INCR_W$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_h    = $HEAD_INCR_H$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_w    = $TAIL_INCR_W$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_h    = $TAIL_INCR_H$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_last = $TAIL_INCR_LAST$;
+
+    // configuration reset/set logic
+    always_ff @ (posedge clk) begin
+        if(cfg_valid) begin
+            Cfg_cntr_simd      <= cfg_cntr_simd;
+            Cfg_cntr_kw        <= cfg_cntr_kw;
+            Cfg_cntr_kh        <= cfg_cntr_kh;
+            Cfg_cntr_w         <= cfg_cntr_w;
+            Cfg_cntr_h         <= cfg_cntr_h;
+            Cfg_incr_head_simd <= cfg_incr_head_simd;
+            Cfg_incr_head_kw   <= cfg_incr_head_kw;
+            Cfg_incr_head_kh   <= cfg_incr_head_kh;
+            Cfg_incr_head_w    <= cfg_incr_head_w;
+            Cfg_incr_head_h    <= cfg_incr_head_h;
+            Cfg_incr_tail_w    <= cfg_incr_tail_w;
+            Cfg_incr_tail_h    <= cfg_incr_tail_h;
+            Cfg_incr_tail_last <= cfg_incr_tail_last;
+        end
+    end
+
+    // state and counters
+    typedef enum logic [2:0] {
+        STATE_START,
+        STATE_LOOP_SIMD,
+        STATE_LOOP_KW,
+        STATE_LOOP_KH,
+        STATE_LOOP_W,
+        STATE_LOOP_H
+    }  state_e;
+    state_e  State = $INNERMOST_STATE$;
+    state_e  state_next;
+
+    logic signed [$clog2($LOOP_H_ITERATIONS$   +2)+1-1:0]  Counter_loop_h    = $LOOP_H_ITERATIONS$;
+    logic signed [$clog2($LOOP_W_ITERATIONS$   +2)+1-1:0]  Counter_loop_w    = $LOOP_W_ITERATIONS$;
+    logic signed [$clog2($LOOP_KH_ITERATIONS$  +2)+1-1:0]  Counter_loop_kh   = $LOOP_KH_ITERATIONS$;
+    logic signed [$clog2($LOOP_KW_ITERATIONS$  +2)+1-1:0]  Counter_loop_kw   = $LOOP_KW_ITERATIONS$;
+    logic signed [$clog2($LOOP_SIMD_ITERATIONS$+2)+1-1:0]  Counter_loop_simd = $LOOP_SIMD_ITERATIONS$;
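+    // Each counter above carries one extra sign bit so it can count down to -1,
+    // which the ">= 0" / "< 0" checks in the FSM use as the loop-exit sentinel.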
+
+    // combinational logic for addr_incr generation
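+    // (case labels follow the declaration order of state_e:
+    //  0 = STATE_START, 1 = STATE_LOOP_SIMD, 2 = STATE_LOOP_KW,
+    //  3 = STATE_LOOP_KH, 4 = STATE_LOOP_W, 5 = STATE_LOOP_H)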
+    always_comb begin : blkHead
+        unique case (State)
+            0 : addr_incr = 0;
+            1 : addr_incr = Cfg_incr_head_simd;
+            2 : addr_incr = Cfg_incr_head_kw;
+            3 : addr_incr = Cfg_incr_head_kh;
+            4 : addr_incr = Cfg_incr_head_w;
+            5 : addr_incr = Cfg_incr_head_h;
+        endcase
+    end
+
+    // combinational logic for tail_incr generation
+    uwire  tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
+    assign tail_incr =
+        tail_incr_inner_condition? 1 :
+        Counter_loop_w >= 0?       Cfg_incr_tail_w :
+        Counter_loop_h >= 0?       Cfg_incr_tail_h :
+        /* else */                 Cfg_incr_tail_last;
+
+    // combinational next state logic
+    always_comb begin : blkState
+        state_next = State;
+        if(State != $INNERMOST_STATE$)  state_next = $INNERMOST_STATE$;
+        else begin
+            if(Counter_loop_simd < 0) begin
+                state_next =
+                    (Counter_loop_kw >= 0)? STATE_LOOP_KW :
+                    (Counter_loop_kh >= 0)? STATE_LOOP_KH :
+                    (Counter_loop_w  >= 0)? STATE_LOOP_W :
+                    (Counter_loop_h  >= 0)? STATE_LOOP_H :
+                    /* else */              STATE_START;
+            end
+        end
+    end : blkState
+
+    // sequential logic
+    always_ff @ (posedge clk) begin
+        if(!rst_n) begin
+            State <= $INNERMOST_STATE$;
+            Counter_loop_h    <= Cfg_cntr_h;
+            Counter_loop_w    <= Cfg_cntr_w;
+            Counter_loop_kh   <= Cfg_cntr_kh;
+            Counter_loop_kw   <= Cfg_cntr_kw;
+            Counter_loop_simd <= Cfg_cntr_simd;
+        end
+        else if(advance) begin
+            State <= state_next;
+            if (State == $INNERMOST_STATE$) begin
+                if(Counter_loop_simd >= 0)  Counter_loop_simd <= Counter_loop_simd-1;
+                else begin
+                    Counter_loop_simd <= Cfg_cntr_simd;
+                    if(Counter_loop_kw >= 0)  Counter_loop_kw <= Counter_loop_kw-1;
+                    else begin
+                        Counter_loop_kw <= Cfg_cntr_kw;
+                        if(Counter_loop_kh >= 0)  Counter_loop_kh <= Counter_loop_kh-1;
+                        else begin
+                            Counter_loop_kh <= Cfg_cntr_kh;
+                            if(Counter_loop_w >= 0)  Counter_loop_w <= Counter_loop_w-1;
+                            else begin
+                                Counter_loop_w <= Cfg_cntr_w;
+                                if(Counter_loop_h >= 0)  Counter_loop_h <= Counter_loop_h-1;
+                                else  Counter_loop_h <= Cfg_cntr_h;
+                            end
+                        end
+                    end
+                end
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_controller
+
+module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+    int unsigned  WIDTH,
+    int unsigned  DEPTH
+)(
+    input   logic  clk,
+
+    input   logic  write_enable,
+    input   logic [$clog2(DEPTH)-1:0] write_addr,
+    input   logic [WIDTH-1:0]  data_in,
+
+    input   logic  read_enable,
+    input   logic [$clog2(DEPTH)-1:0]  read_addr, // absolute (!) read address of cyclic buffer
+    output  logic [WIDTH-1:0]  data_out
+);
+
+    $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
+    logic [WIDTH-1:0]  Out = 'x;
+    always_ff @(posedge clk) begin
+        if (read_enable)  Out <= Ram[read_addr];
+        if (write_enable) Ram[write_addr] <= data_in;
+    end
+    assign  data_out = Out;
+
+endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
+
+module $TOP_MODULE_NAME$_impl #(
+    int  BIT_WIDTH,
+    int  SIMD,
+    int  MMV_IN,
+    int  MMV_OUT,
+    int unsigned  CNTR_BITWIDTH,
+    int unsigned  INCR_BITWIDTH,
+
+    int  LAST_READ_ELEM = $LAST_READ_ELEM$,
+    int  LAST_WRITE_ELEM = $LAST_WRITE_ELEM$,
+    int  BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$,
+    int  ELEM_PER_WINDOW = $ELEM_PER_WINDOW$
+)(
+    input   logic  ap_clk,
+    input   logic  ap_rst_n,
+
+    input   logic  in0_V_V_TVALID,
+    output  logic  in0_V_V_TREADY,
+    input   logic [BIT_WIDTH * SIMD * MMV_IN-1:0]  in0_V_V_TDATA,
+
+    output  logic  out_V_V_TVALID,
+    input   logic  out_V_V_TREADY,
+    output  logic [BIT_WIDTH * SIMD * MMV_OUT-1:0]  out_V_V_TDATA,
+
+    input logic                     cfg_valid,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_simd,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kw,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kh,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_w,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_simd,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kw,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kh,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last,
+    input logic [31:0]              cfg_last_read,
+    input logic [31:0]              cfg_last_write
+);
+    // derived constants
+    localparam int unsigned  BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+    localparam int unsigned  BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
+    localparam int unsigned  BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+    // (dynamic) configuration registers
+    logic [31:0] Cfg_last_read  = LAST_READ_ELEM;
+    logic [31:0] Cfg_last_write = LAST_WRITE_ELEM;
+
+    // configuration reset/set logic
+    always_ff @ (posedge ap_clk) begin
+        if(cfg_valid) begin
+            Cfg_last_read  <= cfg_last_read;
+            Cfg_last_write <= cfg_last_write;
+        end
+    end
+
+    // main buffer instantiation
+    uwire [BUF_IN_WIDTH -1:0]  window_buffer_in;
+    uwire [BUF_OUT_WIDTH-1:0]  window_buffer_out;
+    uwire  window_buffer_write_enable;
+    uwire  window_buffer_read_enable;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_write_addr;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_read_addr;
+    $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+        .WIDTH(BUF_IN_WIDTH),
+        .DEPTH(BUF_ELEM_TOTAL)
+    ) window_buffer_inst (
+        .clk(ap_clk),
+
+        .write_enable(window_buffer_write_enable),
+        .write_addr(window_buffer_write_addr),
+        .data_in(window_buffer_in),
+
+        .read_enable(window_buffer_read_enable),
+        .read_addr(window_buffer_read_addr),
+        .data_out(window_buffer_out)
+    );
+
+    //controller instantiation
+    uwire  advance_controller;
+    uwire signed [INCR_BITWIDTH-1:0]  addr_incr;
+    uwire        [INCR_BITWIDTH-1:0]  tail_incr;
+    $TOP_MODULE_NAME$_controller #(
+        .CNTR_BITWIDTH(CNTR_BITWIDTH),
+        .INCR_BITWIDTH(INCR_BITWIDTH)
+    ) controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr),
+
+        .cfg_valid(cfg_valid),
+        .cfg_cntr_simd(cfg_cntr_simd),
+        .cfg_cntr_kw(cfg_cntr_kw),
+        .cfg_cntr_kh(cfg_cntr_kh),
+        .cfg_cntr_w(cfg_cntr_w),
+        .cfg_cntr_h(cfg_cntr_h),
+        .cfg_incr_head_simd(cfg_incr_head_simd),
+        .cfg_incr_head_kw(cfg_incr_head_kw),
+        .cfg_incr_head_kh(cfg_incr_head_kh),
+        .cfg_incr_head_w(cfg_incr_head_w),
+        .cfg_incr_head_h(cfg_incr_head_h),
+        .cfg_incr_tail_w(cfg_incr_tail_w),
+        .cfg_incr_tail_h(cfg_incr_tail_h),
+        .cfg_incr_tail_last(cfg_incr_tail_last)
+    );
+
+    // Counters/address registers
+    // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg,
+    // so we can use automatic sign extension and simplify calculations w/ signed increment.
+    // Alternatively, we could manually sign-extend and shave off a bit here or there.
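+    // Example: with the extra sign bit, "Current_elem + addr_incr" handles a
+    // negative addr_incr (e.g. -3) directly, with no manual sign-extension.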
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0]  Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  Current_elem = 0;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  First_elem_next_window = 0;
+    logic        [$clog2(ELEM_PER_WINDOW)   -1:0]  Position_in_window = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)+1  -1:0]  Window_buffer_read_addr_reg = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)-1:0]      Window_buffer_write_addr_reg = 0;
+
+    // Control signals/registers
+    logic  Write_cmd    = 0;
+    logic  Writing_done = 0;
+    uwire  write_ok      = Write_cmd &&  out_V_V_TREADY;
+    uwire  write_blocked = Write_cmd && !out_V_V_TREADY;
+
+    logic  Fetching_done = 0;
+    uwire  fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done;
+
+    uwire  reading_done = Newest_buffered_elem == Cfg_last_read;
+    uwire  read_cmd =
+        !reading_done && ( // if there is still an input element left to read
+            Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride)
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) &&
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem)
+            ) // (over-)write to buffer if oldest buffered element will no longer be needed
+        );
+    uwire  read_ok      = read_cmd && in0_V_V_TVALID;
+
+    //assign buffer control
+    assign  window_buffer_write_addr = Window_buffer_write_addr_reg;
+    assign  window_buffer_read_addr = Window_buffer_read_addr_reg;
+    assign  window_buffer_write_enable = read_ok;
+    assign  window_buffer_read_enable = fetch_cmd;
+    assign  advance_controller = fetch_cmd;
+
+    //assign I/O ports
+    assign  window_buffer_in = in0_V_V_TDATA;
+    assign  out_V_V_TDATA = window_buffer_out;
+    assign  in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+    assign  out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    //main process for advancing counters
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Newest_buffered_elem <= -1;
+            Current_elem <= 0;
+            First_elem_next_window <= 0;
+            Position_in_window <= 0;
+            Window_buffer_read_addr_reg <= 0;
+            Window_buffer_write_addr_reg <= 0;
+            Fetching_done <= 0;
+            Write_cmd <= 0;
+            Writing_done <= 0;
+        end
+        else begin
+            if (read_ok) begin
+                Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)? 0 : Window_buffer_write_addr_reg + 1;
+                Newest_buffered_elem <= Newest_buffered_elem+1;
+
+                if (Newest_buffered_elem == Cfg_last_read-1) begin
+                    Window_buffer_write_addr_reg <= 0;
+                end
+                //check if this is the last read cycle (reading_done will be true afterwards)
+                if ((Newest_buffered_elem == Cfg_last_read-1) && Writing_done) begin
+                    //start processing of next FM if writing is done already (possible due to unused input elements at the tail end)
+                    //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM)
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Writing_done <= 0;
+                    Fetching_done <= 0;
+                end
+            end
+
+            if (fetch_cmd) begin
+                //count up to track which element index is about to be read from the buffer, and where it is located within the buffer
+                //use increment value calculated by controller
+
+                // absolute buffer address wrap-around
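+                // e.g., BUF_ELEM_TOTAL=16: ra=19 is corrected to 3, ra=-3 to 13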
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0]  ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr);
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0]  ra_correct =
+                    (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL :
+                    (ra <               0)?  BUF_ELEM_TOTAL : 0;
+                Window_buffer_read_addr_reg <= ra + ra_correct;
+
+                //keep track where we are within a window
+                Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0;
+
+                //update first element of next window to allow buffer overwrite up until that point
+                if (Position_in_window == 0)
+                    First_elem_next_window <= First_elem_next_window + tail_incr;
+
+                //check if this is the last fetch cycle (Fetching_done will be true afterwards)
+                if (Current_elem == Cfg_last_write)
+                    Fetching_done <= 1;
+                else
+                    Current_elem <= $signed(Current_elem) + addr_incr;
+
+                // determine if prefetched data will be outstanding in the next cycle
+                // if we fetch in this cycle -> yes
+                // if we do not fetch nor write -> do not change
+                // if we do not fetch but write successfully -> clear outstanding data
+                Write_cmd <= fetch_cmd;
+            end
+
+            if (write_ok)
+                Write_cmd <= fetch_cmd;
+
+            if (write_ok && Fetching_done) begin
+                //check if this is the last write cycle (Writing_done will be true afterwards)
+                if (reading_done || (read_ok && (Newest_buffered_elem == Cfg_last_read - 1))) begin
+                    //start processing of next FM if reading is done already, or completes in the same cycle
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Fetching_done <= 0;
+                end else
+                    Writing_done <= 1;
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_impl
diff --git a/finn-rtllib/swg/swg_template_wrapper_dynamic.v b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
new file mode 100644
index 0000000000000000000000000000000000000000..ca870ace11edcf097645bc12b0486ffbb83b0ea4
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
@@ -0,0 +1,154 @@
+`timescale 1 ns / 1 ps
+
+module $TOP_MODULE_NAME$ #(
+    // top-level parameters (set via code-generation)
+    parameter BIT_WIDTH = $BIT_WIDTH$,
+    parameter SIMD = $SIMD$,
+    parameter MMV_IN = $MMV_IN$,
+    parameter MMV_OUT = $MMV_OUT$,
+
+    parameter CNTR_BITWIDTH = $CNTR_BITWIDTH$,
+    parameter INCR_BITWIDTH = $INCR_BITWIDTH$,
+
+    // derived constants
+    parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN,
+    parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT,
+
+    parameter integer C_s_axilite_DATA_WIDTH = 32,
+    parameter integer C_s_axilite_ADDR_WIDTH = 6
+)
+(
+    (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+    input  ap_clk,
+    (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+    input  ap_rst_n,
+    input  [BUF_IN_WIDTH-1:0] in0_V_TDATA,
+    input  in0_V_TVALID,
+    output in0_V_TREADY,
+    output [BUF_OUT_WIDTH-1:0] out_V_TDATA,
+    output out_V_TVALID,
+    input  out_V_TREADY,
+
+    // Ports of Axi Slave Bus Interface s_axilite
+    input  [C_s_axilite_ADDR_WIDTH-1 : 0] s_axilite_awaddr,
+    input  [2 : 0] s_axilite_awprot,
+    input  s_axilite_awvalid,
+    output s_axilite_awready,
+    input  [C_s_axilite_DATA_WIDTH-1 : 0] s_axilite_wdata,
+    input  [(C_s_axilite_DATA_WIDTH/8)-1 : 0] s_axilite_wstrb,
+    input  s_axilite_wvalid,
+    output s_axilite_wready,
+    output [1 : 0] s_axilite_bresp,
+    output s_axilite_bvalid,
+    input  s_axilite_bready,
+    input  [C_s_axilite_ADDR_WIDTH-1 : 0] s_axilite_araddr,
+    input  [2 : 0] s_axilite_arprot,
+    input  s_axilite_arvalid,
+    output s_axilite_arready,
+    output [C_s_axilite_DATA_WIDTH-1 : 0] s_axilite_rdata,
+    output [1 : 0] s_axilite_rresp,
+    output s_axilite_rvalid,
+    input  s_axilite_rready
+);
+
+wire                     cfg_valid;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_simd;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_kw;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_kh;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_w;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_h;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_simd;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_kw;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_kh;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_w;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_h;
+wire [INCR_BITWIDTH-1:0] cfg_incr_tail_w;
+wire [INCR_BITWIDTH-1:0] cfg_incr_tail_h;
+wire [INCR_BITWIDTH-1:0] cfg_incr_tail_last;
+wire [31:0]              cfg_last_read;
+wire [31:0]              cfg_last_write;
+
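+// Register map: cfg_regN of the AXI-Lite register file is accessed at byte
+// offset N*4 (reg0 at 0x00 ... reg15 at 0x3C), matching the addresses
+// emitted by get_dynamic_config() on the Python side.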
+// Instantiation of Axi Bus Interface s_axilite
+$TOP_MODULE_NAME$_axilite # (
+    .C_S_AXI_DATA_WIDTH(C_s_axilite_DATA_WIDTH),
+    .C_S_AXI_ADDR_WIDTH(C_s_axilite_ADDR_WIDTH)
+) axilite_cfg_inst (
+    .S_AXI_ACLK(ap_clk),
+    .S_AXI_ARESETN(ap_rst_n),
+    .S_AXI_AWADDR(s_axilite_awaddr),
+    .S_AXI_AWPROT(s_axilite_awprot),
+    .S_AXI_AWVALID(s_axilite_awvalid),
+    .S_AXI_AWREADY(s_axilite_awready),
+    .S_AXI_WDATA(s_axilite_wdata),
+    .S_AXI_WSTRB(s_axilite_wstrb),
+    .S_AXI_WVALID(s_axilite_wvalid),
+    .S_AXI_WREADY(s_axilite_wready),
+    .S_AXI_BRESP(s_axilite_bresp),
+    .S_AXI_BVALID(s_axilite_bvalid),
+    .S_AXI_BREADY(s_axilite_bready),
+    .S_AXI_ARADDR(s_axilite_araddr),
+    .S_AXI_ARPROT(s_axilite_arprot),
+    .S_AXI_ARVALID(s_axilite_arvalid),
+    .S_AXI_ARREADY(s_axilite_arready),
+    .S_AXI_RDATA(s_axilite_rdata),
+    .S_AXI_RRESP(s_axilite_rresp),
+    .S_AXI_RVALID(s_axilite_rvalid),
+    .S_AXI_RREADY(s_axilite_rready),
+
+    .cfg_reg0(cfg_valid),
+    .cfg_reg1(cfg_cntr_simd),
+    .cfg_reg2(cfg_cntr_kw),
+    .cfg_reg3(cfg_cntr_kh),
+    .cfg_reg4(cfg_cntr_w),
+    .cfg_reg5(cfg_cntr_h),
+    .cfg_reg6(cfg_incr_head_simd),
+    .cfg_reg7(cfg_incr_head_kw),
+    .cfg_reg8(cfg_incr_head_kh),
+    .cfg_reg9(cfg_incr_head_w),
+    .cfg_reg10(cfg_incr_head_h),
+    .cfg_reg11(cfg_incr_tail_w),
+    .cfg_reg12(cfg_incr_tail_h),
+    .cfg_reg13(cfg_incr_tail_last),
+    .cfg_reg14(cfg_last_read),
+    .cfg_reg15(cfg_last_write)
+);
+
+$TOP_MODULE_NAME$_impl #(
+    .BIT_WIDTH(BIT_WIDTH),
+    .SIMD(SIMD),
+    .MMV_IN(MMV_IN),
+    .MMV_OUT(MMV_OUT),
+    .CNTR_BITWIDTH(CNTR_BITWIDTH),
+    .INCR_BITWIDTH(INCR_BITWIDTH)
+) impl (
+    .ap_clk(ap_clk),
+    .ap_rst_n(ap_rst_n),
+    .in0_V_V_TDATA(in0_V_TDATA),
+    .in0_V_V_TVALID(in0_V_TVALID),
+    .in0_V_V_TREADY(in0_V_TREADY),
+    .out_V_V_TDATA(out_V_TDATA),
+    .out_V_V_TVALID(out_V_TVALID),
+    .out_V_V_TREADY(out_V_TREADY),
+
+    .cfg_valid(cfg_valid),
+    .cfg_cntr_simd(cfg_cntr_simd),
+    .cfg_cntr_kw(cfg_cntr_kw),
+    .cfg_cntr_kh(cfg_cntr_kh),
+    .cfg_cntr_w(cfg_cntr_w),
+    .cfg_cntr_h(cfg_cntr_h),
+    .cfg_incr_head_simd(cfg_incr_head_simd),
+    .cfg_incr_head_kw(cfg_incr_head_kw),
+    .cfg_incr_head_kh(cfg_incr_head_kh),
+    .cfg_incr_head_w(cfg_incr_head_w),
+    .cfg_incr_head_h(cfg_incr_head_h),
+    .cfg_incr_tail_w(cfg_incr_tail_w),
+    .cfg_incr_tail_h(cfg_incr_tail_h),
+    .cfg_incr_tail_last(cfg_incr_tail_last),
+    .cfg_last_read(cfg_last_read),
+    .cfg_last_write(cfg_last_write)
+);
+
+endmodule // $TOP_MODULE_NAME$
diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb
index c27f8bdca788e6404fbc01e226b06e8cfaaba066..051a406708ee7b4bcbd548b39acac000b473c7cf 100644
--- a/notebooks/advanced/2_custom_op.ipynb
+++ b/notebooks/advanced/2_custom_op.ipynb
@@ -178,6 +178,7 @@
    "source": [
     "from qonnx.core.modelwrapper import ModelWrapper\n",
     "from onnx import TensorProto\n",
+    "from qonnx.util.basic import qonnx_make_model\n",
     "\n",
     "def make_graph(ishape, exp, op_type = \"MyPythonPowerOp\"):\n",
     "    inp = helper.make_tensor_value_info(\n",
@@ -204,7 +205,7 @@
     "    graph = helper.make_graph(\n",
     "        nodes=[custom_node], name=\"custom_graph\", inputs=[inp], outputs=[outp]\n",
     "    )\n",
-    "    model = helper.make_model(graph, producer_name=\"custom-model\")\n",
+    "    model = qonnx_make_model(graph, producer_name=\"custom-model\")\n",
     "    return ModelWrapper(model)"
    ]
   },
diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb
index 514efd1693d667af896e89902a264ea7e6e01da7..b6a5a0481574928ef490d8bb55bbbe2bb882b951 100644
--- a/notebooks/basics/0_how_to_work_with_onnx.ipynb
+++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb
@@ -36,6 +36,7 @@
    "outputs": [],
    "source": [
     "import onnx\n",
+    "from qonnx.util.basic import qonnx_make_model\n",
     "\n",
     "Add1_node = onnx.helper.make_node(\n",
     "    'Add',\n",
@@ -158,7 +159,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "onnx_model = onnx.helper.make_model(graph, producer_name=\"simple-model\")\n",
+    "onnx_model = qonnx_make_model(graph, producer_name=\"simple-model\")\n",
     "onnx.save(onnx_model, '/tmp/simple_model.onnx')"
    ]
   },
@@ -550,7 +551,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "onnx_model1 = onnx.helper.make_model(graph, producer_name=\"simple-model1\")\n",
+    "onnx_model1 = qonnx_make_model(graph, producer_name=\"simple-model1\")\n",
     "onnx.save(onnx_model1, '/tmp/simple_model1.onnx')"
    ]
   },
diff --git a/requirements.txt b/requirements.txt
index 348b1afab9deca1547d40cb8d8c54a396befa65d..83aad07d729e30cbbbaf565b4332fb1f7ae6f014 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,11 +4,11 @@ dataclasses-json==0.5.7
 docrep==0.2.7
 gspread==3.6.0
 numpy==1.22.0
-onnx==1.11.0
+onnx==1.13.0
 onnxoptimizer
 onnxruntime==1.11.1
 pre-commit==2.9.2
-protobuf==3.20.2
+protobuf==3.20.3
 psutil==5.9.4
 pyscaffold==3.2.1
 scipy==1.5.2
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index fb4d60c1ebf6bcf39d5c71388572f27f065279cc..6e07a541e3d462b159792482dae4777999921a2c 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -666,8 +666,8 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
                 + "in FINN C++ verilator driver, falling back to Python"
             )
         rtlsim_bs = int(cfg.rtlsim_batch_size)
+        orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
         if force_python_rtlsim:
-            orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
             assert rtlsim_bs > 0, "rtlsim batch size must be >0"
             if cfg.verify_save_rtlsim_waveforms:
                 # set depth to 3 for layer-by-layer visibility
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index e5eb483a00f6890f5eeb16c5cec533a4533c9f15..56d4230a3af3057daaa5c47140fcde1590dee686 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -43,6 +43,7 @@ from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
 from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise
 from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
+from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
 from finn.custom_op.fpgadataflow.iodma import IODMA
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
@@ -91,3 +92,4 @@ custom_op["Lookup"] = Lookup
 custom_op["StreamingConcat"] = StreamingConcat
 custom_op["CheckSum"] = CheckSum
 custom_op["StreamingEltwise"] = StreamingEltwise
+custom_op["FMPadding_rtl"] = FMPadding_rtl
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
index 5424050a8ed0a353894721d5bba28c1d45e62771..1afd23d3a1709a8929a03c21a6eba0a5a8cd6ba6 100755
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -29,7 +29,6 @@
 import math
 import numpy as np
 import os
-from math import copysign
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.general import im2col
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
@@ -81,6 +80,9 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
             "depthwise": ("i", False, 0, {0, 1}),
+            # Enable reprogrammable implementation to change FM dimensions,
+            # stride, or dilation during runtime
+            "dynamic_mode": ("i", False, 0, {0, 1}),
             # FPGA resource type for ConvolutionInputGenerator input buffer
             # auto -- let Vivado decide
             # block -- use BRAM
@@ -457,9 +459,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
     def prepare_codegen_default(self):
         # Default implementation style for MMV_out = 1: addressable cyclic buffer
         # Computing incremental addressing scheme directly..
-        template_path = (
-            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_default.sv"
-        )
+        if self.get_nodeattr("dynamic_mode"):
+            template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv"
+        else:
+            template_select = "/finn-rtllib/swg/swg_template_default.sv"
+        template_path = os.environ["FINN_ROOT"] + template_select
         code_gen_dict = {}
 
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -569,10 +573,6 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             tail_incr_last_window = buffer_min_size - 1
             code_gen_dict["$IS_DEPTHWISE$"] = ["0"]
 
-        code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)]
-        code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)]
-        code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)]
-
         # support SIMD = IFMChannels and k_w = 1 cases
         # for k = [k_h, k_w] = [1, k_w], no adjustment is needed
         # for k = [k_h, k_w] = [1, 1], do not use this impl. style (mmv_out=K=1)
@@ -590,11 +590,23 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"]
             loop_simd_iterations -= 1  # -1 because state is initial state
 
-        code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 1)]
-        code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 1)]
-        code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 1)]
-        code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 1)]
-        code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 1)]
+        cntr_bitwidth = math.ceil(
+            math.log2(
+                max(
+                    loop_h_iterations - 2 + 1,
+                    loop_w_iterations - 2 + 1,
+                    loop_kh_iterations - 2 + 1,
+                    loop_kw_iterations - 2 + 1,
+                    loop_simd_iterations - 2 + 1,
+                )
+            )
+        )
+        code_gen_dict["$CNTR_BITWIDTH$"] = [str(cntr_bitwidth)]
+        code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 2)]
+        code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 2)]
+        code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 2)]
+        code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 2)]
+        code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 2)]
 
         incr_bitwidth = 1 + math.ceil(
             math.log2(
@@ -611,21 +623,14 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             )
         )
         code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)]
-        code_gen_dict["$ADDR_INCREMENT_MAP$"] = [
-            "'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(
-                incr_bitwidth,
-                int(copysign(incr_bitwidth, addr_incr_end_simd)),
-                abs(addr_incr_end_simd),
-                int(copysign(incr_bitwidth, addr_incr_end_window_elem)),
-                abs(addr_incr_end_window_elem),
-                int(copysign(incr_bitwidth, addr_incr_end_window_row)),
-                abs(addr_incr_end_window_row),
-                int(copysign(incr_bitwidth, addr_incr_end_window)),
-                abs(addr_incr_end_window),
-                int(copysign(incr_bitwidth, addr_incr_end_row)),
-                abs(addr_incr_end_row),
-            )
-        ]
+        code_gen_dict["$HEAD_INCR_SIMD$"] = [str(addr_incr_end_simd)]
+        code_gen_dict["$HEAD_INCR_KW$"] = [str(addr_incr_end_window_elem)]
+        code_gen_dict["$HEAD_INCR_KH$"] = [str(addr_incr_end_window_row)]
+        code_gen_dict["$HEAD_INCR_W$"] = [str(addr_incr_end_window)]
+        code_gen_dict["$HEAD_INCR_H$"] = [str(addr_incr_end_row)]
+        code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)]
+        code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)]
+        code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)]
 
         code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)]
         code_gen_dict["$SIMD$"] = [str(simd)]
@@ -710,15 +715,22 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         with open(template_path, "r") as f:
             template = f.read()
+        if self.get_nodeattr("dynamic_mode"):
+            template_select = "/finn-rtllib/swg/swg_template_wrapper_dynamic.v"
+        else:
+            template_select = "/finn-rtllib/swg/swg_template_wrapper.v"
+        with open(os.environ["FINN_ROOT"] + template_select, "r") as f:
+            template_wrapper = f.read()
         with open(
-            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_wrapper.v", "r"
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_axilite.v", "r"
         ) as f:
-            template_wrapper = f.read()
+            template_axilite = f.read()
         for key in code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(code_gen_dict[key])
             template = template.replace(key, code_gen_line)
             template_wrapper = template_wrapper.replace(key, code_gen_line)
+            template_axilite = template_axilite.replace(key, code_gen_line)
         with open(
             os.path.join(
                 code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
@@ -734,6 +746,16 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         ) as f:
             f.write(template_wrapper)
 
+        # AXI-Lite reg. file component is only needed for dynamic mode
+        if self.get_nodeattr("dynamic_mode"):
+            with open(
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_axilite.v"
+                ),
+                "w",
+            ) as f:
+                f.write(template_axilite)
+
         # set ipgen_path and ip_path so that HLS-Synth transformation
         # and stitch_ip transformation do not complain
         self.set_nodeattr("ipgen_path", code_gen_dir)
@@ -754,6 +776,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             self.get_nodeattr("gen_top_module") + "_wrapper.v",
             self.get_nodeattr("gen_top_module") + "_impl.sv",
         ]
+        if self.get_nodeattr("dynamic_mode"):
+            verilog_files.append(self.get_nodeattr("gen_top_module") + "_axilite.v")
 
         # build the Verilator emu library
         sim = PyVerilator.build(
@@ -771,25 +795,97 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         """Constructs and returns the TCL for node instantiation in Vivado IPI."""
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
 
-        cmd = [
-            "add_files -norecurse %s"
-            % (
-                os.path.join(
-                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
-                )
-            ),
-            "add_files -norecurse %s"
-            % (
-                os.path.join(
-                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
-                )
-            ),
-            "create_bd_cell -type module -reference %s %s"
-            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name),
+        sourcefiles = [
+            self.get_nodeattr("gen_top_module") + "_wrapper.v",
+            self.get_nodeattr("gen_top_module") + "_impl.sv",
         ]
 
+        if self.get_nodeattr("dynamic_mode"):
+            sourcefiles += [self.get_nodeattr("gen_top_module") + "_axilite.v"]
+
+        sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles]
+
+        cmd = []
+        for f in sourcefiles:
+            cmd += ["add_files -norecurse %s" % (f)]
+        cmd += [
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        ]
         return cmd
 
+    def get_verilog_top_module_intf_names(self):
+        # Overload default HLSCustomOp implementation to add axilite control IF
+        """Return a dict of names of input and output interfaces.
+        The keys reflect the protocols each interface implements:
+        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
+        Values are lists of tuples (for axis and aximm) or of names (for axilite):
+        'axis' tuples correspond to the list of node inputs in order,
+        each tuple is (interface_name, interface_width_bits).
+        axilite is always assumed to be 32 bits wide and is given as a name
+        only (not a tuple).
+        Each block must have at most one aximm and one axilite."""
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("dynamic_mode"):
+            intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def get_dynamic_config(self, ifm_dim=None, stride=None, dilation=None):
+        """Returns a configuration dict to re-configure FM dimension during
+        runtime. Stride and dilation can also be changed. Certain restrictions
+        apply (e.g. component must be synthesized for largest buffer size)."""
+        # NOTE: For better driver integration, this functionality could be packaged
+        # as a standalone function in the future
+
+        if ifm_dim is None:
+            ifm_dim = self.get_nodeattr("IFMDim")
+        k = self.get_nodeattr("ConvKernelDim")
+        if stride is None:
+            stride = self.get_nodeattr("Stride")
+        if dilation is None:
+            dilation = self.get_nodeattr("Dilation")
+
+        k_h, k_w = k
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+        ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+        # update attributes and perform sanity check
+        original_buffer_depth = self.get_buffer_depth()
+        self.set_nodeattr("IFMDim", ifm_dim)
+        self.set_nodeattr("OFMDim", ofm_dim)
+        self.set_nodeattr("Stride", stride)
+        self.set_nodeattr("Dilation", dilation)
+        assert (
+            self.get_buffer_depth() <= original_buffer_depth
+        ), """Error: requested
+            dynamic configuration does not fit in generated buffer implementation."""
+
+        # (re-)call codegen and extract new values
+        # each setting is mapped to an axi-lite register address
+        template_path, code_gen_dict = self.prepare_codegen_default()
+        config = {
+            "cfg_wren": (0 * 4, 1),
+            "cfg_cntr_simd": (1 * 4, int(code_gen_dict["$LOOP_SIMD_ITERATIONS$"][0])),
+            "cfg_cntr_kw": (2 * 4, int(code_gen_dict["$LOOP_KW_ITERATIONS$"][0])),
+            "cfg_cntr_kh": (3 * 4, int(code_gen_dict["$LOOP_KH_ITERATIONS$"][0])),
+            "cfg_cntr_w": (4 * 4, int(code_gen_dict["$LOOP_W_ITERATIONS$"][0])),
+            "cfg_cntr_h": (5 * 4, int(code_gen_dict["$LOOP_H_ITERATIONS$"][0])),
+            "cfg_incr_head_simd": (6 * 4, int(code_gen_dict["$HEAD_INCR_SIMD$"][0])),
+            "cfg_incr_head_kw": (7 * 4, int(code_gen_dict["$HEAD_INCR_KW$"][0])),
+            "cfg_incr_head_kh": (8 * 4, int(code_gen_dict["$HEAD_INCR_KH$"][0])),
+            "cfg_incr_head_w": (9 * 4, int(code_gen_dict["$HEAD_INCR_W$"][0])),
+            "cfg_incr_head_h": (10 * 4, int(code_gen_dict["$HEAD_INCR_H$"][0])),
+            "cfg_incr_tail_w": (11 * 4, int(code_gen_dict["$TAIL_INCR_W$"][0])),
+            "cfg_incr_tail_h": (12 * 4, int(code_gen_dict["$TAIL_INCR_H$"][0])),
+            "cfg_incr_tail_last": (13 * 4, int(code_gen_dict["$TAIL_INCR_LAST$"][0])),
+            "cfg_last_read": (14 * 4, int(code_gen_dict["$LAST_READ_ELEM$"][0])),
+            "cfg_last_write": (15 * 4, int(code_gen_dict["$LAST_WRITE_ELEM$"][0])),
+        }
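+        # hypothetical usage sketch (axilite_write and base_addr are platform
+        # helpers, not part of this class):
+        #   config = swg_inst.get_dynamic_config(ifm_dim=[16, 16])
+        #   for name, (addr, val) in config.items():
+        #       axilite_write(base_addr + addr, val)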
+        return config
+
     def code_generation_ipgen(self, model, fpgapart, clk):
         """Normally: Generates C++ code and tcl script for IP generation.
         Here: Generates (System-)Verilog code for IP generation."""
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py
new file mode 100644
index 0000000000000000000000000000000000000000..5650d218857a7c7ff86c15ac057c4ebbc18df5ca
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py
@@ -0,0 +1,420 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import shutil
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import roundup_to_integer_multiple
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class FMPadding_rtl(HLSCustomOp):
+    """CustomOp wrapper for the finn-rtllib fmpadding_axi component
+    Supports adjusting the padding amount and spatial feature sizes at
+    runtime."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
+            # total padding (per dimension) to apply
+            "Padding": (
+                "ints",
+                True,
+                [1, 1, 1, 1],
+            ),  # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
+            # Enable reprogrammable implementation to change FM dimensions
+            # and padding amounts during runtime
+            "dynamic_mode": ("i", False, 0, {0, 1}),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_padded_odim(self):
+        "Return the padded spatial size of the output."
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        odim_h = idim_h + pad_h
+        odim_w = idim_w + pad_w
+        return [odim_h, odim_w]
+
+    def get_exp_cycles(self):
+        odim_h, odim_w = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
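+        # e.g., NumChannels=64, SIMD=8, batch=1, padded output 32x32
+        # -> (64 / 8) * 1 * 32 * 32 = 8192 cycles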
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self, ind=0):
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = (1, idim_h, idim_w, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        odim_h, odim_w = self.get_padded_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        oshape = (1, odim_h, odim_w, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for FMPadding_rtl."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        # the RTL op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert ret.allowed(0), "FMPadding_rtl DataType must support zero"
+        return ret
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
+
+    def get_outstream_width(self, ind=0):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def get_verilog_top_module_intf_names(self):
+        # Overload default HLSCustomOp implementation to add axilite control IF
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("dynamic_mode"):
+            intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim"
+            )
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        rtlsim_inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+        odt = export_idt
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+        context[node.output[0]] = output
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (1, OutputDim_H, OutputDim_W, NumChannels)."""
+
+    def get_template_values(self, ifm_dims, pads, chans, simd, idt):
+        dimY, dimX = ifm_dims
+        padT, padL, padB, padR = pads
+        y_counter_bits = int(math.ceil(math.log2(padT + dimY + padB + 1)))
+        x_counter_bits = int(math.ceil(math.log2(padL + dimX + padR + 1)))
+        topname = self.get_verilog_top_module_name()
+        stream_bits = idt.bitwidth() * simd
+        stream_bits = int(roundup_to_integer_multiple(stream_bits, 8))
+        code_gen_dict = {
+            "XCOUNTER_BITS": int(x_counter_bits),
+            "YCOUNTER_BITS": int(y_counter_bits),
+            "NUM_CHANNELS": int(chans),
+            "SIMD": int(simd),
+            "ELEM_BITS": idt.bitwidth(),
+            "TOP_MODULE_NAME": topname,
+            "INIT_XON": int(padL),
+            "INIT_XOFF": int(padL + dimX),
+            "INIT_XEND": int(padL + dimX + padR - 1),
+            "INIT_YON": int(padT),
+            "INIT_YOFF": int(padT + dimY),
+            "INIT_YEND": int(padT + dimY + padB - 1),
+            "STREAM_BITS": int(stream_bits),
+        }
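+        # e.g., dimX=4, padL=1, padR=1 gives 6 output columns (indices 0..5):
+        # XON=1 (first image column), XOFF=5 (first right-pad column),
+        # XEND=5 (last column index)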
+        return code_gen_dict
+
+    def get_dynamic_config(self, ifm_dims=None, pads=None):
+        """Returns a configuration dict to re-configure FM dimension and
+        padding amounts during runtime."""
+
+        if ifm_dims is None:
+            ifm_dims = self.get_nodeattr("ImgDim")
+        if pads is None:
+            pads = self.get_nodeattr("Padding")
+        chans = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        idt = self.get_input_datatype()
+        code_gen_dict = self.get_template_values(ifm_dims, pads, chans, simd, idt)
+        config = {
+            "XON": (0 * 4, (code_gen_dict["INIT_XON"])),
+            "XOFF": (1 * 4, (code_gen_dict["INIT_XOFF"])),
+            "XEND": (2 * 4, (code_gen_dict["INIT_XEND"])),
+            "YON": (3 * 4, (code_gen_dict["INIT_YON"])),
+            "YOFF": (4 * 4, (code_gen_dict["INIT_YOFF"])),
+            "YEND": (5 * 4, (code_gen_dict["INIT_YEND"])),
+        }
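+        # hypothetical usage sketch (axilite_write and base_addr are platform
+        # helpers, not part of this class):
+        #   config = pad_inst.get_dynamic_config(ifm_dims=[28, 28], pads=[1, 1, 1, 1])
+        #   for name, (addr, val) in config.items():
+        #       axilite_write(base_addr + addr, val)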
+        return config
+
+    def generate_hdl(self):
+        rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl"
+        template_path = rtlsrc + "/fmpadding_template.v"
+        dims = self.get_nodeattr("ImgDim")
+        pads = self.get_nodeattr("Padding")
+        chans = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        idt = self.get_input_datatype()
+        code_gen_dict = self.get_template_values(dims, pads, chans, simd, idt)
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        for key_name in code_gen_dict:
+            key = "$%s$" % key_name
+            template = template.replace(key, str(code_gen_dict[key_name]))
+
+        with open(
+            os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"),
+            "w",
+        ) as f:
+            f.write(template)
+
+        sv_files = ["fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv"]
+        for sv_file in sv_files:
+            shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir)
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stitch_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+        # Modified to use generated (System-)Verilog instead of HLS output products
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            "fmpadding_axi.sv",
+            "fmpadding.sv",
+            "axi2we.sv",
+            self.get_nodeattr("gen_top_module") + ".v",
+        ]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        sourcefiles = [
+            "fmpadding_axi.sv",
+            "fmpadding.sv",
+            "axi2we.sv",
+            self.get_nodeattr("gen_top_module") + ".v",
+        ]
+
+        sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles]
+
+        cmd = []
+        for f in sourcefiles:
+            cmd += ["add_files -norecurse %s" % (f)]
+        cmd += [
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        ]
+        return cmd
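+
+    # For illustration (paths and module names are hypothetical), the
+    # returned TCL resembles:
+    #   add_files -norecurse /tmp/code_gen_xyz/fmpadding_axi.sv
+    #   ...
+    #   create_bd_cell -type module -reference FMPadding_rtl_0 FMPadding_rtl_0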
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: Generates (System-)Verilog code for IP generation."""
+        self.generate_hdl()
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation."""
+        pass
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim)."""
+        pass
+
+    def compile_singlenode_code(self):
+        pass
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 525af7ea920e1c8809ce9cd53e628dd756cfdad4..7b8a1bf6b83175cfda041cfc49a22273fd696d8e 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -117,8 +117,12 @@ class InferConvInpGen(Transformation):
                     ConvInpGen_idim_h = odim_padding_h
                     ConvInpGen_idim_w = odim_padding_w
 
+                    padding_optype = (
+                        "FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch"
+                    )
+
                     padding_node = helper.make_node(
-                        "FMPadding_Batch",
+                        padding_optype,
                         [i2c_input],
                         [padding_out],
                         domain="finn.custom_op.fpgadataflow",
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 3af34eba8eb709099474426b665f295f21e0ce40..73df52f890d227137ea076804d161206e66653dc 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -492,6 +492,8 @@ class AbsorbConsecutiveTransposes(Transformation):
             if node.op_type == "Transpose":
                 next_nodes = model.find_consumers(node.output[0])
                 perms1 = list(get_by_name(node.attribute, "perm").ints)
+                if len(next_nodes) == 0:
+                    continue
                 # check if all nodes after fork are opposite transposes
                 all_opposite_transposes = True
                 for next_node in next_nodes:
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
index a8c2e67b385b797905cd4c5a196091069898b583..ed3e1a843eca47d2e20e9ca1c9df0d2d6f5a8a13 100644
--- a/src/finn/util/create.py
+++ b/src/finn/util/create.py
@@ -30,7 +30,11 @@ import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 
 def hls_random_mlp_maker(layer_spec):
@@ -84,7 +88,7 @@ def hls_mlp_maker(layer_spec):
 
     graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[])
 
-    model = helper.make_model(graph, producer_name="finn")
+    model = qonnx_make_model(graph, producer_name="finn")
     model = ModelWrapper(model)
 
     for lyr in layer_spec:
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index 49ee32c71ee941ff7435d4c12ccadae3f8e55c5e..f5edabbd4ba029899239cc2f40dd6a94d178eafd 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -32,7 +32,7 @@ import os
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import gen_finn_dt_tensor, get_by_name
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model
 
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 
@@ -70,7 +70,7 @@ def test_code_gen_trafo():
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py
index 9bafb101cedabc99d97356069c883cab4ed8a87f..d04b68a56ba7fc5f01e1eef57075636954f86843 100644
--- a/tests/fpgadataflow/test_compilation_trafo.py
+++ b/tests/fpgadataflow/test_compilation_trafo.py
@@ -32,7 +32,7 @@ import os
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import gen_finn_dt_tensor, get_by_name
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model
 
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
@@ -71,7 +71,7 @@ def test_compilation_trafo():
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
index 7b3e20616410f54e4718290baec9a510a0d49c0d..98a7c76ee4de0332586772ba7c1007ee55979a51 100644
--- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
@@ -38,7 +38,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -121,7 +121,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_
         helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
     ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="conv_test",
             inputs=[top_in],
diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
index 0f19b6d79ab0ed77981022f286fabd430094d69f..089d1ae420f4fab744fcda5950d88b13216b4c93 100644
--- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -57,7 +57,7 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape):
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape)
     p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape)
 
-    model = helper.make_model(
+    model = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[inp],
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
index 0760ff9b37487f4a1ac06853055d2e47b7269f9e..3512c39cb3fab04e4e4225728c9495b546b7c655 100755
--- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
@@ -39,7 +39,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -149,7 +149,7 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
             "Flatten", ["thres1_out"], ["flatten_out"], axis=1
         )
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[global_in],
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index 8c9f110c315089ec03354863bf2213963197217a..de31ef0f125cb96ea82f953eadb9d5ccf7aab16c 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -38,7 +38,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -107,7 +107,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod
         helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
     ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="conv_test",
             inputs=[top_in],
@@ -175,8 +175,11 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod
             assert np.isclose(exp_cycles, cycles_rtlsim, atol=11)
             assert exp_cycles != 0
 
-    if pad == 1:
-        padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
+    if pad:
+        if use_rtl_swg:
+            padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0]
+        else:
+            padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
         padding_inst = getCustomOp(padding_node)
         assert padding_inst.get_nodeattr("SIMD") == in_chn
 
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
index 79a48793e0c4f062654e43aadcaf09ebf6d7da5b..c837a46a7ca7dcab6628cbf16373161b7b9ab9c2 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -43,7 +43,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.insert_topk import InsertTopK
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -123,7 +123,7 @@ def make_model(ch, ifmdim):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="add-model")
+    model = qonnx_make_model(graph, producer_name="add-model")
     model = ModelWrapper(model)
 
     # set initializers for scalar add/mul nodes
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index ef9bd7a13dcecf7aa61ecb982ac6393d7813a4d5..6d628c9e53828fef88028bdc115bd64b0292dfed 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -78,7 +78,7 @@ def make_single_maxpool_modelwrapper(
         nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="mp-model")
+    model = qonnx_make_model(graph, producer_name="mp-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -112,7 +112,7 @@ def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, id
         nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="mp-model")
+    model = qonnx_make_model(graph, producer_name="mp-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
index 5228ade3d0f4db3bd99f5fcccb7aee41f57ed73b..8ab22bcfdcb0312bd49677f0e00d8e97cdcad3c1 100644
--- a/tests/fpgadataflow/test_depthwise_convolution.py
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -37,7 +37,11 @@ from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -123,7 +127,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
         outputs=[global_out],
         value_info=value_info,
     )
-    model = oh.make_model(graph, producer_name="lowered_dw_cnv-model")
+    model = qonnx_make_model(graph, producer_name="lowered_dw_cnv-model")
     model = ModelWrapper(model)
 
     # initialize model
diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
index 6d881f45b60384d9a78b5d9f9705581a10b48e6c..1ad2c26610c99c46bde4c05ed156a81b122aba53 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -68,7 +68,7 @@ def make_addstreams_modelwrapper(ch, pe, idt):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="addstreams-model")
+    model = qonnx_make_model(graph, producer_name="addstreams-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp1", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
index ceafda90e54004c7aea8786d003b6adf1defab35..13fab9a47f15999c184680b9db04494787889881 100644
--- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -73,7 +73,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
     )
     graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp])
 
-    model = helper.make_model(graph, producer_name="model")
+    model = qonnx_make_model(graph, producer_name="model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py
index 495fcd10b6a977c6b0917ac37b58ec5595185c25..cd404f5a6332d77f17ec69c47b53c8c893f28607 100644
--- a/tests/fpgadataflow/test_fpgadataflow_checksum.py
+++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py
@@ -36,7 +36,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.core.rtlsim_exec import rtlsim_exec
@@ -115,7 +115,7 @@ def create_two_fc_model():
         value_info=[mid],
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index a196ecbb61b74843ddc8efa4ac3c5ab8197e64fe..3cfff9ac34ae47bdc072bca9f6ca0fffeea756c5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -73,7 +73,7 @@ def make_single_im2col_modelwrapper(
         nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="im2col-model")
+    model = qonnx_make_model(graph, producer_name="im2col-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -117,7 +117,7 @@ def make_single_slidingwindow_modelwrapper(
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="slidingwindow-model")
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
index 0fc3ca82cfa919079a324160e4876377ac4dc3b4..f467f37618bbee6359bb7b7dfa963e3d8785d0c9 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -82,7 +82,7 @@ def make_single_im2col_modelwrapper(
         nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="im2col-model")
+    model = qonnx_make_model(graph, producer_name="im2col-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -133,7 +133,7 @@ def make_single_slidingwindow_modelwrapper(
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="slidingwindow-model")
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
index 007360a5fd0b74ee49d54c84f332061dd5f3a114..58fc5ec04cc471b0e8f201e235ac9bd033e3f5c4 100755
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
@@ -33,7 +33,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
@@ -72,7 +72,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilatio
         nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="im2col-model")
+    model = qonnx_make_model(graph, producer_name="im2col-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -124,7 +124,7 @@ def make_single_slidingwindow_modelwrapper(
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="slidingwindow-model")
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f7bf649a9284e7716aec5adfb91957fdabb55d5
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
@@ -0,0 +1,617 @@
+# Copyright (c) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import copy
+import numpy as np
+import onnx.parser as oprs
+import os
+from onnx import TensorProto, helper
+from pyverilator.util.axi_utils import axilite_write, reset_rtlsim
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import (
+    LowerConvsToMatMul,
+    _auto_pad_to_explicit_padding,
+)
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.streamline.absorb as absorb
+from finn.core.onnx_exec import execute_onnx
+from finn.core.rtlsim_exec import rtlsim_exec
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.util.basic import pyverilate_get_liveness_threshold_cycles
+
+
+def create_conv_model(
+    idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
+):
+    np.random.seed(0)
+    group = ifm if depthwise else 1
+    group_str = str(group)
+    ishp = (1, ifm, idim_h, idim_w)
+    pad_0 = _auto_pad_to_explicit_padding(
+        pad_mode, idim_h, idim_w, k, k, stride, stride, 2
+    )
+    int_dim_h = compute_conv_output_dim(
+        idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]
+    )
+    int_dim_w = compute_conv_output_dim(
+        idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]
+    )
+
+    pad_1 = _auto_pad_to_explicit_padding(
+        pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2
+    )
+    odim_h = compute_conv_output_dim(
+        int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]
+    )
+    odim_w = compute_conv_output_dim(
+        int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]
+    )
+    oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w)
+    wshp = (ifm, 1, k, k) if depthwise else (ofm, ifm, k, k)
+    wshp_1 = (ifm, 1, k, k) if depthwise else (ofm, ofm, k, k)
+    ishp_str = str(list(ishp))
+    oshp_str = str(list(oshp))
+    wshp_str = str(list(wshp))
+    wshp_1_str = str(list(wshp_1))
+    kshp_str = str([k, k])
+    pad_0_str = str(list(pad_0))
+    pad_1_str = str(list(pad_1))
+    stride_str = str([stride, stride])
+    dil_str = str([1, 1])
+
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{ishp_str} in0) => (float{oshp_str} out0)
+    <
+        float{wshp_str} param_c0_weight,
+        float{wshp_1_str} param_c1_weight
+    >
+    {{
+        conv0 = Conv<
+                dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_0_str},
+                strides={stride_str}
+            >(in0, param_c0_weight)
+        out0 = Conv<
+                dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_1_str},
+                strides={stride_str}
+            >(conv0, param_c1_weight)
+    }}
+    """
+    model = oprs.parse_model(input_str)
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model.set_tensor_datatype("in0", idt)
+    model.set_tensor_datatype("param_c0_weight", wdt)
+    model.set_tensor_datatype("param_c1_weight", wdt)
+    model.set_initializer("param_c0_weight", gen_finn_dt_tensor(wdt, wshp))
+    model.set_initializer("param_c1_weight", gen_finn_dt_tensor(wdt, wshp_1))
+    return model
+
+
+def update_conv_model_dims(model, idim_new_h, idim_new_w):
+    cnode = model.get_nodes_by_op_type("Conv")[0]
+    k, _ = get_by_name(cnode.attribute, "kernel_shape").ints
+    stride, _ = get_by_name(cnode.attribute, "strides").ints
+    ishp = model.get_tensor_shape("in0")
+    n, ci, _, _ = ishp
+    n, co, _, _ = model.get_tensor_shape("out0")
+    int_dim_h = compute_conv_output_dim(idim_new_h, k, stride)
+    int_dim_w = compute_conv_output_dim(idim_new_w, k, stride)
+    odim_h = compute_conv_output_dim(int_dim_h, k, stride)
+    odim_w = compute_conv_output_dim(int_dim_w, k, stride)
+    model.set_tensor_shape("in0", (n, ci, idim_new_h, idim_new_w))
+    model.set_tensor_shape("out0", (n, co, odim_h, odim_w))
+    # remove all existing shapes
+    del model.graph.value_info[:]
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    return model
+
+
+# Helper function to update tensor dimensions manually, because ONNX shape
+# inference does not work on FINN custom nodes (they assume fixed tensor shapes).
+def update_tensor_dim(model, tensor_name, new_hw):
+    shape = model.get_tensor_shape(tensor_name)
+    shape[1] = new_hw[0]
+    shape[2] = new_hw[1]
+    model.set_tensor_shape(tensor_name, shape)
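+
+# Usage sketch (the tensor name is hypothetical): shrink an NHWC activation
+# to 8x8 spatial dimensions before re-running rtlsim:
+#   update_tensor_dim(model, "ConvolutionInputGenerator_rtl_0_out0", (8, 8))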
+
+
+# Helper function that returns a pre-hook to program the SWG/FMPadding via AXI-Lite
+def config_hook(configs):
+    if configs is None:
+        return None
+
+    def write_swg_config(sim):
+        reset_rtlsim(sim)
+        for axi_name, config in configs:
+            # Write the config registers to the SWG/FMPadding node; each
+            # config dict entry maps a register name to an (addr, value) tuple
+            for config_entry in config.values():
+                axilite_write(sim, config_entry[0], config_entry[1], basename=axi_name)
+        reset_rtlsim(sim)
+
+    return write_swg_config
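+
+# Typical use (base names are illustrative): pair each AXI-Lite interface base
+# name with the register map from get_dynamic_config(), then pass the hook to
+# rtlsim_exec:
+#   configs = [("s_axilite_0_", swg_inst.get_dynamic_config((8, 8)))]
+#   rtlsim_exec(model, ctx, pre_hook=config_hook(configs))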
+
+
+cfg0 = {
+    "idims": [(32, 32), (8, 8)],
+    "ifm": 64,
+    "k": 3,
+    "stride": 1,
+    "ofm": 64,
+    "depthwise": True,
+    "pad_mode": "SAME_UPPER",
+}
+cfg1 = {
+    "idims": [(32, 16), (16, 8)],
+    "ifm": 4,
+    "k": 4,
+    "stride": 1,
+    "ofm": 8,
+    "depthwise": False,
+    "pad_mode": "SAME_UPPER",
+}
+cfg2 = {
+    "idims": [(64, 128), (2, 4)],
+    "ifm": 64,
+    "k": 3,
+    "stride": 1,
+    "ofm": 64,
+    "depthwise": True,
+    "pad_mode": "SAME_UPPER",
+}
+
+
+@pytest.mark.parametrize("cfg", [cfg0, cfg1, cfg2])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_conv_dynamic(cfg):
+    pad_mode = cfg["pad_mode"]
+    depthwise = cfg["depthwise"]
+    idims = cfg["idims"]
+    ifm = cfg["ifm"]
+    k = cfg["k"]
+    stride = cfg["stride"]
+    ofm = cfg["ofm"]
+    idt = DataType["UINT4"]
+    wdt = DataType["INT2"]
+    exp_cfgs = []
+    largest_model = None
+    for idim in idims:
+        idim_h, idim_w = idim
+        ishp = (1, ifm, idim_h, idim_w)
+        np.random.seed(0)
+        inp = gen_finn_dt_tensor(idt, ishp)
+        model = create_conv_model(
+            idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
+        )
+        _, _, int_dim_h, int_dim_w = model.get_tensor_shape("conv0")
+        _, _, odim_h, odim_w = model.get_tensor_shape("out0")
+        pad0 = get_by_name(model.graph.node[0].attribute, "pads").ints
+        pad1 = get_by_name(model.graph.node[1].attribute, "pads").ints
+        if idim == max(idims):
+            # use largest model for hardware conversion
+            largest_model = copy.deepcopy(model)
+        golden = execute_onnx(model, {"in0": inp})["out0"]
+        exp_cfg = (
+            (idim_h, idim_w),
+            (int_dim_h, int_dim_w),
+            (odim_h, odim_w),
+            pad0,
+            pad1,
+            inp,
+            golden,
+        )
+        exp_cfgs.append(exp_cfg)
+
+    # convert to hardware and prepare simulation
+    model = largest_model.transform(LowerConvsToMatMul())
+    model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))
+    model = model.transform(
+        to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")
+    )
+    model = model.transform(to_hls.InferVectorVectorActivation())
+    model = model.transform(absorb.AbsorbConsecutiveTransposes())
+    parent_model = model.transform(CreateDataflowPartition())
+    sdp_inst = getCustomOp(
+        parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    )
+    model = ModelWrapper(sdp_inst.get_nodeattr("model"))
+    assert len(model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")) == 2
+    if pad_mode == "VALID":
+        assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 0
+    else:
+        assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 2
+    dyn_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
+    dyn_nodes += model.get_nodes_by_op_type("FMPadding_rtl")
+    for swg_node in dyn_nodes:
+        getCustomOp(swg_node).set_nodeattr("SIMD", 4)
+        getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1)
+        getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16])
+        getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16])
+    comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation")
+    comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation")
+    for comp_node in comp_nodes:
+        if depthwise:
+            getCustomOp(comp_node).set_nodeattr("PE", 4)
+        else:
+            getCustomOp(comp_node).set_nodeattr("SIMD", 4)
+            getCustomOp(comp_node).set_nodeattr("PE", 4)
+    model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO(create_shallow_fifos=True))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5))
+    model.set_metadata_prop("exec_mode", "rtlsim")
+
+    # loop through experiment configurations
+    for exp_cfg in exp_cfgs:
+        (
+            (idim_h, idim_w),
+            (int_dim_h, int_dim_w),
+            (odim_h, odim_w),
+            pad0,
+            pad1,
+            inp,
+            golden,
+        ) = exp_cfg
+        conv0_idim_h = idim_h + pad0[0] + pad0[2]
+        conv0_idim_w = idim_w + pad0[1] + pad0[3]
+        conv1_idim_h = int_dim_h + pad1[0] + pad1[2]
+        conv1_idim_w = int_dim_w + pad1[1] + pad1[3]
+        # get config for the new dimensions
+        swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
+        swg0 = getCustomOp(swg_nodes[0])
+        update_tensor_dim(model, swg0.onnx_node.input[0], (conv0_idim_h, conv0_idim_w))
+        update_tensor_dim(model, swg0.onnx_node.output[0], (int_dim_h, int_dim_w))
+        swg_config0 = swg0.get_dynamic_config((conv0_idim_h, conv0_idim_w))
+        swg1 = getCustomOp(swg_nodes[1])
+        update_tensor_dim(model, swg1.onnx_node.input[0], (conv1_idim_h, conv1_idim_w))
+        update_tensor_dim(model, swg1.onnx_node.output[0], (odim_h, odim_w))
+        swg_config1 = swg1.get_dynamic_config((conv1_idim_h, conv1_idim_w))
+        if pad_mode != "VALID":
+            pad_nodes = model.get_nodes_by_op_type("FMPadding_rtl")
+            padder0 = getCustomOp(pad_nodes[0])
+            update_tensor_dim(model, padder0.onnx_node.input[0], (idim_h, idim_w))
+            update_tensor_dim(
+                model, padder0.onnx_node.output[0], (conv0_idim_h, conv0_idim_w)
+            )
+            pad_config0 = padder0.get_dynamic_config((idim_h, idim_w), pad0)
+            padder1 = getCustomOp(pad_nodes[1])
+            update_tensor_dim(model, padder1.onnx_node.input[0], (int_dim_h, int_dim_w))
+            update_tensor_dim(
+                model, padder1.onnx_node.output[0], (conv1_idim_h, conv1_idim_w)
+            )
+            pad_config1 = padder1.get_dynamic_config((int_dim_h, int_dim_w), pad1)
+            configs = [
+                ("s_axilite_0_", pad_config0),
+                ("s_axilite_1_", swg_config0),
+                ("s_axilite_2_", pad_config1),
+                ("s_axilite_3_", swg_config1),
+            ]
+        else:
+            configs = [("s_axilite_0_", swg_config0), ("s_axilite_1_", swg_config1)]
+        # adjust folded shapes for I/O FIFOs
+        # (since rtlsim_exec uses folded shape info to fold global i/o tensors)
+        first_node = getCustomOp(model.graph.node[0])
+        first_node_shp = list(first_node.get_folded_input_shape())
+        first_node_shp[1] = idim_h
+        first_node_shp[2] = idim_w
+        first_node.set_nodeattr("folded_shape", first_node_shp)
+        update_tensor_dim(model, first_node.onnx_node.input[0], (idim_h, idim_w))
+        last_node = getCustomOp(model.graph.node[-1])
+        last_node_shp = list(last_node.get_folded_output_shape())
+        last_node_shp[1] = odim_h
+        last_node_shp[2] = odim_w
+        update_tensor_dim(model, last_node.onnx_node.output[0], (odim_h, odim_w))
+        last_node.set_nodeattr("folded_shape", last_node_shp)
+        ctx = {"global_in": inp.transpose(0, 2, 3, 1)}
+        liveness_prev = pyverilate_get_liveness_threshold_cycles()
+        os.environ["LIVENESS_THRESHOLD"] = "100000"
+        rtlsim_exec(model, ctx, pre_hook=config_hook(configs))
+        os.environ["LIVENESS_THRESHOLD"] = str(liveness_prev)
+        ret = ctx["global_out"].transpose(0, 3, 1, 2)
+        assert np.isclose(golden, ret).all()
+
+
+def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    im2col_node = helper.make_node(
+        "Im2Col",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.general",
+        stride=[stride_h, stride_w],
+        kernel_size=[k_h, k_w],
+        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
+        dilations=[dilation_h, dilation_w],
+        pad_amount=[0, 0, 0, 0],
+        pad_value=0,
+    )
+    graph = helper.make_graph(
+        nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = qonnx_make_model(graph, producer_name="im2col-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def make_single_slidingwindow_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0
+):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    SlidingWindow_node = helper.make_node(
+        "ConvolutionInputGenerator_rtl",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ConvKernelDim=[k_h, k_w],
+        IFMChannels=ifm_ch,
+        IFMDim=[ifm_dim_h, ifm_dim_w],
+        OFMDim=[ofm_dim_h, ofm_dim_w],
+        SIMD=simd,
+        M=m,
+        parallel_window=parallel_window,
+        Stride=[stride_h, stride_w],
+        Dilation=[dilation_h, dilation_w],
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+        depthwise=dw,
+        dynamic_mode=1,
+    )
+    graph = helper.make_graph(
+        nodes=[SlidingWindow_node],
+        name="slidingwindow_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["UINT4"]])
+# kernel size
+@pytest.mark.parametrize("k", [[3, 3]])
+# input dimension
+@pytest.mark.parametrize("ifm_dim_series", [[[32, 32], [16, 16], [8, 8]]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [6])
+# Stride
+@pytest.mark.parametrize("stride", [[1, 1]])
+# Dilation
+@pytest.mark.parametrize("dilation", [[1, 1]])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+# input channel parallelism ("SIMD")
+@pytest.mark.parametrize("simd", [2, 6])
+# parallel_window enable (MMV_out = M*K)
+@pytest.mark.parametrize("parallel_window", [0])
+# in/out MMV ("M")
+@pytest.mark.parametrize("m", [1])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_slidingwindow_rtl_dynamic(
+    idt, k, ifm_dim_series, ifm_ch, stride, dilation, dw, simd, m, parallel_window
+):
+    # Begin the test by generating the RTL SWG normally for the first FM of
+    # the series. All subsequent FM dimensions must be equal to or smaller
+    # than the initial ones (in terms of required buffer depth).
+    ifm_dim = ifm_dim_series[0]
+
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+    ofm_dim = [ofm_dim_h, ofm_dim_w]
+    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
+
+    if simd > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
+    if ifm_ch % simd != 0:
+        pytest.skip("SIMD must divide number of input channels")
+    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
+        k_w == 1 and (stride_w != 1 or dilation_w != 1)
+    ):
+        pytest.skip(
+            """Illegal convolution configuration:
+            stride or dilation defined for unitary kernel dim"""
+        )
+    if k_h == 1 and k_w == 1 and simd != ifm_ch:
+        pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)")
+    if parallel_window and simd != ifm_ch:
+        pytest.skip("Parallel window requires SIMD=C")
+
+    model = make_single_slidingwindow_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        simd=simd,
+        m=m,
+        parallel_window=parallel_window,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+        dw=dw,
+    )
+
+    # Simulate using stitched-ip-rtlsim so we can use existing infrastructure
+    # that supports hook functions to re-program configuration before rtlsim
+    model = model.transform(InsertFIFO(True))  # required for proper simulation
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5))
+    model.set_metadata_prop("exec_mode", "rtlsim")
+
+    # Simulate 1 FM for each dimension in the series
+    for i, ifm_dim in enumerate(ifm_dim_series):
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+        ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+        configs = None
+        if i > 0:  # skip re-programming for initial FM dimension
+            # Necessary update of node and tensor attributes to make rtlsim work:
+            swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
+            swg_inst = getCustomOp(swg_node)
+            update_tensor_dim(model, swg_node.input[0], ifm_dim)
+            update_tensor_dim(model, swg_node.output[0], ofm_dim)
+
+            # Generate config, also overwrites IFMDim/OFMDim attributes:
+            config = swg_inst.get_dynamic_config(ifm_dim)
+            configs = [("s_axilite_0_", config)]
+
+            # Also update FIFO nodes and corresponding tensors
+            fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[0]
+            fifo_inst = getCustomOp(fifo_node)
+            shape = fifo_inst.get_nodeattr("folded_shape")
+            shape[1] = ifm_dim_h
+            shape[2] = ifm_dim_w
+            fifo_inst.set_nodeattr("folded_shape", shape)
+            update_tensor_dim(model, fifo_node.input[0], ifm_dim)
+
+            fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[1]
+            fifo_inst = getCustomOp(fifo_node)
+            shape = fifo_inst.get_nodeattr("folded_shape")
+            shape[1] = ofm_dim_h
+            shape[2] = ofm_dim_w
+            fifo_inst.set_nodeattr("folded_shape", shape)
+            update_tensor_dim(model, fifo_node.output[0], ofm_dim)
+
+        # Run rtlsim on stitched-ip
+        x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
+        context = prepare_inputs(x)
+        rtlsim_exec(model, context, pre_hook=config_hook(configs))
+        y_produced = context["outp"]
+
+        # Generate golden result
+        golden = make_single_im2col_modelwrapper(
+            k=k,
+            ifm_ch=ifm_ch,
+            ifm_dim=ifm_dim,
+            ofm_dim=ofm_dim,
+            stride=stride,
+            dilation=dilation,
+            idt=idt,
+        )
+        input_dict = prepare_inputs(x)
+        y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
+        # Check result
+        if dw == 0:
+            assert (y_produced == y_expected).all()
+        else:
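+            # The depthwise SWG emits all kernel positions for one SIMD-wide
+            # channel chunk before moving to the next chunk, so reorder the
+            # kernel-major im2col reference into that layout before comparing.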
+            y_expected = y_expected.reshape(
+                1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
+            )
+            y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+            y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
+            assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 7ec254405d23f0a972de7f9d02d2ab021ed3d959..441bbce50a8a218185f93a7968767abe2541ed15 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -36,7 +36,7 @@ from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -76,7 +76,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl):
         nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi
     )
 
-    model = helper.make_model(graph, producer_name="addstreams-model")
+    model = qonnx_make_model(graph, producer_name="addstreams-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index 695a5f902ce7bf3c22bfd46dc264dda4bfceb15f..2bde148a1499e4c7065ab1e151e3c4198e1e96da 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -32,7 +32,7 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
@@ -63,7 +63,7 @@ def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_styl
         nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="dwc-model")
+    model = qonnx_make_model(graph, producer_name="dwc-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", finn_dtype)
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index b9c74185d9f104e15355a5dd6021d7e74dac641e..efdb3bf6aaab23fec67055ae28b2e285f1a32b6a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -33,7 +33,7 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
@@ -66,7 +66,7 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
         nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fifo-model")
+    model = qonnx_make_model(graph, producer_name="fifo-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", finn_dtype)
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 34928ce45be0fd96d27b153ae28e2128bf306bb5..b95409fda87718f30a74bad88697c3dbad0bf98f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -36,7 +36,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -53,7 +53,7 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
+def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt):
     pad_h = padding[0] + padding[2]
     pad_w = padding[1] + padding[3]
     idim_h, idim_w = idim
@@ -70,7 +70,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
     )
 
     FMPadding = helper.make_node(
-        "FMPadding_Batch",
+        optype,
         ["inp"],
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
@@ -87,7 +87,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
         nodes=[FMPadding], name="fmpadding_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fmpadding-model")
+    model = qonnx_make_model(graph, producer_name="fmpadding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -110,10 +110,14 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
 @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
 # execution mode
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
+# implementation style
+@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style):
+    if impl_style == "rtl" and mode == "cppsim":
+        pytest.skip("rtl implstyle has no cppsim, skipping")
     if num_ch % simd != 0:
         pytest.skip(" num_ch % simd != 0, skipping")
 
@@ -127,7 +131,9 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
     odim_h = idim_h + pad_h
     odim_w = idim_w + pad_w
 
-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt)
+    optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style]
+
+    model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
@@ -138,6 +144,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
         model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
+
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
     expected_oshape = (1, odim_h, odim_w, num_ch)
     assert y_produced.shape == expected_oshape
@@ -149,7 +156,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
     assert (y_produced == y_expected).all()
 
     if mode == "rtlsim":
-        node = model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        node = model.get_nodes_by_op_type(optype)[0]
         inst = getCustomOp(node)
         cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
         exp_cycles_dict = model.analysis(exp_cycles_per_layer)
diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
index a37e6e3271a9f7e033e6beaa6dbed01271365101..a2c3d09a55f81dc5e9d5ae1819cd8ea6b7df1e27 100644
--- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -65,7 +65,7 @@ def make_accpool_modelwrapper(ch, pe, idim, idt):
         nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = qonnx_make_model(graph, producer_name="thresholding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 325470a6d6c6032249ca1dd64317fb288d3e94c9..b220338e6919e8eeaeef0f6e5343fed9b1dfca10 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -36,7 +36,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 from finn.core.onnx_exec import execute_onnx
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
@@ -100,7 +100,7 @@ def create_one_fc_model(mem_mode="const"):
         nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -177,7 +177,7 @@ def create_two_fc_model(mem_mode="decoupled"):
         value_info=[mid],
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index a9b98ecaf80b4c86fc1e9ccec23e6d97c5982f55..553f263ba2e004233011db90feabea057d88026a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -33,7 +33,7 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -67,7 +67,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = qonnx_make_model(graph, producer_name="thresholding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index a7e7eba7ee8de81ec5eebe3e270e8e1d28564a00..b80ef76a19e487a93b23ae7db17350e85fb66822 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -36,7 +36,11 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -106,7 +110,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index e3c79fa44fb57718d359b58d1a8716746f6668fb..b3cf7b4229c39f27c7f3689ef51fb7d22c7aa0f2 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -32,6 +32,7 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import qonnx_make_model
 
 from finn.analysis.fpgadataflow.res_estimation import (
     res_estimation,
@@ -87,7 +88,7 @@ def test_res_estimate():
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
index a3968cf79704092ffb5ec53c887842372b625f4d..628721b429abadf198126a2f5801178f2f710033 100644
--- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
@@ -35,7 +35,7 @@ from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -74,7 +74,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_
         nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="mp-model")
+    model = qonnx_make_model(graph, producer_name="mp-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 706679b6809844d0b2924411440088ea892ba7a9..96cd69c3453793c1634f132cb159f0cc8a94a28c 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -37,7 +37,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -93,7 +93,7 @@ def make_single_thresholding_modelwrapper(
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = qonnx_make_model(graph, producer_name="thresholding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index 03ddb1286320b8178276ea53082095106a43d7a1..abf8ba0b9efde67c77711abc8451475887430cae 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -132,7 +132,7 @@ def _make_single_vvau_modelwrapper(
         nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="vvau-model")
+    model = qonnx_make_model(graph, producer_name="vvau-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py
index 8ea0e18f2cace10b6fefae50ce1e28845ab24050..5355dd7044343d9dbb077225b5b8786eb7fdfe32 100644
--- a/tests/fpgadataflow/test_set_folding.py
+++ b/tests/fpgadataflow/test_set_folding.py
@@ -34,6 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import qonnx_make_model
 
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
@@ -91,7 +92,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes):
         outputs=[tensors[-1]],
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", adt)
diff --git a/tests/transformation/streamline/test_absorb_mul_into_topk.py b/tests/transformation/streamline/test_absorb_mul_into_topk.py
index a6dff788dc58dba45536a280c7fe5f5c53edc4e1..89ef74e0b3f83fc092268ad2582c533e47eab618 100644
--- a/tests/transformation/streamline/test_absorb_mul_into_topk.py
+++ b/tests/transformation/streamline/test_absorb_mul_into_topk.py
@@ -34,6 +34,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.insert_topk import InsertTopK
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.absorb import AbsorbScalarMulAddIntoTopK
@@ -65,7 +66,7 @@ def test_absorb_mul_into_topk(mul_positive, scalar):
         value_info=[a0, b0, c0],
     )
 
-    model = helper.make_model(mul_graph, producer_name="mul_model")
+    model = qonnx_make_model(mul_graph, producer_name="mul_model")
     model = ModelWrapper(model)
     # initialize values
     # for mul
diff --git a/tests/transformation/streamline/test_absorb_transp_into_flatten.py b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
index 1358d468c04c3edf08b11e7e9b858dda58965368..44b0c1d7e04447f13043cb326047a7b8d69469dd 100644
--- a/tests/transformation/streamline/test_absorb_transp_into_flatten.py
+++ b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
@@ -8,6 +8,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten
@@ -45,7 +46,7 @@ def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="absorb_transpose_model")
+    model = qonnx_make_model(graph, producer_name="absorb_transpose_model")
     model = ModelWrapper(model)
     if shape is not None:
         model.graph.value_info.append(shape0)
diff --git a/tests/transformation/streamline/test_collapse_repeated_op.py b/tests/transformation/streamline/test_collapse_repeated_op.py
index 268e0ffc5c5cb342634ff51ac8fe02157ae8c7c6..c1d3ee00883b84ec2a8c18d093b1756a4d6aea36 100644
--- a/tests/transformation/streamline/test_collapse_repeated_op.py
+++ b/tests/transformation/streamline/test_collapse_repeated_op.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import CollapseRepeatedAdd, CollapseRepeatedMul
@@ -46,7 +47,7 @@ def test_collapse_repeated_op():
     add_param_1 = oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [2])
     mul_param_1 = oh.make_tensor_value_info("mul_param_1", TensorProto.FLOAT, [2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -96,7 +97,7 @@ def test_collapse_repeated_only_if_linear(test_args):
     value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, [1])]
     value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, [1])]
 
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
index 04ab9bf0b9c092bdf2c2a6c6268974fd78020eee..89596a1c0f4af4b95e19f3b6aba19e7f459aa7df 100644
--- a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
+++ b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import FactorOutMulSignMagnitude
@@ -43,7 +44,7 @@ def test_factor_out_mul_sign_magnitude():
     top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, 2])
     mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_linear_past_eltwise.py b/tests/transformation/streamline/test_linear_past_eltwise.py
index 12633d750bb405757efca0c028dece92b289b472..4e5dcd63862b61f5575d8adf2cbb69912ee726d7 100644
--- a/tests/transformation/streamline/test_linear_past_eltwise.py
+++ b/tests/transformation/streamline/test_linear_past_eltwise.py
@@ -35,6 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.fold_constants import FoldConstants
 from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd
@@ -78,7 +79,7 @@ def make_model(shape):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="add-model")
+    model = qonnx_make_model(graph, producer_name="add-model")
     model = ModelWrapper(model)
 
     # set initializers for scalar add/mul nodes
@@ -156,7 +157,7 @@ def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim):
             helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
         ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_maxpool_nhwc.py b/tests/transformation/streamline/test_maxpool_nhwc.py
index aa77b5cf1a6e77d67ff8351ca5f544a63eb47f29..d61eedaaf5d1f10e64712d5282190b67f56acb49 100644
--- a/tests/transformation/streamline/test_maxpool_nhwc.py
+++ b/tests/transformation/streamline/test_maxpool_nhwc.py
@@ -7,7 +7,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
@@ -56,7 +56,7 @@ def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt)
         value_info=[outp_mp],
     )
 
-    model = oh.make_model(graph, producer_name="maxpool_model")
+    model = qonnx_make_model(graph, producer_name="maxpool_model")
     model = ModelWrapper(model)
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", idt)
diff --git a/tests/transformation/streamline/test_move_add_past_mul.py b/tests/transformation/streamline/test_move_add_past_mul.py
index 0fb4dd9f7a116d0d52578d7222421f251ac17ec1..ea9c2a954d2bd7b4a4be421c1869d4a8dd8f0cf1 100644
--- a/tests/transformation/streamline/test_move_add_past_mul.py
+++ b/tests/transformation/streamline/test_move_add_past_mul.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import MoveAddPastMul
@@ -44,7 +45,7 @@ def test_move_add_past_mul_single():
     add_param = oh.make_tensor_value_info("add_param", TensorProto.FLOAT, [2])
     mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -76,7 +77,7 @@ def test_move_add_past_mul_multi():
     add_param_1 = oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [2])
     mul_param_1 = oh.make_tensor_value_info("mul_param_1", TensorProto.FLOAT, [2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -116,7 +117,7 @@ def test_move_add_past_mul_only_if_linear():
     value_info += [oh.make_tensor_value_info("mul1_param", TensorProto.FLOAT, [1])]
     value_info += [oh.make_tensor_value_info("mul2_param", TensorProto.FLOAT, [1])]
     value_info += [oh.make_tensor_value_info("mul3_param", TensorProto.FLOAT, [1])]
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_chw_add_past_conv.py b/tests/transformation/streamline/test_move_chw_add_past_conv.py
index 7eb7f9f1af67efa1a6934157b9c2b3f8a6a814c2..e1b324a798a23b5f4a6878f5e2b27434a61fe8f8 100644
--- a/tests/transformation/streamline/test_move_chw_add_past_conv.py
+++ b/tests/transformation/streamline/test_move_chw_add_past_conv.py
@@ -33,6 +33,7 @@ from onnx import TensorProto, helper
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveAddPastConv
@@ -72,7 +73,7 @@ def test_move_chw_add_past_conv(idim, k, s, ich, och):
     add_node = helper.make_node("Add", ["inp", "a0"], ["add_out"])
     conv_node = helper.make_node("Conv", ["add_out", "a1"], ["outp"], **conv_config)
 
-    model = helper.make_model(
+    model = qonnx_make_model(
         helper.make_graph(
             nodes=[add_node, conv_node],
             name="move-add-graph",
diff --git a/tests/transformation/streamline/test_move_flatten_past_affine.py b/tests/transformation/streamline/test_move_flatten_past_affine.py
index 8c3f71d1f35de1b03fb33e53e41599fae7e02304..22c5e19fac700e147a36f74f10dad10614d47992 100644
--- a/tests/transformation/streamline/test_move_flatten_past_affine.py
+++ b/tests/transformation/streamline/test_move_flatten_past_affine.py
@@ -36,7 +36,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveFlattenPastAffine
@@ -74,7 +74,7 @@ def test_move_flatten_past_affine(data_layout, batch_size):
         value_info=[a0, a1, a2],
     )
 
-    model = helper.make_model(graph, producer_name="move_reshape_model")
+    model = qonnx_make_model(graph, producer_name="move_reshape_model")
     model = ModelWrapper(model)
 
     # initialize values
diff --git a/tests/transformation/streamline/test_move_flatten_past_topk.py b/tests/transformation/streamline/test_move_flatten_past_topk.py
index d1478088e2e8caaeb33fbec2880e74ea65905073..82336cd3e69d865e4c36536e7e0b16f092a7033d 100644
--- a/tests/transformation/streamline/test_move_flatten_past_topk.py
+++ b/tests/transformation/streamline/test_move_flatten_past_topk.py
@@ -36,7 +36,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.insert_topk import InsertTopK
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveFlattenPastTopK
@@ -67,7 +67,7 @@ def test_move_flatten_past_topk(data_layout, batch_size):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="move_flatten_model")
+    model = qonnx_make_model(graph, producer_name="move_flatten_model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", DataType["INT2"])
diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
index 4986363ff4dba0b0126babdbd1f393faa2df5de3..7be97631625354297c322267792520628454c4f9 100644
--- a/tests/transformation/streamline/test_move_identical_op_past_join_op.py
+++ b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
@@ -30,7 +30,7 @@ import pytest
 from onnx import TensorProto
 from onnx import helper as oh
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveTransposePastJoinAdd
@@ -81,7 +81,7 @@ def create_model(perm):
         ],
     )
 
-    onnx_model = oh.make_model(graph, producer_name="test_model")
+    onnx_model = qonnx_make_model(graph, producer_name="test_model")
     model = ModelWrapper(onnx_model)
 
     return model
diff --git a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
index bf25eee9e685d2536faf5bd25bc7b1aa36700463..6126acd9e388869c34cd0c73bb64f4b6c56b4c06 100644
--- a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
+++ b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
@@ -32,6 +32,7 @@ from onnx import TensorProto, helper
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold
@@ -99,7 +100,7 @@ def test_move_maxpool_past_multithreshold():
         )
     ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
index 401631a728412e7676fa804626601cfc58b5a5e3..72a6650ec4e6b853b79c93941af84dd15a7e5c47 100644
--- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py
+++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
@@ -33,7 +33,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveMulPastDWConv
@@ -94,7 +94,7 @@ def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw):
         value_info=[mul, W],
     )
 
-    model = helper.make_model(graph, producer_name="mulpastconv-model")
+    model = qonnx_make_model(graph, producer_name="mulpastconv-model")
     model = ModelWrapper(model)
     inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim])
     mul_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, 1, 1])
diff --git a/tests/transformation/streamline/test_move_mul_past_maxpool.py b/tests/transformation/streamline/test_move_mul_past_maxpool.py
index fcc1b6513230c548bdcc04a40aad793b64c6faf2..3bae2905a064b8372b520a7a8083905284343429 100755
--- a/tests/transformation/streamline/test_move_mul_past_maxpool.py
+++ b/tests/transformation/streamline/test_move_mul_past_maxpool.py
@@ -34,7 +34,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveMulPastMaxPool
@@ -92,7 +92,7 @@ def test_move_mul_past_maxpool(ifm_dim, ifm_ch, k, stride, pad, cw, negative):
         value_info=[mul],
     )
 
-    model = helper.make_model(graph, producer_name="mulpastmaxpool-model")
+    model = qonnx_make_model(graph, producer_name="mulpastmaxpool-model")
     model = ModelWrapper(model)
     inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim])
     mul_values = np.random.random_sample(mul_shape).astype(np.float32)
diff --git a/tests/transformation/streamline/test_move_scalar_past_conv.py b/tests/transformation/streamline/test_move_scalar_past_conv.py
index 59b8b8f8b2fee99bbb77c6d354620406a108cb54..bb99fd1d8f7d48ab9ad7038d78f5352f26f2ad06 100644
--- a/tests/transformation/streamline/test_move_scalar_past_conv.py
+++ b/tests/transformation/streamline/test_move_scalar_past_conv.py
@@ -32,6 +32,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import MoveAddPastConv, MoveScalarMulPastConv
@@ -79,7 +80,7 @@ def test_move_scalar_past_conv(test_args, padding):
     value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)]
     value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, conv_param_shape)]
 
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -158,7 +159,7 @@ def test_move_scalar_past_conv_only_if_linear(test_args):
     value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, conv_param_shape)]
     value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, conv_param_shape)]
 
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_scalar_past_matmul.py b/tests/transformation/streamline/test_move_scalar_past_matmul.py
index 6fdaaadfaea5862b566fd3a8d060ac28acadf1cd..6c788294bc739332c0b9bd0e98081bcb83330b53 100644
--- a/tests/transformation/streamline/test_move_scalar_past_matmul.py
+++ b/tests/transformation/streamline/test_move_scalar_past_matmul.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import (
@@ -47,7 +48,7 @@ def test_move_scalar_mul_past_matmul():
     mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 1])
     matmul_param = oh.make_tensor_value_info("matmul_param", TensorProto.FLOAT, [2, 2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -79,7 +80,7 @@ def test_move_scalar_add_past_matmul():
     add_param = oh.make_tensor_value_info("add_param", TensorProto.FLOAT, [1, 1])
     matmul_param = oh.make_tensor_value_info("matmul_param", TensorProto.FLOAT, [2, 2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -122,7 +123,7 @@ def test_move_scalar_past_matmul_only_if_linear(test_args):
     p2 = oh.make_tensor_value_info("p2", TensorProto.FLOAT, matmul_shape)
     p3 = oh.make_tensor_value_info("p3", TensorProto.FLOAT, matmul_shape)
     p4 = oh.make_tensor_value_info("p4", TensorProto.FLOAT, matmul_shape)
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
index 9662ba8a908e9bb793e0c0c2b078cf26adb5cef3..6bf72961ac06331c8ce972c8ca78dea99fb3c0a0 100644
--- a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
+++ b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
@@ -36,6 +36,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveTransposePastScalarMul
@@ -71,7 +72,7 @@ def test_move_transpose_past_scalar_mul(perm, scalar, data_layout):
         value_info=[a0],
     )
 
-    model = helper.make_model(graph, producer_name="mv_transpose_model")
+    model = qonnx_make_model(graph, producer_name="mv_transpose_model")
     model = ModelWrapper(model)
 
     # initialize values
diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py
index 1ec5f02e878a540a89cc37179b2e6dd76ede882c..85c60b37d5193de7ed2f38b9da6eb2e9b35b0150 100644
--- a/tests/transformation/streamline/test_round_thresholds.py
+++ b/tests/transformation/streamline/test_round_thresholds.py
@@ -32,6 +32,7 @@ import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline import RoundAndClipThresholds
@@ -46,7 +47,7 @@ def test_round_thresholds():
         "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general"
     )
     graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out])
-    model_def = helper.make_model(graph_def)
+    model_def = qonnx_make_model(graph_def)
     model = ModelWrapper(model_def)
     threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32)
     model.set_initializer("thresholds", threshold_val)
diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py b/tests/transformation/streamline/test_scale_resize_nhwc.py
index f10930f4e7d5aeb98a60630e7e4f48adfc371d59..5e107448f8d8cc78d572f846496ed541591dfe05 100644
--- a/tests/transformation/streamline/test_scale_resize_nhwc.py
+++ b/tests/transformation/streamline/test_scale_resize_nhwc.py
@@ -9,7 +9,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MakeScaleResizeNHWC
@@ -58,7 +58,7 @@ def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
         value_info=[outp_up, param, roi],
     )
 
-    model = oh.make_model(graph, producer_name="resize_model1")
+    model = qonnx_make_model(graph, producer_name="resize_model1")
     model = ModelWrapper(model)
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", idt)
@@ -113,7 +113,7 @@ def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt):
         value_info=[outp_tr, param, roi],
     )
 
-    model = oh.make_model(graph, producer_name="resize_model2")
+    model = qonnx_make_model(graph, producer_name="resize_model2")
     model = ModelWrapper(model)
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", idt)
@@ -180,7 +180,7 @@ def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
         value_info=[outp_up, outp_tr, param, roi],
     )
 
-    model = oh.make_model(graph, producer_name="resize_model3")
+    model = qonnx_make_model(graph, producer_name="resize_model3")
     model = ModelWrapper(model)
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", idt)