diff --git a/custom_hls/lookup.hpp b/custom_hls/lookup.hpp
index 3001f6613ec6ed9a9e5f47d9be356d4b032f7192..037b038a09a10ff2bd066740d20f0b47489e24e4 100644
--- a/custom_hls/lookup.hpp
+++ b/custom_hls/lookup.hpp
@@ -26,14 +26,15 @@
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- *******************************************************************************/
+*******************************************************************************/
+#ifndef LOOKUP_HPP
+#define LOOKUP_HPP
 
 #include <ap_int.h>
 #include <hls_stream.h>
 
-#ifndef LOOKUP_HPP
-#define LOOKUP_HPP
+#include "utils.hpp"
+
 
 template <
     unsigned NumEmbeddings,
@@ -57,4 +58,50 @@ void StreamingLookup(
     }
 }
 
+/**
+ * Lookup implementation over a table stored in AXI-accessible memory.
+ */
+template <
+	unsigned  EmbeddingSize,                            // Number of memory words per embedding
+	unsigned  EmbeddingAlign = clog2(EmbeddingSize),    // Alignment of entries = number of word index bits
+	typename  T_SRC,
+	typename  T_DST
+>
+void StreamingLookup_ext(
+	hls::stream<T_SRC> &in0,
+	hls::stream<T_DST> &out,
+	T_DST const *const  mem,
+	unsigned  const     size,
+	unsigned           &oob_count,
+	bool               &oob_irq
+) {
+#pragma HLS pipeline II=EmbeddingSize+9 style=flp
+
+	static unsigned  oob_count_li;
+	static unsigned  oob_count_int;
+#pragma HLS reset variable=oob_count_li
+#pragma HLS reset variable=oob_count_int
+
+	if(oob_count != oob_count_li) {
+		oob_count_int -= oob_count_li;
+		oob_count_li   = oob_count;
+	}
+	if(!in0.empty()) {
+		T_SRC const  x = in0.read();
+
+		// Map out-of-bounds inputs to an offset of zero and increment counter
+		bool  const  oob = x >= T_SRC(size);
+		ap_uint<T_SRC::width+EmbeddingAlign> const  ofs =
+			((oob? T_SRC(0) : x), ap_uint<EmbeddingAlign>(0));
+		oob_count_int += oob;
+
+		// Stream lookup data (burst inferred)
+		for(unsigned  i = 0; i < EmbeddingSize; i++) {
+#pragma HLS pipeline II=1 style=flp
+			out.write(mem[ofs+i]);
+		}
+	}
+	oob_count =  oob_count_int;
+	oob_irq   = (oob_count_int != 0);
+}
 #endif
diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index 9c18c03d7bdb8406d43aa8fc4efdb8a206b1217e..b3c669ec1097745bd30f650ca0b9dacda647c61d 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -65,7 +65,7 @@ RUN locale-gen "en_US.UTF-8"
 RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev
 RUN git clone https://github.com/verilator/verilator
 RUN cd verilator && \
-    git checkout v4.012 && \
+    git checkout v4.224 && \
     autoconf && \
     ./configure && \
     make -j4 && \
diff --git a/fetch-repos.sh b/fetch-repos.sh
index 74d910478e83ce9a18000350c04e213a3e1f381e..b0f6400ed142b203b1c9f6d7ea4ac6ababcf34d1 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,12 +27,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="398a0ecfcb32407c0a3df39246cf6d2bca02886c"
+QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6"
 FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366"
 BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
-PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2"
+PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="79d7c61fbe318bfcd56e3c35bbfb774995a7870c"
+HLSLIB_COMMIT="d27f6b6c5d8f1bb208db395659389603f63ad4be"
 OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index b4e89628a44bb1f55c3445ee8e6866beada23585..11cef604e0a3d106529a65ae229bc4cb419c4d70 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -69,7 +69,7 @@
 `define Q_srl
 
 
-module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
+module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
 
    parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
    parameter width = 16;   // - width of data (i_d, o_d)
@@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    wire               o_b;	// - output stream back-pressure
 
    output [addrwidth:0] count;  // - output number of elems in queue
+   output [addrwidth:0] maxcount;  // - maximum observed count since reset
 
+   reg [addrwidth:0] maxcount_reg;  // - maximum count seen until now
    reg    [addrwidth-1:0] addr, addr_, a_;		// - SRL16 address
 							//     for data output
    reg 			  shift_en_;			// - SRL16 shift enable
@@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    assign o_d = srlo;				// - output data from queue
    assign o_v = o_v_reg;			// - output valid if non-empty
    assign i_b = i_b_reg;			// - input bp if full
+   assign maxcount = maxcount_reg;
 
    assign i_r = !i_b;
    assign o_b = !o_r;
@@ -139,7 +142,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
 	 addr      <= 0;
          addr_full <= 0;
 	 o_v_reg   <= 0;
-	 i_b_reg   <= 1;
+
+	 i_b_reg   <= 0;
+	 maxcount_reg <= 0;
+
       end
       else begin
 	 state     <= state_;
@@ -147,6 +153,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
          addr_full <= addr_full_;
 	 o_v_reg   <= o_v_reg_;
 	 i_b_reg   <= i_b_reg_;
+	 maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg);
       end
    end // always @ (posedge clock)
 
diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv
new file mode 100644
index 0000000000000000000000000000000000000000..97517438a0c261e4488b74a677a352f9dc51743b
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_default.sv
@@ -0,0 +1,351 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+module $TOP_MODULE_NAME$_controller #(
+    int unsigned  LOOP_H_ITERATIONS    = $LOOP_H_ITERATIONS$,
+    int unsigned  LOOP_W_ITERATIONS    = $LOOP_W_ITERATIONS$,
+    int unsigned  LOOP_KH_ITERATIONS   = $LOOP_KH_ITERATIONS$,
+    int unsigned  LOOP_KW_ITERATIONS   = $LOOP_KW_ITERATIONS$,
+    int unsigned  LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$,
+
+    int unsigned  INCR_BITWIDTH = $INCR_BITWIDTH$,
+    bit [INCR_BITWIDTH-1:0]  ADDR_INCREMENT_MAP[6] = $ADDR_INCREMENT_MAP$,
+
+    bit IS_DEPTHWISE = $IS_DEPTHWISE$
+)(
+    input   logic  clk,
+    input   logic  rst_n,
+
+    input   logic  advance,
+    output  logic [INCR_BITWIDTH-1:0]  addr_incr,
+    output  logic [INCR_BITWIDTH-1:0]  tail_incr
+);
+
+    // state and counters
+    typedef enum logic [2:0] {
+        STATE_START,
+        STATE_LOOP_SIMD,
+        STATE_LOOP_KW,
+        STATE_LOOP_KH,
+        STATE_LOOP_W,
+        STATE_LOOP_H
+    }  state_e;
+    state_e  State = $INNERMOST_STATE$;
+    state_e  state_next;
+
+    logic signed [$clog2(LOOP_H_ITERATIONS   +2)+1-1:0]  Counter_loop_h    = LOOP_H_ITERATIONS-1;
+    logic signed [$clog2(LOOP_W_ITERATIONS   +2)+1-1:0]  Counter_loop_w    = LOOP_W_ITERATIONS-1;
+    logic signed [$clog2(LOOP_KH_ITERATIONS  +2)+1-1:0]  Counter_loop_kh   = LOOP_KH_ITERATIONS-1;
+    logic signed [$clog2(LOOP_KW_ITERATIONS  +2)+1-1:0]  Counter_loop_kw   = LOOP_KW_ITERATIONS-1;
+    logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0]  Counter_loop_simd = LOOP_SIMD_ITERATIONS-1;
+
+    assign  addr_incr = ADDR_INCREMENT_MAP[State];
+
+    // combinational logic for tail_incr generation
+    uwire  tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
+    always_comb begin : blkTail
+        if (tail_incr_inner_condition)
+            tail_incr = 1;
+        else if (Counter_loop_w >= 0)
+            tail_incr = $TAIL_INCR_W$;
+        else if (Counter_loop_h >= 0)
+            tail_incr = $TAIL_INCR_H$;
+        else
+            tail_incr = $TAIL_INCR_LAST$;
+    end
+
+    // combinational next state logic
+    always_comb begin : blkState
+        state_next = State;
+        if(State != $INNERMOST_STATE$)  state_next = $INNERMOST_STATE$;
+        else begin
+            if(Counter_loop_simd < 0) begin
+                state_next =
+                    (Counter_loop_kw >= 0)? STATE_LOOP_KW :
+                    (Counter_loop_kh >= 0)? STATE_LOOP_KH :
+                    (Counter_loop_w  >= 0)? STATE_LOOP_W :
+                    (Counter_loop_h  >= 0)? STATE_LOOP_H :
+                    /* else */              STATE_START;
+            end
+        end
+    end : blkState
+
+    // sequential logic
+    always_ff @ (posedge clk) begin
+        if(!rst_n) begin
+            State <= $INNERMOST_STATE$;
+            Counter_loop_h    <= LOOP_H_ITERATIONS-1;
+            Counter_loop_w    <= LOOP_W_ITERATIONS-1;
+            Counter_loop_kh   <= LOOP_KH_ITERATIONS-1;
+            Counter_loop_kw   <= LOOP_KW_ITERATIONS-1;
+            Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1;
+        end
+        else if(advance) begin
+            State <= state_next;
+            if (State == $INNERMOST_STATE$) begin
+                if(Counter_loop_simd >= 0)  Counter_loop_simd <= Counter_loop_simd-1;
+                else begin
+                    Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1;
+                    if(Counter_loop_kw >= 0)  Counter_loop_kw <= Counter_loop_kw-1;
+                    else begin
+                        Counter_loop_kw <= LOOP_KW_ITERATIONS-1;
+                        if(Counter_loop_kh >= 0)  Counter_loop_kh <= Counter_loop_kh-1;
+                        else begin
+                            Counter_loop_kh <= LOOP_KH_ITERATIONS-1;
+                            if(Counter_loop_w >= 0)  Counter_loop_w <= Counter_loop_w-1;
+                            else begin
+                                Counter_loop_w <= LOOP_W_ITERATIONS-1;
+                                if(Counter_loop_h >= 0)  Counter_loop_h <= Counter_loop_h-1;
+                                else  Counter_loop_h <= LOOP_H_ITERATIONS-1;
+                            end
+                        end
+                    end
+                end
+            end
+        end
+    end
+
+endmodule :  $TOP_MODULE_NAME$_controller
+
+module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+    int unsigned  WIDTH,
+    int unsigned  DEPTH
+)(
+    input   logic  clk,
+    input   logic  rst_n,
+
+    input   logic  write_enable,
+    input   logic [$clog2(DEPTH)-1:0] write_addr,
+    input   logic [WIDTH-1:0]  data_in,
+
+    input   logic  read_enable,
+    input   logic [$clog2(DEPTH)-1:0]  read_addr, // absolute (!) read address of cyclic buffer
+    output  logic [WIDTH-1:0]  data_out
+);
+
+    $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
+    logic [WIDTH-1:0]  Out = 'x;
+    always_ff @(posedge clk) begin
+        if (read_enable)  Out <= Ram[read_addr];
+        if (write_enable) Ram[write_addr] <= data_in;
+    end
+    assign  data_out = Out;
+
+endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
+
+module $TOP_MODULE_NAME$_impl #(
+    int  BIT_WIDTH,
+    int  SIMD,
+    int  MMV_IN,
+    int  MMV_OUT,
+    int  LAST_READ_ELEM = $LAST_READ_ELEM$,
+    int  LAST_WRITE_ELEM = $LAST_WRITE_ELEM$,
+    int  BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$,
+    int  ELEM_PER_WINDOW = $ELEM_PER_WINDOW$,
+    int  INCR_BITWIDTH = $INCR_BITWIDTH$
+)(
+    input   logic  ap_clk,
+    input   logic  ap_rst_n,
+
+    input   logic  in0_V_V_TVALID,
+    output  logic  in0_V_V_TREADY,
+    input   logic [BIT_WIDTH * SIMD * MMV_IN-1:0]  in0_V_V_TDATA,
+
+    output  logic  out_V_V_TVALID,
+    input   logic  out_V_V_TREADY,
+    output  logic [BIT_WIDTH * SIMD * MMV_OUT-1:0]  out_V_V_TDATA
+);
+    // derived Constants
+    localparam int unsigned  BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+    localparam int unsigned  BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
+    localparam int unsigned  BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+   // main buffer instantiation
+    uwire [BUF_IN_WIDTH -1:0]  window_buffer_in;
+    uwire [BUF_OUT_WIDTH-1:0]  window_buffer_out;
+    uwire  window_buffer_write_enable;
+    uwire  window_buffer_read_enable;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_write_addr;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_read_addr;
+    $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+        .WIDTH(BUF_IN_WIDTH),
+        .DEPTH(BUF_ELEM_TOTAL)
+    ) window_buffer_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+
+        .write_enable(window_buffer_write_enable),
+        .write_addr(window_buffer_write_addr),
+        .data_in(window_buffer_in),
+
+        .read_enable(window_buffer_read_enable),
+        .read_addr(window_buffer_read_addr),
+        .data_out(window_buffer_out)
+    );
+
+    //controller instantiation
+    uwire  advance_controller;
+    uwire signed [INCR_BITWIDTH-1:0]  addr_incr;
+    uwire        [INCR_BITWIDTH-1:0]  tail_incr;
+    $TOP_MODULE_NAME$_controller controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr)
+    );
+
+    // Counters/address registers
+    // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg,
+    // so we can use automatic sign extension and simplify calculations w/ signed increment.
+    // Alternatively, we could manually sign-extend and shave off a bit here or there.
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0]  Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  Current_elem = 0;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  First_elem_next_window = 0;
+    logic        [$clog2(ELEM_PER_WINDOW)   -1:0]  Position_in_window = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)+1  -1:0]  Window_buffer_read_addr_reg = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)-1:0]      Window_buffer_write_addr_reg = 0;
+
+    // Control signals/registers
+    uwire  read_cmd =
+        !reading_done && ( // if there is still an input element left to read
+            Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride)
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) &&
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem)
+            ) // (over-)write to buffer if oldest buffered element will no longer be needed
+        );
+    uwire  read_ok      = read_cmd && in0_V_V_TVALID;
+    uwire  reading_done = Newest_buffered_elem == LAST_READ_ELEM;
+
+    uwire  fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done;
+    logic  Fetching_done = 0;
+
+    logic  Write_cmd    = 0;
+    logic  Writing_done = 0;
+    uwire  write_ok      = Write_cmd &&  out_V_V_TREADY;
+    uwire  write_blocked = Write_cmd && !out_V_V_TREADY;;
+
+    //assign buffer control
+    assign  window_buffer_write_addr = Window_buffer_write_addr_reg;
+    assign  window_buffer_read_addr = Window_buffer_read_addr_reg;
+    assign  window_buffer_write_enable = read_ok;
+    assign  window_buffer_read_enable = fetch_cmd;
+    assign  advance_controller = fetch_cmd;
+
+    //assign I/O ports
+    assign  window_buffer_in = in0_V_V_TDATA;
+    assign  out_V_V_TDATA = window_buffer_out;
+    assign  in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+    assign  out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    //main process for advancing counters
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Newest_buffered_elem <= -1;
+            Current_elem <= 0;
+            First_elem_next_window <= 0;
+            Position_in_window <= 0;
+            Window_buffer_read_addr_reg <= 0;
+            Window_buffer_write_addr_reg <= 0;
+            Fetching_done <= 0;
+            Write_cmd <= 0;
+            Writing_done <= 0;
+        end
+        else begin
+            if (read_ok) begin
+                Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)? 0 : Window_buffer_write_addr_reg + 1;
+                Newest_buffered_elem <= Newest_buffered_elem+1;
+
+                if (Newest_buffered_elem == LAST_READ_ELEM-1) begin
+                    Window_buffer_write_addr_reg <= 0;
+                end
+                //check if this is the last read cycle (reading_done will be true afterwards)
+                if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin
+                    //start processing of next FM if writing is done already (possible due to unused input elements at the tail end)
+                    //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM)
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Writing_done <= 0;
+                    Fetching_done <= 0;
+                end
+            end
+
+            if (fetch_cmd) begin
+                //count up to track which element index is about to be read from the buffer, and where it is located within the buffer
+                //use increment value calculated by controller
+
+                // absolute buffer address wrap-around
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0]  ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr);
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0]  ra_correct =
+                    (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL :
+                    (ra <               0)?  BUF_ELEM_TOTAL : 0;
+                Window_buffer_read_addr_reg <= ra + ra_correct;
+
+                //keep track where we are within a window
+                Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0;
+
+                //update first element of next window to allow buffer overwrite up until that point
+                if (Position_in_window == 0)
+                    First_elem_next_window <= First_elem_next_window + tail_incr;
+
+                //check if this is the last write cycle (Writing_done will be true afterwards)
+                if (Current_elem == LAST_WRITE_ELEM)
+                    Fetching_done <= 1;
+                else
+                    Current_elem <= $signed(Current_elem) + addr_incr;
+
+                // determine if prefetched data will be outstanding in the next cycle
+                // if we fetch in this cycle -> yes
+                // if we do not fetch nor write -> do not change
+                // if we do not fetch but write successfully-> clear outstanding data
+                Write_cmd <= fetch_cmd;
+            end
+
+            if (write_ok)
+                Write_cmd <= fetch_cmd;
+
+            if (write_ok && Fetching_done) begin
+                //check if this is the last write cycle (Writing_done will be true afterwards)
+                if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin
+                    //start processing of next FM if reading is done already, or completes in the same cycle
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Fetching_done <= 0;
+                end else
+                    Writing_done <= 1;
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_impl
diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v
new file mode 100644
index 0000000000000000000000000000000000000000..0cc3579a255fddaf1a470d440b9e8ac245abe486
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_wrapper.v
@@ -0,0 +1,75 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+`timescale 1 ns / 1 ps
+
+module $TOP_MODULE_NAME$ (
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
+input  ap_clk,
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
+input  ap_rst_n,
+input  [BUF_IN_WIDTH-1:0] in0_V_TDATA,
+input  in0_V_TVALID,
+output in0_V_TREADY,
+output [BUF_OUT_WIDTH-1:0] out_V_TDATA,
+output out_V_TVALID,
+input  out_V_TREADY
+);
+
+// top-level parameters (set via code-generation)
+parameter BIT_WIDTH = $BIT_WIDTH$;
+parameter SIMD = $SIMD$;
+parameter MMV_IN = $MMV_IN$;
+parameter MMV_OUT = $MMV_OUT$;
+
+// derived constants
+parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+$TOP_MODULE_NAME$_impl
+#(
+    .BIT_WIDTH(BIT_WIDTH),
+    .SIMD(SIMD),
+    .MMV_IN(MMV_IN),
+    .MMV_OUT(MMV_OUT)
+)
+impl
+(
+    .ap_clk(ap_clk),
+    .ap_rst_n(ap_rst_n),
+    .in0_V_V_TDATA(in0_V_TDATA),
+    .in0_V_V_TVALID(in0_V_TVALID),
+    .in0_V_V_TREADY(in0_V_TREADY),
+    .out_V_V_TDATA(out_V_TDATA),
+    .out_V_V_TVALID(out_V_TVALID),
+    .out_V_V_TREADY(out_V_TREADY)
+);
+
+endmodule //TOP_MODULE_NAME
diff --git a/requirements.txt b/requirements.txt
index 970acc342bb7984e69929d1ef5eaa027b765ced0..9038a5e8170301421529e0b570482316e4fff20a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ onnx==1.11.0
 onnxoptimizer
 onnxruntime==1.11.1
 pre-commit==2.9.2
-protobuf==3.20.1
+protobuf==3.20.2
 pyscaffold==3.2.1
 scipy==1.5.2
 setupext-janitor>=1.1.2
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 92263bd82ce291833c6868847876ac7e3b68e6f8..d3c4156d9b4ccf601d3eea348f6cb61c0d9a6e87 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -37,6 +37,13 @@ from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
 from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map
 
 
+class AutoFIFOSizingMethod(str, Enum):
+    "Select the type of automatic FIFO sizing strategy."
+
+    CHARACTERIZE = "characterize"
+    LARGEFIFO_RTLSIM = "largefifo_rtlsim"
+
+
 class ShellFlowType(str, Enum):
     """For builds that produce a bitfile, select the shell flow that will integrate
     the FINN-generated accelerator."""
@@ -246,6 +253,12 @@ class DataflowBuildConfig:
     #: for each FIFO.
     auto_fifo_depths: Optional[bool] = True
 
+    #: When `auto_fifo_depths = True`, select which method will be used for
+    #: setting the FIFO sizes.
+    auto_fifo_strategy: Optional[
+        AutoFIFOSizingMethod
+    ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
+
     #: Memory resource type for large FIFOs
     #: Only relevant when `auto_fifo_depths = True`
     large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
@@ -258,6 +271,10 @@ class DataflowBuildConfig:
     #: Which memory mode will be used for compute layers
     default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED
 
+    #: Force inference of RTL ConvolutionInputGenerator over HLS implementation
+    #: If set to False, falls back to the default behavior of InferConvInpGen()
+    force_rtl_conv_inp_gen: Optional[bool] = False
+
     #: Which Vitis platform will be used.
     #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO`
     #: e.g. "xilinx_u250_xdma_201830_2"
@@ -316,6 +333,10 @@ class DataflowBuildConfig:
     #: Override the number of inputs for rtlsim performance measurement.
     rtlsim_batch_size: Optional[int] = 1
 
+    #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during
+    #: rtlsim, otherwise they will be replaced by HLS implementations.
+    rtlsim_use_vivado_comps: Optional[bool] = True
+
     def _resolve_hls_clk_period(self):
         if self.hls_clk_period_ns is None:
             # use same clk for synth and hls if not explicitly specified
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 59f77650da5c3c3f9db0ea65e2288544b376bec3..5da608c27def8136f9ad11f62b4707452eac3120 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -29,6 +29,7 @@
 import json
 import numpy as np
 import os
+import shutil
 from copy import deepcopy
 from distutils.dir_util import copy_tree
 from qonnx.core.modelwrapper import ModelWrapper
@@ -78,6 +79,10 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.derive_characteristic import (
+    DeriveCharacteristic,
+    DeriveFIFOSizes,
+)
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
@@ -85,6 +90,7 @@ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
@@ -121,81 +127,126 @@ def verify_step(
     verify_out_dir = cfg.output_dir + "/verification_output"
     intermediate_models_dir = cfg.output_dir + "/intermediate_models"
     os.makedirs(verify_out_dir, exist_ok=True)
-    (in_npy, exp_out_npy) = cfg._resolve_verification_io_pair()
-    if need_parent:
-        assert (
-            cfg.save_intermediate_models
-        ), "Enable save_intermediate_models for verification"
-        parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
-        child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
-        model.save(child_model_fn)
-        out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name
-        out_dict = execute_parent(
-            parent_model_fn, child_model_fn, in_npy, return_full_ctx=True
-        )
-        out_npy = out_dict[out_tensor_name]
-    else:
-        inp_tensor_name = model.graph.input[0].name
-        out_tensor_name = model.graph.output[0].name
-        inp_dict = {inp_tensor_name: in_npy}
-        if rtlsim_pre_hook is not None:
-            out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook)
+    (in_npy_all, exp_out_npy_all) = cfg._resolve_verification_io_pair()
+    bsize_in = in_npy_all.shape[0]
+    bsize_out = exp_out_npy_all.shape[0]
+    assert bsize_in == bsize_out, "Batch sizes don't match for verification IO pair"
+    all_res = True
+    for b in range(bsize_in):
+        in_npy = np.expand_dims(in_npy_all[b], axis=0)
+        exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0)
+        if need_parent:
+            assert (
+                cfg.save_intermediate_models
+            ), "Enable save_intermediate_models for verification"
+            parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
+            child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
+            model.save(child_model_fn)
+            parent_model = ModelWrapper(parent_model_fn)
+            out_tensor_name = parent_model.graph.output[0].name
+            exp_ishape = parent_model.get_tensor_shape(parent_model.graph.input[0].name)
+            if in_npy.shape != exp_ishape:
+                print(
+                    "Verification input has shape %s while model expects %s"
+                    % (str(in_npy.shape), str(exp_ishape))
+                )
+                print("Attempting to force model shape on verification input")
+                in_npy = in_npy.reshape(exp_ishape)
+            out_dict = execute_parent(
+                parent_model_fn, child_model_fn, in_npy, return_full_ctx=True
+            )
+            out_npy = out_dict[out_tensor_name]
         else:
-            out_dict = execute_onnx(model, inp_dict, True)
-        out_npy = out_dict[out_tensor_name]
-    res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all()
-    res_to_str = {True: "SUCCESS", False: "FAIL"}
-    res_str = res_to_str[res]
-    if cfg.verify_save_full_context:
-        verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % (
-            step_name,
-            res_str,
-        )
-        np.savez(verification_output_fn, **out_dict)
-    else:
-        verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (
-            step_name,
-            res_str,
-        )
-        np.save(verification_output_fn, out_npy)
-    print("Verification for %s : %s" % (step_name, res_str))
+            inp_tensor_name = model.graph.input[0].name
+            out_tensor_name = model.graph.output[0].name
+            exp_ishape = model.get_tensor_shape(inp_tensor_name)
+            if in_npy.shape != exp_ishape:
+                print(
+                    "Verification input has shape %s while model expects %s"
+                    % (str(in_npy.shape), str(exp_ishape))
+                )
+                print("Attempting to force model shape on verification input")
+                in_npy = in_npy.reshape(exp_ishape)
+            inp_dict = {inp_tensor_name: in_npy}
+            if rtlsim_pre_hook is not None:
+                out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook)
+            else:
+                out_dict = execute_onnx(model, inp_dict, True)
+            out_npy = out_dict[out_tensor_name]
+        exp_oshape = exp_out_npy.shape
+        if out_npy.shape != exp_oshape:
+            print(
+                "Verification output has shape %s while model produces %s"
+                % (str(exp_oshape), str(out_npy.shape))
+            )
+            print("Attempting to force model shape on verification output")
+            out_npy = out_npy.reshape(exp_oshape)
+
+        res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all()
+        all_res = all_res and res
+        res_to_str = {True: "SUCCESS", False: "FAIL"}
+        res_str = res_to_str[res]
+        if cfg.verify_save_full_context:
+            verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npz" % (
+                step_name,
+                b,
+                res_str,
+            )
+            np.savez(verification_output_fn, **out_dict)
+        else:
+            verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npy" % (
+                step_name,
+                b,
+                res_str,
+            )
+            np.save(verification_output_fn, out_npy)
+        if cfg.verify_save_rtlsim_waveforms:
+            vcd_path = model.get_metadata_prop("rtlsim_trace")
+            if vcd_path is not None and os.path.isfile(vcd_path):
+                new_vcd_path = vcd_path.replace(".vcd", "_%d.vcd" % b)
+                shutil.move(vcd_path, new_vcd_path)
+    print("Verification for %s : %s" % (step_name, res_to_str[all_res]))
 
 
 def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
-    need_restitch = False
-    # rtlsim only supports certain impl_style for some nodes
-    # StreamingFIFO must have impl_style=rtl
-    for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
-        inst = getCustomOp(fifo_layer)
-        if inst.get_nodeattr("impl_style") != "rtl":
-            inst.set_nodeattr("impl_style", "rtl")
-            inst.set_nodeattr("code_gen_dir_ipgen", "")
-            inst.set_nodeattr("ipgen_path", "")
-            need_restitch = True
-    # StreamingDataWidthConverter must have impl_style=hls
-    for dwc_layer in verify_model.get_nodes_by_op_type(
-        "StreamingDataWidthConverter_Batch"
-    ):
-        inst = getCustomOp(dwc_layer)
-        if inst.get_nodeattr("impl_style") != "hls":
-            inst.set_nodeattr("impl_style", "hls")
-            inst.set_nodeattr("code_gen_dir_ipgen", "")
-            inst.set_nodeattr("ipgen_path", "")
-            need_restitch = True
-    # if we've made alterations to the model, need to do some re-prep
-    if need_restitch:
-        print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
-        verify_model = verify_model.transform(
-            PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
-        )
-        verify_model = verify_model.transform(HLSSynthIP())
-        verify_model = verify_model.transform(
-            CreateStitchedIP(
-                cfg._resolve_fpga_part(),
-                cfg.synth_clk_period_ns,
-                vitis=False,
+    if not cfg.rtlsim_use_vivado_comps:
+        need_restitch = False
+        # switch impl_style=vivado components to rtl/hls
+        # StreamingFIFO must have impl_style=rtl
+        for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
+            inst = getCustomOp(fifo_layer)
+            if inst.get_nodeattr("impl_style") != "rtl":
+                inst.set_nodeattr("impl_style", "rtl")
+                inst.set_nodeattr("code_gen_dir_ipgen", "")
+                inst.set_nodeattr("ipgen_path", "")
+                need_restitch = True
+        # StreamingDataWidthConverter must have impl_style=hls
+        for dwc_layer in verify_model.get_nodes_by_op_type(
+            "StreamingDataWidthConverter_Batch"
+        ):
+            inst = getCustomOp(dwc_layer)
+            if inst.get_nodeattr("impl_style") != "hls":
+                inst.set_nodeattr("impl_style", "hls")
+                inst.set_nodeattr("code_gen_dir_ipgen", "")
+                inst.set_nodeattr("ipgen_path", "")
+                need_restitch = True
+        # if we've made alterations to the model, need to do some re-prep
+        if need_restitch:
+            print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
+            verify_model = verify_model.transform(
+                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
             )
-        )
+            verify_model = verify_model.transform(HLSSynthIP())
+            verify_model = verify_model.transform(
+                CreateStitchedIP(
+                    cfg._resolve_fpga_part(),
+                    cfg.synth_clk_period_ns,
+                    vitis=False,
+                )
+            )
+    else:
+        print("rtlsim_use_vivado_comps is enabled, may yield incorrect results")
+
     # set top-level prop for stitched-ip rtlsim and launch
     verify_model.set_metadata_prop("exec_mode", "rtlsim")
     # TODO make configurable
@@ -302,7 +353,10 @@ def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig):
     # needed for convolutions -- TODO always exec?
     need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0
     if need_conv:
-        model = model.transform(to_hls.InferConvInpGen())
+        if cfg.force_rtl_conv_inp_gen:
+            model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))
+        else:
+            model = model.transform(to_hls.InferConvInpGen())
         model = model.transform(to_hls.InferStreamingMaxPool())
         model = model.transform(RemoveCNVtoFCFlatten())
     # get rid of Tranpose -> Tranpose identity seq
@@ -446,9 +500,9 @@ def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
 def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
     Depending on the auto_fifo_depths setting, do one of the following:
-    * if auto_fifo_depths=True:  Run the `InsertAndSetFIFODepths` transformation
-    to attempt to determine the FIFO sizes that provide full throughput. Involves
-    running stitched-IP rtlsim and may take a long time.
+    * if auto_fifo_depths=True:  Run the appropriate auto-sizing transformation
+    to attempt to determine the FIFO sizes that provide full throughput.
+    May take a long time.
     * if auto_fifo_depths=False:  Assume the folding config file contains FIFO
     sizes as well. Runs the `InsertFIFO` transformation, then
     `ApplyConfig(cfg.folding_config_file)`, and finally `RemoveShallowFIFOs`.
@@ -457,13 +511,35 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
 
     if cfg.auto_fifo_depths:
-        model = model.transform(
-            InsertAndSetFIFODepths(
-                cfg._resolve_fpga_part(),
-                cfg._resolve_hls_clk_period(),
-                vivado_ram_style=cfg.large_fifo_mem_style,
+        if cfg.auto_fifo_strategy == "characterize":
+            model = model.transform(InsertDWC())
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(
+                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
             )
-        )
+            model = model.transform(HLSSynthIP())
+            model = model.transform(PrepareRTLSim())
+            model = model.transform(AnnotateCycles())
+            period = model.analysis(dataflow_performance)["max_cycles"] + 10
+            model = model.transform(DeriveCharacteristic(period))
+            model = model.transform(DeriveFIFOSizes())
+            model = model.transform(
+                InsertFIFO(
+                    vivado_ram_style=cfg.large_fifo_mem_style, max_qsrl_depth=256
+                )
+            )
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(GiveReadableTensorNames())
+        elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
+            model = model.transform(
+                InsertAndSetFIFODepths(
+                    cfg._resolve_fpga_part(),
+                    cfg._resolve_hls_clk_period(),
+                    vivado_ram_style=cfg.large_fifo_mem_style,
+                )
+            )
+        else:
+            assert "Unsupported auto_fifo_strategy: " + cfg.auto_fifo_strategy
     else:
         # assume folding cfg json contains FIFO sizes too
         # insert DWCs, FIFOs and run ApplyConfig once more
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 2c7c86c64ea1279cb18cf8342aa20fb2792bdaf5..e5eb483a00f6890f5eeb16c5cec533a4533c9f15 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -36,8 +36,12 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
 from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import (
     ConvolutionInputGenerator1D,
 )
+from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import (
+    ConvolutionInputGenerator_rtl,
+)
 from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise
 from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
 from finn.custom_op.fpgadataflow.iodma import IODMA
@@ -67,6 +71,7 @@ custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
 custom_op["MatrixVectorActivation"] = MatrixVectorActivation
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
 custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
+custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
 custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
@@ -85,3 +90,4 @@ custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch
 custom_op["Lookup"] = Lookup
 custom_op["StreamingConcat"] = StreamingConcat
 custom_op["CheckSum"] = CheckSum
+custom_op["StreamingEltwise"] = StreamingEltwise
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 13a4c5892c8f82c37e1794057a06217981a6a580..cd0af6b3ab3d8250abbf7d48e004622e55f09f04 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -42,18 +42,21 @@ class AddStreams_Batch(HLSCustomOp):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
-        my_attrs = {
-            "NumChannels": ("i", True, ""),
-            "PE": ("i", True, ""),
-            # FINN DataTypes for inputs; output datatype inferred from input
-            "inputDataType": ("s", True, ""),
-            # number of input vectors, examples:
-            # [1] is a single vector (like a FC layer with batch=1)
-            # [4] is four vectors (like a FC layer with batch=4)
-            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
-            "numInputVectors": ("ints", False, [1]),
-        }
-        my_attrs.update(super().get_nodeattr_types())
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                "NumChannels": ("i", True, ""),
+                "PE": ("i", True, ""),
+                # FINN DataTypes for inputs; output datatype inferred from input
+                "inputDataType": ("s", True, ""),
+                # number of input vectors, examples:
+                # [1] is a single vector (like a FC layer with batch=1)
+                # [4] is four vectors (like a FC layer with batch=4)
+                # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+                "numInputVectors": ("ints", False, [1]),
+                "inFIFODepths": ("ints", False, [2, 2]),
+            }
+        )
         return my_attrs
 
     def get_normal_input_shape(self, ind=0):
@@ -70,10 +73,10 @@ class AddStreams_Batch(HLSCustomOp):
         ishape = tuple(vecs + [ich // pe, pe])
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()
 
     def make_shape_compatible_op(self, model):
@@ -124,11 +127,11 @@ class AddStreams_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # we need to set output datatype to the next larger int or uint
         # enhancement: consider specifying w/ explicit outputDataType attribute
@@ -139,14 +142,14 @@ class AddStreams_Batch(HLSCustomOp):
         else:
             return DataType.get_smallest_possible(2 * idt.max())
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
@@ -357,3 +360,14 @@ class AddStreams_Batch(HLSCustomOp):
         swidth = self.get_instream_width_padded()
         intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
         return intf_names
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+                "in1": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 3ed76db2982e411b711be5bd78e39dd866332714..46adca680d3c96695eeb5a91be53ea158fc78f1f 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -102,9 +102,6 @@ class ChannelwiseOp_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "paramDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -221,23 +218,23 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         # total cost
         return comparator_cost + lutram_cost
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         fold = ich // pe
@@ -245,17 +242,17 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         folded_input_shape = tuple(vecs + [fold, pe])
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # same shape as input
         return self.get_folded_input_shape()
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [ich])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
 
diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py
index bde285eb0dd1b3818926c1feb7ac8d5de69a4be6..c927c07df21faf40ccbf9ddbe47e3f2f2ca61c89 100644
--- a/src/finn/custom_op/fpgadataflow/checksum.py
+++ b/src/finn/custom_op/fpgadataflow/checksum.py
@@ -77,31 +77,31 @@ class CheckSum(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # here same as input data type
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("inputDataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         return self.get_instream_width()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         # derive normal shape from folded shape
         # checksum nodes are inserted in between fpgadataflow nodes
         # the folded shape could be for example (1, nf, pe)
@@ -127,7 +127,7 @@ class CheckSum(HLSCustomOp):
     def get_ap_int_max_w(self):
         return max(super().get_ap_int_max_w(), 32)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
 
diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py
index 5fcf9cf96cbacd4e444af0b90618a19eefb9bfe2..4437bcd1984c5194b0a19b43d692babb7e3cd158 100644
--- a/src/finn/custom_op/fpgadataflow/concat.py
+++ b/src/finn/custom_op/fpgadataflow/concat.py
@@ -74,12 +74,12 @@ class StreamingConcat(HLSCustomOp):
     def get_folded_input_shape(self, ind=0):
         return self.get_normal_input_shape(ind)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         total_elems = self.get_total_elems()
         vecs = list(self.get_nodeattr("numInputVectors"))
         return tuple(vecs + [total_elems])
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_normal_output_shape()
 
     def make_shape_compatible_op(self, model):
@@ -106,7 +106,7 @@ class StreamingConcat(HLSCustomOp):
         # input dt identical for all inputs
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         return self.get_input_datatype()
 
     def get_instream_width(self, ind=0):
@@ -115,7 +115,7 @@ class StreamingConcat(HLSCustomOp):
         ibits = self.get_input_datatype().bitwidth()
         return elems * ibits
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         total_elems = self.get_total_elems()
         out_width = total_elems * obits
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 251a9882c58a3cf94449701795b72c8a6adab318..1566445999a2c568b5c5a112d436bf05fd89aca5 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -99,13 +99,13 @@ class ConvolutionInputGenerator(HLSCustomOp):
             assert ret[0] == ret[1] == 1, "Only dilation=1 supported"
         return ret
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
@@ -114,7 +114,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -126,7 +126,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -158,15 +158,15 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
@@ -176,7 +176,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         in_width = simd * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns stream width, input and output stream width are equal for
         the sliding window function, so the function to determine the input
         stream width can be reused."""
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index aba74baecc0f40571fa288459a04ad42e167ccf6..f1c84662cc06e89df5bd7c0762ac47b8c5723502 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -91,13 +91,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
@@ -106,7 +106,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -118,7 +118,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -153,15 +153,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -169,7 +169,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         in_width = simd * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         if self.use_parallel_window_output():
             # feed all window pixels in parallel
             k_h, k_w = self.get_nodeattr("ConvKernelDim")
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
new file mode 100755
index 0000000000000000000000000000000000000000..5424050a8ed0a353894721d5bba28c1d45e62771
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -0,0 +1,834 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+from math import copysign
+from qonnx.core.datatype import DataType
+from qonnx.custom_op.general import im2col
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+# RTL Convolution Input Generator / Sliding Window Generator (SWG)
+# Matches and extends the functionality of all ConvolutionInputGenerator_* functions
+# in finn-hlslib by generating HDL code for two different implementation styles:
+# - Addressable cyclic buffer: to be used when out_width <= in_width
+# - Parallel registers + line buffers: to be used when out_width > in_width
+# Supports non-square, 1D, strided, dilated, and depthwise convolutions.
+# Note: the actual data layout produced is different for depthwise and non-depthwise:
+# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD)
+# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD)
+
+# NOTE: "Parallel" implementation style not yet implemented in this version!
+
+
+class ConvolutionInputGenerator_rtl(HLSCustomOp):
+    """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants. Generates an RTL ConvolutionInputGenerator
+    implementation based on (System-)Verilog templates, defined in finn-rtllib/swg."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ConvKernelDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "IFMChannels": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "SIMD": ("i", True, 0),
+            # additional parallelization parameter - not yet implemented
+            "M": ("i", False, 1),
+            # alternative implementation style - not yet implemented
+            "parallel_window": ("i", False, 0, {0}),
+            "Stride": ("ints", True, []),  # [H, W] = [Y, X]
+            "Dilation": ("ints", True, []),  # [H, W] = [Y, X]
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0, {0, 1}),
+            # FPGA resource type for ConvolutionInputGenerator input buffer
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        wf = int(ifm_ch / simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
+        return folded_ishape
+
+    def get_normal_output_shape(self, ind=0):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
+        return oshape
+
+    def get_folded_output_shape(self, ind=0):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        simd = self.get_nodeattr("SIMD")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        if self.get_nodeattr("parallel_window"):
+            wf = int((ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        return folded_oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        in_width = simd * ibits
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        if self.get_nodeattr("parallel_window"):
+            # feed all window pixels in parallel
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            return self.get_instream_width() * k_h * k_w
+        else:
+            # if parallel variant not in use: same width for output and input stream
+            return self.get_instream_width()
+
+    def get_number_input_values(self):
+        folded_ishape = self.get_folded_input_shape()
+        num_input_elems = np.prod(folded_ishape[:-1])
+        return num_input_elems
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        num_output_elems = np.prod(folded_oshape[:-1])
+        return num_output_elems
+
+    def get_1d_conv_attrs_normalized(self):
+        # normalize FM dimensions so that:
+        # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D].
+        # The dummy ('1') dimension is the Y-dimension.
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+
+        if ifm_dim[1] == 1:
+            ifm_dim = ifm_dim[::-1]
+            ofm_dim = ofm_dim[::-1]
+            k = k[::-1]
+            stride = stride[::-1]
+            dilation = dilation[::-1]
+
+        return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
+
+    def get_buffer_depth(self):
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        simd = self.get_nodeattr("SIMD")
+
+        k_h, k_w = k
+        h, w = ifm_dim
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        mmv_in = 1
+        mmv_out = 1
+        channel_factor = int(ifm_ch / simd)
+
+        impl_style = self.select_impl_style()
+        if impl_style == "default":
+            # compute minimal buffer length (assuming it holds 1 complete window)
+            buffer_min_size = (
+                (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
+            ) * channel_factor
+
+            # add additional buffer space in case of stride > 1
+            # this minimizes cycle count as it allows an earlier pre-load of inputs
+            buffer_depth = (
+                buffer_min_size
+                + max(
+                    0,
+                    ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in)))
+                    * channel_factor,
+                )
+                + max(
+                    0,
+                    ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in)))
+                    * channel_factor,
+                )
+            )
+        else:
+            buffer_depth = 0
+            raise Exception("Requested impl. style not implemented")
+        return buffer_depth
+
+    def get_exp_cycles(self):
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        depthwise = self.get_nodeattr("depthwise")
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h, ofm_dim_w = ofm_dim
+        k_h, k_w = k
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
+        channel_factor = int(ifm_ch / simd)
+
+        if ifm_dim_h == 1 or ifm_dim_w == 1:
+            # 1D case
+            (
+                ifm_ch,
+                [ifm_dim_h, ifm_dim_w],
+                [ofm_dim_h, ofm_dim_w],
+                [k_h, k_w],
+                [stride_h, stride_w],
+                [dilation_h, dilation_w],
+            ) = self.get_1d_conv_attrs_normalized()
+
+            if depthwise:
+                exp_cycles = (
+                    +ofm_dim_w * k_w * channel_factor
+                    + channel_factor * (k_w - 1) * (stride_w - 1)
+                    - (k_w - 1)
+                    + 2
+                )
+            else:
+                exp_cycles = ofm_dim_w * k_w * channel_factor + 2
+        else:
+            # 2D case
+            buffer_min_size = (
+                (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1
+            ) * channel_factor
+            cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor
+            cycles_read_block = stride_w * ifm_dim_w * channel_factor
+            max_cycles = max(cycles_write_block, cycles_read_block)
+            if depthwise:
+                max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1)
+            exp_cycles = buffer_min_size + ofm_dim_h * max_cycles  # initial buffering
+            if depthwise:
+                exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor
+
+        return int(exp_cycles)
+
+    def bram_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ram_style = self.get_nodeattr("ram_style")
+
+        # NOTE: Actual BRAM usage might be lower in some cases.
+        # This does not account for the exact Vivado behavior yet.
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.get_buffer_depth()
+        if ram_style == "block" or ram_style == "auto":
+            if buffer_depth <= 512:
+                ram_width = 36
+            elif buffer_depth <= 1024:
+                ram_width = 18
+            elif buffer_depth <= 2048:
+                ram_width = 9
+            elif buffer_depth <= 4096:
+                ram_width = 4
+            elif buffer_depth <= 8192:
+                ram_width = 2
+            else:
+                ram_width = 1
+
+            ram_cascade_depth = math.ceil(buffer_depth / 16384)
+            ram_cascade_width = math.ceil(buffer_width / ram_width)
+            cascade_savings = 0
+            if buffer_depth > 16384:
+                remainder_depth = buffer_depth % 16384
+                if remainder_depth <= 512:
+                    remainder_width = 36
+                elif remainder_depth <= 1024:
+                    remainder_width = 18
+                elif remainder_depth <= 2048:
+                    remainder_width = 9
+                elif remainder_depth <= 4096:
+                    remainder_width = 4
+                elif remainder_depth <= 8192:
+                    remainder_width = 2
+                else:
+                    remainder_width = 1
+
+                remainder_cascade_width = math.ceil(buffer_width / remainder_width)
+                cascade_savings = ram_cascade_width - remainder_cascade_width
+
+            return int(ram_cascade_depth * ram_cascade_width - cascade_savings)
+        else:
+            return 0
+
+    def lut_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ram_style = self.get_nodeattr("ram_style")
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.get_buffer_depth()
+        if ram_style == "distributed":
+            ram_luts = int(buffer_width * math.ceil(buffer_depth / 38))
+        else:
+            ram_luts = 0
+        return 300 + ram_luts
+
+    def uram_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ram_style = self.get_nodeattr("ram_style")
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.get_buffer_depth()
+
+        if ram_style == "ultra":
+            ram_depth = 4096
+            ram_width = 72
+            ram_cascade_depth = math.ceil(buffer_depth / ram_depth)
+            ram_cascade_width = math.ceil(buffer_width / ram_width)
+            return int(ram_cascade_depth * ram_cascade_width)
+        else:
+            return 0
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for RTL SWG, please set exec_mode to rtlsim"
+            )
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
+            # store bipolar activations as binary
+            inp = (inp + 1) / 2
+            export_idt = DataType["BINARY"]
+        else:
+            export_idt = self.get_input_datatype()
+
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        rtlsim_inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+        odt = export_idt
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+        context[node.output[0]] = output
+
+        # binary -> bipolar if needed
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
+            out = context[node.output[0]]
+            out = 2 * out - 1
+            context[node.output[0]] = out
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output
+        shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
+
+    def prepare_codegen_default(self):
+        # Default implementation style for MMV_out = 1: addressable cyclic buffer
+        # Computing incremental addressing scheme directly..
+        template_path = (
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_default.sv"
+        )
+        code_gen_dict = {}
+
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        depthwise = self.get_nodeattr("depthwise")
+        simd = self.get_nodeattr("SIMD")
+
+        k_h, k_w = k
+        h, w = ifm_dim
+        pad = [0, 0, 0, 0]  # padding happens in separate padding node for now
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h)
+        out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w)
+        mmv_in = 1
+        mmv_out = 1
+        channel_factor = int(ifm_ch / simd)
+
+        # compute minimal buffer length (assuming it holds 1 complete window)
+        buffer_min_size = (
+            (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
+        ) * channel_factor
+
+        buffer_actual_size = self.get_buffer_depth()
+        code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)]
+
+        # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation
+        # or cols/rows that are skipped due to imperfect stride<->dim combination
+        kernel_width = (k_w - 1) * dilation_w + 1
+        kernel_height = (k_h - 1) * dilation_h + 1
+        skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w)
+        skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h)
+
+        # compute address increment values for 5-loop nest
+        addr_incr_end_simd = 1
+        addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1
+        addr_incr_end_window_row = (
+            ((w - kernel_width) * channel_factor)  # remaining line
+            + ((dilation_h - 1) * w * channel_factor)  # skip lines
+            + 1  # wrap-around of minimally sized buffer
+        )
+        addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1
+        addr_incr_end_row = (
+            -buffer_min_size
+            + ((skip_columns + kernel_width) * channel_factor)  # remaining line
+            + ((stride_h - 1) * w * channel_factor)  # skip lines
+            + 1
+        )
+
+        # re-use same controller structure -> re-assign address increments
+        if depthwise:
+            addr_incr_end_window_elem = dilation_w * channel_factor
+            addr_incr_end_window_row = (
+                channel_factor
+                + (w - kernel_width) * channel_factor
+                + (dilation_h - 1) * w * channel_factor
+            )
+            addr_incr_end_simd = -buffer_min_size + (channel_factor + 1)
+
+        # sanity check
+        assert not (
+            abs(addr_incr_end_window) > buffer_actual_size
+        ), "ERROR: W increment > buffer size, wrap logic doesn't account for this"
+        assert not (
+            abs(addr_incr_end_row) > buffer_actual_size
+        ), "ERROR: H increment > buffer size, wrap logic doesn't account for this"
+
+        # set certain threshold indices to detect when reading/writing finishes
+        code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)]
+        code_gen_dict["$LAST_WRITE_ELEM$"] = [
+            str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1)
+        ]
+
+        # default controller loop structure: # iterations (counters) map directly
+        loop_h_iterations = out_dim_h
+        loop_w_iterations = out_dim_w
+        loop_kh_iterations = k_h
+        loop_kw_iterations = k_w
+        loop_simd_iterations = channel_factor
+
+        if depthwise and channel_factor > 1:
+            # re-arrange existing controller loop structure for depthwise convolutions
+            loop_kh_iterations = channel_factor
+            loop_kw_iterations = k_h
+            loop_simd_iterations = k_w
+            addr_incr_end_simd_ = addr_incr_end_simd
+            addr_incr_end_simd = addr_incr_end_window_elem
+            addr_incr_end_window_elem = addr_incr_end_window_row
+            addr_incr_end_window_row = addr_incr_end_simd_
+            elem_per_window = k_h * k_w
+
+            tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor
+            tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor
+            tail_incr_last_window = buffer_min_size - 1
+            code_gen_dict["$IS_DEPTHWISE$"] = ["1"]
+        else:
+            # depthwise output format is equivalent to non-depthwise if SIMD=C
+            elem_per_window = k_h * k_w * channel_factor
+
+            tail_incr_w = addr_incr_end_window + buffer_min_size - 1
+            tail_incr_h = addr_incr_end_row + buffer_min_size - 1
+            tail_incr_last_window = buffer_min_size - 1
+            code_gen_dict["$IS_DEPTHWISE$"] = ["0"]
+
+        code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)]
+        code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)]
+        code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)]
+
+        # support SIMD = IFMChannels and k_w = 1 cases
+        # for k = [k_h, k_w] = [1, k_w], no adjustment is needed
+        # for k = [k_h, k_w] = [1, 1], do not use this impl. style (mmv_out=K=1)
+        # innermost loop is executed at least once -> adjust if needed
+        if loop_simd_iterations == 1:
+            # skip innermost SIMD loop completely
+            if loop_kw_iterations == 1:
+                # skip innermost KW loop completely
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"]
+                loop_kh_iterations -= 1  # -1 because state is initial state
+            else:
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"]
+                loop_kw_iterations -= 1  # -1 because state is initial state
+        else:
+            code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"]
+            loop_simd_iterations -= 1  # -1 because state is initial state
+
+        code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 1)]
+        code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 1)]
+        code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 1)]
+        code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 1)]
+        code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 1)]
+
+        incr_bitwidth = 1 + math.ceil(
+            math.log2(
+                max(
+                    abs(addr_incr_end_simd) + 1,
+                    abs(addr_incr_end_window_elem) + 1,
+                    abs(addr_incr_end_window_row) + 1,
+                    abs(addr_incr_end_window) + 1,
+                    abs(addr_incr_end_row) + 1,
+                    abs(tail_incr_w) + 1,
+                    abs(tail_incr_h) + 1,
+                    abs(tail_incr_last_window) + 1,
+                )
+            )
+        )
+        code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)]
+        code_gen_dict["$ADDR_INCREMENT_MAP$"] = [
+            "'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(
+                incr_bitwidth,
+                int(copysign(incr_bitwidth, addr_incr_end_simd)),
+                abs(addr_incr_end_simd),
+                int(copysign(incr_bitwidth, addr_incr_end_window_elem)),
+                abs(addr_incr_end_window_elem),
+                int(copysign(incr_bitwidth, addr_incr_end_window_row)),
+                abs(addr_incr_end_window_row),
+                int(copysign(incr_bitwidth, addr_incr_end_window)),
+                abs(addr_incr_end_window),
+                int(copysign(incr_bitwidth, addr_incr_end_row)),
+                abs(addr_incr_end_row),
+            )
+        ]
+
+        code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)]
+        code_gen_dict["$SIMD$"] = [str(simd)]
+        code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
+        code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]
+
+        return template_path, code_gen_dict
+
+    def select_impl_style(self):
+        simd = self.get_nodeattr("SIMD")
+        M = self.get_nodeattr("M")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        k_h, k_w = k
+        kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+        kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
+
+        # check for valid configuration
+        assert (
+            kernel_height <= ifm_dim_h
+            and kernel_width <= ifm_dim_w
+            and stride_h <= ifm_dim_h
+            and stride_w <= ifm_dim_w
+        ), "Illegal conv configuration: kernel or stride > FM dimension"
+
+        # init folding config
+        if self.get_nodeattr("parallel_window"):
+            # mmv_in = M * 1
+            mmv_out = M * k_h * k_w
+            assert (
+                ifm_ch == simd
+            ), "Constraint violated: SIMD must be equal to IFMChannels"
+        else:
+            # mmv_in = 1
+            mmv_out = 1
+            assert (
+                ifm_ch % simd == 0
+            ), "Constraint violated: SIMD must divide IFMChannels"
+
+        # choose implementation style
+        if mmv_out > 1 or (k_h == 1 and k_w == 1):
+            impl_style = "parallel"
+            assert (
+                ifm_ch == simd
+            ), "Constraint violated: SIMD must be equal to IFMChannels"
+        else:
+            impl_style = "default"
+
+        assert (
+            impl_style == "default"
+        ), "ERROR: Parallel window mode not yet implemented"
+        return impl_style
+
+    def generate_hdl(self):
+        impl_style = self.select_impl_style()
+
+        # prepare code generation by filling out dictionaries
+        if impl_style == "default":
+            template_path, code_gen_dict = self.prepare_codegen_default()
+        else:
+            raise Exception("Requested impl. style not implemented")
+
+        # add general parameters to dictionary
+        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+        code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())]
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "auto":
+            code_gen_dict["$RAM_STYLE$"] = [""]
+        else:
+            code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)]
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        with open(
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_wrapper.v", "r"
+        ) as f:
+            template_wrapper = f.read()
+        for key in code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+            template_wrapper = template_wrapper.replace(key, code_gen_line)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+            ),
+            "w",
+        ) as f:
+            f.write(template)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+            ),
+            "w",
+        ) as f:
+            f.write(template_wrapper)
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stich_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+        # Modified to use generated (System-)Verilog instead of HLS output products
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            self.get_nodeattr("gen_top_module") + "_wrapper.v",
+            self.get_nodeattr("gen_top_module") + "_impl.sv",
+        ]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        cmd = [
+            "add_files -norecurse %s"
+            % (
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                )
+            ),
+            "add_files -norecurse %s"
+            % (
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+                )
+            ),
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name),
+        ]
+
+        return cmd
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: Generates (System-)Verilog code for IP generation."""
+        self.generate_hdl()
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation."""
+        pass
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim)."""
+        pass
+
+    def compile_singlenode_code(self):
+        pass
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
index da29a524b6bba7ce0c7a71bc64a44ae128d91709..b7efaff440dd5cc2160fbfb8050b30924460ffe6 100644
--- a/src/finn/custom_op/fpgadataflow/downsampler.py
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -36,7 +36,7 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
 class DownSampler(HLSCustomOp):
-    """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function.
+    """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function.
     Basically performs a down sampling of the image removing rows and columns."""
 
     def __init__(self, onnx_node):
@@ -55,6 +55,10 @@ class DownSampler(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             # Batch size
             "numInputVectors": ("i", False, 1),
+            # 1D (True) or 2D (False) spatial data
+            "is1D": ("i", False, 0),
+            # for 1D only: (D, 1) (True) or (1, D) dims
+            "is1D_unitx": ("i", False, 1),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -66,28 +70,46 @@ class DownSampler(HLSCustomOp):
         return int(np.floor((idim - 1) / stride) + 1)
 
     def get_exp_cycles(self):
+        is_1D = self.get_nodeattr("is1D")
         idim = self.get_nodeattr("ImgDim")
+        idim_total = idim if is_1D else idim * idim
         channels = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
         batch_size = self.get_nodeattr("numInputVectors")
-        exp_cycles = channels / simd * batch_size * idim * idim
+        exp_cycles = channels / simd * batch_size * idim_total
         return int(exp_cycles)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
+        is_1D = self.get_nodeattr("is1D")
+        is_1D_unitx = self.get_nodeattr("is1D_unitx")
         idim = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
-        ishape = (batch, idim, idim, num_ch)
+        if is_1D:
+            if is_1D_unitx:
+                ishape = (batch, idim, 1, num_ch)
+            else:
+                ishape = (batch, 1, idim, num_ch)
+        else:
+            ishape = (batch, idim, idim, num_ch)
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
+        is_1D = self.get_nodeattr("is1D")
+        is_1D_unitx = self.get_nodeattr("is1D_unitx")
         odim = self.get_downsampled_odim()
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
-        oshape = (batch, odim, odim, num_ch)
+        if is_1D:
+            if is_1D_unitx:
+                oshape = (batch, odim, 1, num_ch)
+            else:
+                oshape = (batch, 1, odim, num_ch)
+        else:
+            oshape = (batch, odim, odim, num_ch)
         return oshape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -96,7 +118,7 @@ class DownSampler(HLSCustomOp):
         folded_ishape = normal_ishape[:-1] + [fold, simd]
         return tuple(folded_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -129,21 +151,21 @@ class DownSampler(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return ibits * simd
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return obits * simd
@@ -204,8 +226,9 @@ class DownSampler(HLSCustomOp):
         )
 
     def docompute(self):
+        dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D"
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """ConvolutionInputGenerator_kernel1<IFMChannels, Input_precision,
+            f"""ConvolutionInputGenerator_{dim_var}_kernel1<IFMChannels, Input_precision,
             IFMDim, SIMD,Stride> (in0, out, numReps);"""
         ]
 
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 04ca45e7f1c1844a9976d46392be46f6cffc2167..93cde15ca7d42dbed12417837916359fdcc71b67 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -61,13 +61,13 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def get_num_output_streams(self):
         return self.get_nodeattr("NumOutputStreams")
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [ch])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -138,22 +138,22 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
@@ -408,3 +408,13 @@ class DuplicateStreams_Batch(HLSCustomOp):
                 ("out%d_%s" % (i, sname), self.get_outstream_width_padded())
             )
         return intf_names
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out0": [], "out1": []},
+        }
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6284750c73026c09fb7986ffc2517ed9ae3b153
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/eltwise.py
@@ -0,0 +1,466 @@
+# Copyright (c) 2022, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingEltwise(HLSCustomOp):
+    """Class that corresponds to finn-hlslib StreamingEltwise function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                "NumChannels": ("i", True, ""),
+                "PE": ("i", True, ""),
+                # FINN DataTypes for inputs; output datatype inferred from input
+                "inputDataType0": ("s", True, ""),
+                "inputDataType1": ("s", True, ""),
+                # type of EltwiseFunction for the operation
+                "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]),
+                # number of input vectors, examples:
+                # [1] is a single vector (like a FC layer with batch=1)
+                # [4] is four vectors (like a FC layer with batch=4)
+                # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+                "numInputVectors": ("ints", False, [1]),
+                "inFIFODepths": ("ints", False, [2, 2]),
+            }
+        )
+        return my_attrs
+
+    def get_eltwise_op_lambda(self):
+        eltwise_op = self.get_nodeattr("eltwiseOp")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        odt = self.get_output_datatype()
+        tin0 = idt0.get_hls_datatype_str()
+        tin1 = idt1.get_hls_datatype_str()
+        tout = odt.get_hls_datatype_str()
+        eltwise_ops = {
+            # "Add": "[](auto a, auto b) { return  a + b; }",
+            # "Sub": "[](auto a, auto b) { return  a - b; }",
+            # "AbsDiff": "[](auto a, auto b) { return  a>b? a-b : b-a; }",
+            "Add": f"add<{tin0}, {tin1}, {tout}>()",
+            "Sub": f"sub<{tin0}, {tin1}, {tout}>()",
+            "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()",
+        }
+        return eltwise_ops[eltwise_op]
+
+    def get_normal_input_shape(self, ind=0):
+        ich = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [ich])
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        ich = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        assert ich % pe == 0, "PE must divide NumChannels"
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [ich // pe, pe])
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        return self.get_normal_input_shape()
+
+    def get_folded_output_shape(self, ind=0):
+        return self.get_folded_input_shape()
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input1 shape."
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1]))
+        assert ishape == exp_ishape, "Unexpected input2 shape."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt0 = model.get_tensor_datatype(node.input[0])
+        if idt0 != self.get_input_datatype(0):
+            warn_str = "inputDataType0 changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype(0)),
+                str(idt0),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType0", idt0.name)
+        idt1 = model.get_tensor_datatype(node.input[1])
+        if idt1 != self.get_input_datatype(1):
+            warn_str = "inputDataType1 changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype(1)),
+                str(idt1),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType1", idt1.name)
+        # enforce output data type (calculated based on idt)
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType0")
+            self.get_nodeattr("inputDataType1")
+            self.get_nodeattr("eltwiseOp")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required StreamingEltwise attributes do not exist."""
+            )
+
+        return info_messages
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType" + str(ind))]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        op = self.get_nodeattr("eltwiseOp")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        assert idt0.signed() == idt1.signed(), (
+            "%s: Inputs must have same signedness" % self.onnx_node.name
+        )
+        idt0_min, idt0_max = idt0.min(), idt0.max()
+        idt1_min, idt1_max = idt1.min(), idt1.max()
+        cands = [
+            idt0_min - idt1_min,
+            idt0_min - idt1_max,
+            idt0_max - idt1_min,
+            idt0_max - idt1_max,
+        ]
+        largest_magnitude = max(map(abs, cands))
+        if op == "Add":
+            if idt0.signed():
+                return DataType.get_smallest_possible(idt0.min() + idt1.min())
+            else:
+                return DataType.get_smallest_possible(idt0.max() + idt1.max())
+        elif op == "Sub":
+            return DataType.get_smallest_possible(-largest_magnitude)
+        elif op == "AbsDiff":
+            return DataType.get_smallest_possible(largest_magnitude)
+        else:
+            raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op))
+
+    def get_instream_width(self, ind=0):
+        """Returns input stream width."""
+        ibits = self.get_input_datatype(ind).bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = pe * ibits
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        """Returns output stream width."""
+        obits = self.get_output_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        out_width = pe * obits
+        return out_width
+
+    def get_number_output_values(self):
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input0 shape doesn't match expected shape ."""
+        export_idt0 = self.get_input_datatype(0)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        # exact same thing for input1
+        inp = context[node.input[1]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input1 shape doesn't match expected shape ."""
+        export_idt1 = self.get_input_datatype(1)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits0 = self.get_instream_width(0)
+            nbits1 = self.get_instream_width(1)
+            rtlsim_inp0 = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0
+            )
+            rtlsim_inp1 = npy_to_rtlsim_input(
+                "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include "eltwise.hpp"',
+            '#include "interpret.hpp"',
+        ]
+
+        self.code_gen_dict["$GLOBALS$"].extend(
+            [
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct absdiff {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return  a>b? a-b : b-a;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct sub {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return  a-b;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct add {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return  a+b;",
+                "}",
+                "};",
+            ]
+        )
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        elem_bits_0 = idt0.bitwidth()
+        elem_bits_1 = idt1.bitwidth()
+        packed_bits_0 = self.get_instream_width(0)
+        packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0
+        packed_bits_1 = self.get_instream_width(1)
+        packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type_0, elem_hls_type_0, elem_bits_0, npy_type, npy_in)
+        )
+        npy_in = "%s/input_1.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in1);'
+            % (packed_hls_type_1, elem_hls_type_1, elem_bits_1, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width(0))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width(1))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        op = self.get_nodeattr("eltwiseOp")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        odt = self.get_output_datatype()
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        out_hls_type = odt.get_hls_datatype_str()
+        slice_in0 = "Slice<%s>" % elem_hls_type_0
+        slice_in1 = "Slice<%s>" % elem_hls_type_1
+        slice_out = "Slice<%s>" % out_hls_type
+        eltwise_op_str = self.get_eltwise_op_lambda()
+        "%sEltwiseFunction<%s, %s, %s>()" % (
+            op,
+            elem_hls_type_0,
+            elem_hls_type_1,
+            out_hls_type,
+        )
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<{}, {}, {}, {}, {}, {}>(in0, in1, out, {});""".format(
+                "StreamingEltwise",
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PE"),
+                self.get_number_output_values(),
+                slice_in0,
+                slice_in1,
+                slice_out,
+                eltwise_op_str,
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1,
+                hls::stream<ap_uint<{}>> &out)""".format(
+                self.onnx_node.name,
+                self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(),
+                self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(),
+                self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        sname = self.hls_sname()
+        swidth = self.get_instream_width_padded()
+        intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index d69ea471ea8ae1d58f97d056936b505cc2a2806b..dfc55d283fa664e3b60fc7c4d5a056f53a119292 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -47,10 +47,6 @@ class FMPadding_Batch(HLSCustomOp):
             # spatial size of input images
             "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
             # total padding (per dimension) to apply
-            # NOTE: Current padding scheme that is applied tries to pad the same
-            # amount of zeros in front and behind the image for each dimension.
-            # As an example, a padding scheme such as [1, x, 3, x] is equal
-            # to [2, x, 2, x]
             "Padding": (
                 "ints",
                 True,
@@ -62,10 +58,6 @@ class FMPadding_Batch(HLSCustomOp):
             "SIMD": ("i", False, 1),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
-            # controls distribution of padded pixels
-            # in case of uneven padding -- see FMPadding fxn
-            # in hlslib
-            "PaddingStyle": ("i", False, 2, {2, 1}),
             # shape describing input vecs per execution
             "numInputVectors": ("i", False, 1),
         }
@@ -90,20 +82,20 @@ class FMPadding_Batch(HLSCustomOp):
         exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
         return int(exp_cycles)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         idim_h, idim_w = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
         ishape = (1, idim_h, idim_w, num_ch)
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         odim_h, odim_w = self.get_padded_odim()
         num_ch = self.get_nodeattr("NumChannels")
 
         oshape = (1, odim_h, odim_w, num_ch)
         return oshape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -112,7 +104,7 @@ class FMPadding_Batch(HLSCustomOp):
         folded_ishape = normal_ishape[:-1] + [fold, simd]
         return tuple(folded_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -144,7 +136,7 @@ class FMPadding_Batch(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         # the hlslib op always pads with zeros, so ensure that the DataType
@@ -152,16 +144,16 @@ class FMPadding_Batch(HLSCustomOp):
         assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return ibits * simd
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return obits * simd
@@ -179,23 +171,21 @@ class FMPadding_Batch(HLSCustomOp):
         pad = self.get_nodeattr("Padding")
         pad_h = pad[0] + pad[2]
         pad_w = pad[1] + pad[3]
-        is_square = idim_h == idim_w
+        is_square_img = idim_h == idim_w
+        is_square_pad = pad_h == pad_w
 
-        if is_square:
-            assert (
-                pad_h == pad_w
-            ), "Only equal padding along the dimensions for square images is supported"
+        if is_square_img and is_square_pad:
             self.code_gen_dict["$DEFINES$"] = [
                 """#define ImgDim1 {}\n#define OutputDim1 {}\n
-                #define Padding1 {}\n#define NumChannels1 {}\n
-                #define SIMD1 {}\n#define PaddingStyle1 {}\n
+                #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n
+                #define NumChannels1 {}\n#define SIMD1 {}\n
                 #define numReps {}\n""".format(
                     idim_h,
                     odim_h,
-                    pad_h,
+                    pad[0],
+                    pad[2],
                     self.get_nodeattr("NumChannels"),
                     self.get_nodeattr("SIMD"),
-                    self.get_nodeattr("PaddingStyle"),
                     self.get_nodeattr("numInputVectors"),
                 )
             ]
@@ -204,20 +194,22 @@ class FMPadding_Batch(HLSCustomOp):
                 """
                 #define OutputDim1_x {}\n
                 #define OutputDim1_y {}\n
-                #define Padding1_x {}\n
-                #define Padding1_y {}\n
+                #define PaddingLeft1 {}\n
+                #define PaddingRight1 {}\n
+                #define PaddingTop1 {}\n
+                #define PaddingBottom1 {}\n
                 #define NumChannels1 {}\n
                 #define SIMD1 {}\n
-                #define PaddingStyle1 {}\n
                 #define numReps {}\n
                 """.format(
                     odim_w,
                     odim_h,
-                    pad_w,
-                    pad_h,
+                    pad[1],
+                    pad[3],
+                    pad[0],
+                    pad[2],
                     self.get_nodeattr("NumChannels"),
                     self.get_nodeattr("SIMD"),
-                    self.get_nodeattr("PaddingStyle"),
                     self.get_nodeattr("numInputVectors"),
                 )
             ]
@@ -254,21 +246,26 @@ class FMPadding_Batch(HLSCustomOp):
         node = self.onnx_node
 
         idim_h, idim_w = self.get_nodeattr("ImgDim")
-        is_square = idim_h == idim_w
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        is_square_img = idim_h == idim_w
+        is_square_pad = pad_h == pad_w
 
-        if is_square:
+        if is_square_img and is_square_pad:
             hls_call = node.op_type
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
-                {}, PaddingStyle1> (in0, out, numReps);""".format(
+                """{}<ImgDim1, OutputDim1, PaddingBefore1, PaddingBehind1, NumChannels1, SIMD1,
+                {}> (in0, out, numReps);""".format(
                     hls_call, in_t
                 )
             ]
         else:
             hls_call = "FMPadding_nonsquare_Batch"
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<OutputDim1_x, OutputDim1_y, Padding1_x, Padding1_y, NumChannels1,
-                SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format(
+                """{}<OutputDim1_x, OutputDim1_y, PaddingLeft1, PaddingRight1,
+                PaddingTop1, PaddingBottom1, NumChannels1,
+                SIMD1, {}> (in0, out, numReps);""".format(
                     hls_call, in_t
                 )
             ]
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index adafa7dcf36111e63fa49e0d184594fff54be99d..e7fa5bc0048b54a32ebc61482b96009fa019809e 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -56,13 +56,13 @@ class GlobalAccPool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [ch])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -71,7 +71,7 @@ class GlobalAccPool_Batch(HLSCustomOp):
         folded_ishape = tuple(vecs + [folds, pe])
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         if len(vecs) == 1:
@@ -80,7 +80,7 @@ class GlobalAccPool_Batch(HLSCustomOp):
             oshape = tuple([vecs[0]] + [1, 1, ch])
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         unfolded_shape = list(self.get_normal_output_shape())
@@ -139,11 +139,11 @@ class GlobalAccPool_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # determine data type from image size and input type
         idt = DataType[self.get_nodeattr("inputDataType")]
@@ -155,14 +155,14 @@ class GlobalAccPool_Batch(HLSCustomOp):
             extreme_value = npixels * idt.max()
         return DataType.get_smallest_possible(extreme_value)
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index b202e95a28a26de3dabc098c2030bafcf840d164..f307be95c30d822dfc517e4c331bd8d82d727997 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -29,8 +29,9 @@
 import numpy as np
 import os
 import subprocess
+import warnings
 from abc import abstractmethod
-from pyverilator.util.axi_utils import rtlsim_multi_io
+from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
@@ -107,10 +108,18 @@ class HLSCustomOp(CustomOp):
             # ID of FPGA device to which this Op is allocated, in
             # a multi-FPGA setting
             "device_id": ("i", False, 0),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 2),
-            "outFIFODepth": ("i", False, 2),
+            # input and output FIFO depths for multi-I/O nodes
+            "inFIFODepths": ("ints", False, [2]),
+            "outFIFODepths": ("ints", False, [2]),
             "output_hook": ("s", False, ""),
+            # accumulated characteristic function over two periods
+            "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)),
+            "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)),
+            # the period for which the characterization was run
+            "io_chrc_period": ("i", False, 0),
+            # amount of zero padding inserted during chrc.
+            "io_chrc_pads_in": ("ints", False, []),
+            "io_chrc_pads_out": ("ints", False, []),
         }
 
     def get_verilog_top_module_name(self):
@@ -138,6 +147,7 @@ class HLSCustomOp(CustomOp):
         intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())]
         intf_names["aximm"] = []
         intf_names["axilite"] = []
+        intf_names["ap_none"] = []
         return intf_names
 
     def get_verilog_top_filename(self):
@@ -687,40 +697,48 @@ compilation transformations?
         HLSCustomOp class but has to be filled by every node."""
         pass
 
-    def get_normal_input_shape(self):
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input stream ind."""
+        raise Exception("get_input_datatype not implemented for this op")
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output stream ind."""
+        raise Exception("get_output_datatype not implemented for this op")
+
+    def get_normal_input_shape(self, ind=0):
         """Returns normal input shape if implemented."""
         raise Exception("get_normal_input_shape not implemented for this op")
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         """Returns folded output shape if implemented."""
         raise Exception("get_normal_output_shape not implemented for this op")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         """Returns folded input shape (according to synapse folding), if implemented."""
         raise Exception("get_folded_input_shape not implemented for this op")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         """Returns folded output shape (according to neuron folding), if implemented."""
         raise Exception("get_folded_output_shape not implemented for this op")
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width, if implemented."""
         raise Exception("get_instream_width not implemented for this op")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
 
-    def get_instream_width_padded(self):
+    def get_instream_width_padded(self, ind=0):
         """Returns input stream width padded to a multiple of 8. This is required
         by the AXI Stream spec."""
-        in_width = self.get_instream_width()
+        in_width = self.get_instream_width(ind=ind)
         return roundup_to_integer_multiple(in_width, 8)
 
-    def get_outstream_width_padded(self):
+    def get_outstream_width_padded(self, ind=0):
         """Returns output stream width padded to a multiple of 8. This is required
         by the AXI Stream spec."""
-        out_width = self.get_outstream_width()
+        out_width = self.get_outstream_width(ind=ind)
         return roundup_to_integer_multiple(out_width, 8)
 
     def get_ap_int_max_w(self):
@@ -733,3 +751,119 @@ compilation transformations?
             "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret
         )
         return ret
+
+    def derive_characteristic_fxns(self, period, override_rtlsim_dict=None):
+        """Return the unconstrained characteristic functions for this node."""
+        # ensure rtlsim is ready
+        assert self.get_nodeattr("rtlsim_so") != "", (
+            "rtlsim not ready for " + self.onnx_node.name
+        )
+        if self.get_nodeattr("io_chrc_period") > 0:
+            warnings.warn(
+                "Skipping node %s: already has FIFO characteristic"
+                % self.onnx_node.name
+            )
+            return
+        exp_cycles = self.get_exp_cycles()
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        n_outs = np.prod(self.get_folded_output_shape()[:-1])
+        if exp_cycles == 0:
+            # try to come up with an optimistic estimate
+            exp_cycles = min(n_inps, n_outs)
+        assert (
+            exp_cycles <= period
+        ), "Period %d too short to characterize %s : expects min %d cycles" % (
+            period,
+            self.onnx_node.name,
+            exp_cycles,
+        )
+        sim = self.get_rtlsim()
+        # signal name
+        sname = "_" + self.hls_sname() + "_"
+        if override_rtlsim_dict is not None:
+            io_dict = override_rtlsim_dict
+        else:
+            io_dict = {
+                "inputs": {
+                    "in0": [0 for i in range(n_inps)],
+                },
+                "outputs": {"out": []},
+            }
+
+        # extra dicts to keep track of cycle-by-cycle transaction behavior
+        # note that we restrict key names to filter out weight streams etc
+        txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key}
+        txns_out = {
+            key: [] for (key, value) in io_dict["outputs"].items() if "out" in key
+        }
+
+        def monitor_txns(sim_obj):
+            for inp in txns_in:
+                in_ready = _read_signal(sim, inp + sname + "TREADY") == 1
+                in_valid = _read_signal(sim, inp + sname + "TVALID") == 1
+                if in_ready and in_valid:
+                    txns_in[inp].append(1)
+                else:
+                    txns_in[inp].append(0)
+            for outp in txns_out:
+                if (
+                    _read_signal(sim, outp + sname + "TREADY") == 1
+                    and _read_signal(sim, outp + sname + "TVALID") == 1
+                ):
+                    txns_out[outp].append(1)
+                else:
+                    txns_out[outp].append(0)
+
+        reset_rtlsim(sim)
+        total_cycle_count = rtlsim_multi_io(
+            sim,
+            io_dict,
+            n_outs,
+            sname=sname,
+            liveness_threshold=period,
+            hook_preclk=monitor_txns,
+        )
+        assert (
+            total_cycle_count <= period
+        ), """Total cycle count from rtl simulation is higher than
+            specified period, please set the period higher than {}""".format(
+            total_cycle_count
+        )
+        self.set_nodeattr("io_chrc_period", period)
+
+        def accumulate_char_fxn(chrc):
+            p = len(chrc)
+            ret = []
+            for t in range(2 * p):
+                if t == 0:
+                    ret.append(chrc[0])
+                else:
+                    ret.append(ret[-1] + chrc[t % p])
+            return np.asarray(ret, dtype=np.int32)
+
+        all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32)
+        all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32)
+        all_pad_in = []
+        all_pad_out = []
+        for in_idx, in_strm_nm in enumerate(txns_in.keys()):
+            txn_in = txns_in[in_strm_nm]
+            if len(txn_in) < period:
+                pad_in = period - len(txn_in)
+                txn_in += [0 for x in range(pad_in)]
+            txn_in = accumulate_char_fxn(txn_in)
+            all_txns_in[in_idx, :] = txn_in
+            all_pad_in.append(pad_in)
+
+        for out_idx, out_strm_nm in enumerate(txns_out.keys()):
+            txn_out = txns_out[out_strm_nm]
+            if len(txn_out) < period:
+                pad_out = period - len(txn_out)
+                txn_out += [0 for x in range(pad_out)]
+            txn_out = accumulate_char_fxn(txn_out)
+            all_txns_out[out_idx, :] = txn_out
+            all_pad_out.append(pad_out)
+
+        self.set_nodeattr("io_chrc_in", all_txns_in)
+        self.set_nodeattr("io_chrc_out", all_txns_out)
+        self.set_nodeattr("io_chrc_pads_in", all_pad_in)
+        self.set_nodeattr("io_chrc_pads_out", all_pad_out)
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 33ee1d359c7b82494e1b5ce1b83aa5d0199f8153..65683079fc6a648de31148e398ea498f38b8d3d9 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -100,16 +100,16 @@ class IODMA(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         vecs = list(self.get_nodeattr("numInputVectors"))
         num_ch = self.get_nodeattr("NumChannels")
         ishape = tuple(vecs + [num_ch])
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         if self.get_nodeattr("direction") == "in":
             raise ValueError("Folded input shape not defined for input IODMA")
         else:
@@ -126,7 +126,7 @@ class IODMA(HLSCustomOp):
             shape.append(elems_per_word)
             return tuple(shape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         if self.get_nodeattr("direction") == "out":
             raise ValueError("Folded output shape not defined for output IODMA")
         else:
@@ -166,15 +166,15 @@ class IODMA(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         if self.get_nodeattr("direction") == "in":
             return self.get_nodeattr("intfWidth")
         elif self.get_nodeattr("direction") == "out":
@@ -182,7 +182,7 @@ class IODMA(HLSCustomOp):
         else:
             raise ValueError("Invalid IODMA direction, please set to in or out")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         if self.get_nodeattr("direction") == "out":
             return self.get_nodeattr("intfWidth")
         elif self.get_nodeattr("direction") == "in":
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 3e27ee01113392174c1206fc10e1c9abe82fdfe7..03f89bd7ecac69a9097f4f35c42bd528be709515 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -70,13 +70,13 @@ class LabelSelect_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         nlabels = self.get_nodeattr("Labels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [nlabels])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         nlabels = self.get_nodeattr("Labels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -85,13 +85,13 @@ class LabelSelect_Batch(HLSCustomOp):
         folded_ishape = tuple(vecs + [folds, pe])
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k = self.get_nodeattr("K")
         vecs = list(self.get_nodeattr("numInputVectors"))
         oshape = tuple(vecs + [k])
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k = self.get_nodeattr("K")
         vecs = list(self.get_nodeattr("numInputVectors"))
         oshape = tuple(vecs + [k, 1])
@@ -152,24 +152,24 @@ class LabelSelect_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         ret = DataType[self.get_nodeattr("outputDataType")]
         return ret
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         return self.get_output_datatype().bitwidth()
 
diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py
index d90fa0f05ab2a92391f610ae1c4516a95a881ce4..fd3e2b5b1cfa74eb4f957df4b568e6c46da47617 100644
--- a/src/finn/custom_op/fpgadataflow/lookup.py
+++ b/src/finn/custom_op/fpgadataflow/lookup.py
@@ -75,21 +75,21 @@ class Lookup(HLSCustomOp):
         exp_cycles = int(n_inputs)
         return exp_cycles
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         return self.get_nodeattr("InputShape")
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         emb_dim = self.get_nodeattr("EmbeddingDim")
         oshape = list(ishape) + [emb_dim]
         return tuple(oshape)
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         folded_ishape = list(ishape) + [1]
         return tuple(folded_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         mem_mode = self.get_nodeattr("mem_mode")
         emb_dim = self.get_nodeattr("EmbeddingDim")
@@ -135,19 +135,19 @@ class Lookup(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         ret = DataType[self.get_nodeattr("InputType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         ret = DataType[self.get_nodeattr("EmbeddingType")]
         return ret
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         return ibits
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         folded_oshape = self.get_folded_output_shape()
         obits = self.get_output_datatype().bitwidth()
         return obits * folded_oshape[-1]
@@ -159,8 +159,8 @@ class Lookup(HLSCustomOp):
     def global_includes(self):
         mem_mode = self.get_nodeattr("mem_mode")
         global_incls = []
+        global_incls.append('#include "lookup.hpp"')
         if mem_mode == "const":
-            global_incls.append('#include "lookup.hpp"')
             global_incls.append('#include "embeddings.hpp"')
         self.code_gen_dict["$GLOBALS$"] = global_incls
 
@@ -258,17 +258,10 @@ class Lookup(HLSCustomOp):
                 InputType, EmbeddingType >(in0, out, embeddings);"""
             ]
         elif mem_mode == "external":
-            hls_impl = """
-    if(!in0.empty()) {
-        ap_uint<T_SRC::width+EmbeddingAlign> const  base =
-            (in0.read(), ap_uint<EmbeddingAlign>(0));
-        for(unsigned  j = 0; j < EmbeddingSize; j++) {
-#pragma HLS PIPELINE II=1
-            out.write(mem[base+j]);
-        }
-    }
-            """
-            self.code_gen_dict["$DOCOMPUTE$"] = [hls_impl]
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """StreamingLookup_ext<EmbeddingSize>(in0, out, mem, size, oob_count,
+                oob_irq);"""
+            ]
 
     def blackboxfunction(self):
         mem_mode = self.get_nodeattr("mem_mode")
@@ -286,7 +279,8 @@ class Lookup(HLSCustomOp):
                 "void "
                 + self.onnx_node.name
                 + "(hls::stream<T_SRC> &in0, hls::stream<T_DST> &out, "
-                + "T_DST const *const  mem)"
+                + "T_DST const *const  mem, unsigned const size, "
+                + "unsigned &oob_count, bool &oob_irq)"
             ]
 
     def pragmas(self):
@@ -305,6 +299,13 @@ class Lookup(HLSCustomOp):
         elif mem_mode == "external":
             my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem")
             my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control")
+            my_pragmas.append(
+                "#pragma HLS INTERFACE s_axilite port=size bundle=control"
+            )
+            my_pragmas.append(
+                "#pragma HLS INTERFACE s_axilite port=oob_count bundle=control"
+            )
+            my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq")
         else:
             raise Exception("Unrecognized mem_mode: " + mem_mode)
         self.code_gen_dict["$PRAGMAS$"] = my_pragmas
@@ -475,4 +476,5 @@ class Lookup(HLSCustomOp):
         if mem_mode == "external":
             intf_names["axilite"] = ["s_axi_control"]
             intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))]
+            intf_names["ap_none"] = ["oob_irq"]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 9d2717dc8c65ddb5329816880067b81b10db2c02..69763fbea8a6079c7b0a61e14da37a3af69dfdfb 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -409,16 +409,16 @@ class MatrixVectorActivation(HLSCustomOp):
         """Returns FINN DataType of weights."""
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         out_width = o_bits * self.get_nodeattr("PE")
         return out_width
@@ -474,7 +474,7 @@ class MatrixVectorActivation(HLSCustomOp):
 
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         mh = self.get_nodeattr("MH")
         pe = self.get_nodeattr("PE")
         nf = mh // pe
@@ -482,13 +482,13 @@ class MatrixVectorActivation(HLSCustomOp):
         folded_output_shape = tuple(vecs + [nf, pe])
         return folded_output_shape
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [mw])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         mh = self.get_nodeattr("MH")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_output_shape = tuple(vecs + [mh])
@@ -1227,8 +1227,11 @@ class MatrixVectorActivation(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
         )
-        in_fifo_depth = self.get_nodeattr("inFIFODepth")
-        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # TODO can we deprecate this entirely? this looks like legacy code
+        # that does not really serve a purpose - FIFO sizes are not typically
+        # allocated at this point; at best they are set to 2 as the default
+        in_fifo_depth = 2
+        out_fifo_depth = 2
         # insert depth pragmas only if specified
         if in_fifo_depth != 0:
             self.code_gen_dict["$PRAGMAS$"].append(
@@ -1462,3 +1465,20 @@ class MatrixVectorActivation(HLSCustomOp):
             thres_count = out_features
             ret_dict[thres_param_type] = thres_count
         return ret_dict
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index 3bf187fa9a78ed2c812f042a29079ee1e3163d74..91cd537baeff0c7666bbf3596b46a7412ec2fe4e 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -74,11 +74,11 @@ class Pool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("InputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         fxn = self.get_nodeattr("Function")
         odt = DataType[self.get_nodeattr("OutputDataType")]
@@ -98,7 +98,7 @@ class Pool_Batch(HLSCustomOp):
 
         return odt
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_ch = self.get_nodeattr("Channels")
         odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
@@ -107,7 +107,7 @@ class Pool_Batch(HLSCustomOp):
         ishape = (batch_size, *odims, k_prod * ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         ifm_ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
@@ -116,14 +116,14 @@ class Pool_Batch(HLSCustomOp):
         folded_ishape = normal_ishape[:-1] + [fold, pe]
         return tuple(folded_ishape)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ofm_ch = self.get_nodeattr("Channels")
         odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
         oshape = (batch_size, *odims, ofm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         ifm_ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
@@ -147,13 +147,13 @@ class Pool_Batch(HLSCustomOp):
         exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size
         return int(exp_cycles)
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = int(dt_bits * pe)
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         dt_bits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         out_width = int(dt_bits * pe)
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 1e6b72e4d54ede639e797f32f51fb7705ec8ce4b..a3aa9d570d0efcbe82090d19a151d4f5b12078b6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -60,19 +60,19 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ishape = self.get_nodeattr("shape")
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         oshape = self.get_nodeattr("shape")
         return oshape
 
@@ -97,7 +97,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
                 Please adjust PE and SIMD values so that OutWidth % InWidth = 0
                 or alternatively use impl_style = vivado"""
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         self.check_divisible_iowidths()
         iwidth = self.get_nodeattr("inWidth")
         ishape = self.get_normal_input_shape()
@@ -117,7 +117,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         dummy_t = dummy_t.reshape(new_shape)
         return dummy_t.shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         self.check_divisible_iowidths()
         owidth = self.get_nodeattr("outWidth")
         oshape = self.get_normal_output_shape()
@@ -142,11 +142,11 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         in_width = self.get_nodeattr("inWidth")
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         out_width = self.get_nodeattr("outWidth")
         return out_width
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index a7c3cd0be59db4ba8665f8fba5be72282339b8c8..40d016de43820a37e8c7894a3e1f30146c667e59 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -68,11 +68,29 @@ class StreamingFIFO(HLSCustomOp):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
+            # whether depth monitoring is enabled (impl_style=rtl only)
+            "depth_monitor": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
 
         return my_attrs
 
+    def get_adjusted_depth(self):
+        impl = self.get_nodeattr("impl_style")
+        depth = self.get_nodeattr("depth")
+        if impl == "vivado":
+            old_depth = depth
+            # round up depth to nearest power-of-2
+            # Vivado FIFO impl may fail otherwise
+            depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth
+            if old_depth != depth:
+                warnings.warn(
+                    "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado"
+                    % (self.onnx_node.name, old_depth, depth)
+                )
+
+        return depth
+
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
         oshape = self.get_normal_output_shape()
@@ -97,6 +115,14 @@ class StreamingFIFO(HLSCustomOp):
     def verify_node(self):
         pass
 
+    def get_verilog_top_module_intf_names(self):
+        ret = super().get_verilog_top_module_intf_names()
+        is_rtl = self.get_nodeattr("impl_style") == "rtl"
+        is_depth_monitor = self.get_nodeattr("depth_monitor") == 1
+        if is_rtl and is_depth_monitor:
+            ret["ap_none"] = ["maxcount"]
+        return ret
+
     def get_verilog_top_module_name(self):
         "Return the Verilog top module name for this node."
 
@@ -180,10 +206,8 @@ class StreamingFIFO(HLSCustomOp):
         self.set_nodeattr("ip_vlnv", vlnv)
         self.code_gen_dict.clear()
 
-    def get_normal_input_shape(self):
-        depth = self.get_nodeattr("depth")
-        # depth has to be between 2 and 256 with the current
-        # StreamingFIFO implementation
+    def get_normal_input_shape(self, ind=0):
+        depth = self.get_adjusted_depth()
         assert depth >= 2, """Depth is too low"""
         if depth > 256 and self.get_nodeattr("impl_style") == "rtl":
             warnings.warn(
@@ -211,22 +235,22 @@ class StreamingFIFO(HLSCustomOp):
 
         return normal_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
@@ -328,7 +352,7 @@ class StreamingFIFO(HLSCustomOp):
         elif impl_style == "vivado":
             cmd = []
             node_name = self.onnx_node.name
-            depth = self.get_nodeattr("depth")
+            depth = self.get_adjusted_depth()
             ram_style = self.get_nodeattr("ram_style")
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
@@ -393,7 +417,7 @@ class StreamingFIFO(HLSCustomOp):
         """Calculates resource estimation for BRAM"""
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         if impl == "rtl" or (impl == "vivado" and ram_type != "block"):
@@ -418,7 +442,7 @@ class StreamingFIFO(HLSCustomOp):
 
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         if impl == "rtl" or (impl == "vivado" and ram_type != "ultra"):
@@ -428,7 +452,7 @@ class StreamingFIFO(HLSCustomOp):
             return (math.ceil(depth / 4096)) * (math.ceil(W / 72))
 
     def bram_efficiency_estimation(self):
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
         bram16_est = self.bram_estimation()
         if bram16_est == 0:
@@ -441,7 +465,7 @@ class StreamingFIFO(HLSCustomOp):
         """Calculates resource estimations for LUTs"""
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         address_luts = 2 * math.ceil(math.log(depth, 2))
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 882b40a0aaf542e6dcaf427ca3567ae78394ede5..a0e60931edd8590aaebc0560c4bd28d61d62e8ea 100755
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -57,11 +57,11 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("dataType")]
 
@@ -82,13 +82,13 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
         return (ifm_dim[0] == 1) and (k[0] == 1)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
@@ -99,7 +99,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         k_h, k_w = tuple(self.get_nodeattr("PoolDim"))
         ifm_ch = self.get_nodeattr("NumChannels")
@@ -116,7 +116,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # even though there is no folding in the current hlslib op,
         # insert a time multiplexing axis to remain compatible with the
         # shapes produced by the rest of the dataflow pipeline
@@ -155,7 +155,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             # TODO: adjust inaccurate formula
             return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1])))
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         ifm_ch = self.get_nodeattr("NumChannels")
@@ -165,7 +165,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             in_width = int(dt_bits * ifm_ch)
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """For streaming maxpool out stream width is the same as in stream width"""
         return self.get_instream_width()
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -319,6 +319,7 @@ module $TOPNAME$(
 ap_clk,
 ap_rst_n,
 count,
+maxcount,
 in0_$HLS_SNAME$_TDATA,
 in0_$HLS_SNAME$_TVALID,
 in0_$HLS_SNAME$_TREADY,
@@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY
 input   ap_clk;
 input   ap_rst_n;
 output $COUNT_RANGE$ count;
+output $COUNT_RANGE$ maxcount;
 input  $IN_RANGE$ in0_$HLS_SNAME$_TDATA;
 input   in0_$HLS_SNAME$_TVALID;
 output   in0_$HLS_SNAME$_TREADY;
@@ -346,6 +348,7 @@ $LAYER_NAME$
  .clock(ap_clk),
  .reset(!ap_rst_n),
  .count(count),
+ .maxcount(maxcount),
  .i_d(in0_$HLS_SNAME$_TDATA),
  .i_v(in0_$HLS_SNAME$_TVALID),
  .i_r(in0_$HLS_SNAME$_TREADY),
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 5383cc1f4bdf9eb88c7d7bd69c25231282f11c6f..f2cc64668d62ef15446772309577e9b15a378ef5 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -75,9 +75,6 @@ class Thresholding_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -185,11 +182,11 @@ class Thresholding_Batch(HLSCustomOp):
         # total cost
         return comparator_cost + lutram_cost
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
@@ -221,11 +218,11 @@ class Thresholding_Batch(HLSCustomOp):
         self.set_nodeattr("weightDataType", tdt.name)
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
@@ -251,7 +248,7 @@ class Thresholding_Batch(HLSCustomOp):
         weightstream = self.get_weightstream_width()
         return max([weightstream, temp_value])
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         fold = ich // pe
@@ -259,17 +256,17 @@ class Thresholding_Batch(HLSCustomOp):
         folded_input_shape = tuple(vecs + [fold, pe])
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # same shape as input
         return self.get_folded_input_shape()
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [ich])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
 
@@ -960,3 +957,20 @@ class Thresholding_Batch(HLSCustomOp):
         "Return a list of extra tcl directives for HLS synthesis."
 
         return ["config_compile -pipeline_style frp"]
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_tmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 7386aa7e6311754b653e94f8d2e9b2a910a1370b..1bd32442a1986d6a86571e85a09322d6c15d8a78 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -218,21 +218,21 @@ class TLastMarker(HLSCustomOp):
     def get_number_output_values(self):
         return self.get_nodeattr("NumIters")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         elem_width = self.get_nodeattr("ElemWidth")
         n_packed_elems = stream_width // elem_width
         n_iters = self.get_nodeattr("NumIters")
         return (1, n_iters, n_packed_elems)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         return stream_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         return stream_width
 
diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py
index eb51fe39fc6e7ec84204f9d541a0e47c333bbf43..a018fd35aac4d63b365e97464dab0fd4a5fa13f2 100644
--- a/src/finn/custom_op/fpgadataflow/upsampler.py
+++ b/src/finn/custom_op/fpgadataflow/upsampler.py
@@ -73,7 +73,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
         exp_cycles = OFMDim * reps
         return int(exp_cycles)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         IFMDim = self.get_nodeattr("IFMDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
@@ -84,7 +84,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
             ishape = (batch, IFMDim, 1, num_ch)
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         OFMDim = self.get_nodeattr("OFMDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
@@ -95,11 +95,11 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
             oshape = (batch, OFMDim, 1, num_ch)
         return oshape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         return tuple(normal_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         return tuple(normal_oshape)
 
@@ -129,21 +129,21 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         return ibits * ifm_ch
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         return obits * ifm_ch
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index bc332b59449a6627a309e3e0b7fb18024b2af14f..16a51a3c909c76497bd8b60c372c589b441a1f01 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -29,6 +29,7 @@
 import math
 import numpy as np
 import os
+import textwrap
 import warnings
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import (
@@ -41,6 +42,7 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
 
@@ -68,6 +70,36 @@ class VectorVectorActivation(HLSCustomOp):
             "accDataType": ("s", False, "INT32"),
             # no-activation mode (produce accumulators)
             "noActivation": ("i", False, 0, {0, 1}),
+            # memory mode for the layer weights
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
+            # (mem_mode = decoupled only) whether weights will be writable through
+            # an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # use xnor-popcount for binary weights/inputs, thus treating them
+            # as bipolar
+            "binaryXnorMode": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -178,7 +210,7 @@ class VectorVectorActivation(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
@@ -186,11 +218,11 @@ class VectorVectorActivation(HLSCustomOp):
         """Returns FINN DataType of weights."""
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         if simd > 1:
@@ -200,12 +232,12 @@ class VectorVectorActivation(HLSCustomOp):
         in_width = i_bits * simd * pe
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         out_width = o_bits * self.get_nodeattr("PE")
         return out_width
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("Kernel")
         dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
@@ -216,10 +248,19 @@ class VectorVectorActivation(HLSCustomOp):
             pe = self.get_nodeattr("PE")
         sf = k_h * k_w // simd
         nf = ch // pe
-        folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe])
+
+        if ind == 0:
+            # calculate shape of input 0
+            folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe])
+        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # calculate shape of input 1 (weights)
+            folded_input_shape = tuple([1, sf * nf, pe])
+        else:
+            raise Exception("Undefined input shape for requested input")
+
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
@@ -227,14 +268,14 @@ class VectorVectorActivation(HLSCustomOp):
         folded_output_shape = tuple([1, dim_h, dim_w, nf, pe])
         return folded_output_shape
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
         k_h, k_w = self.get_nodeattr("Kernel")
         normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ch = self.get_nodeattr("Channels")
         dim_h, dim_w = self.get_nodeattr("Dim")
         normal_output_shape = tuple([1, dim_h, dim_w, ch])
@@ -263,13 +304,31 @@ class VectorVectorActivation(HLSCustomOp):
         ret = dict()
         inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
         out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
         inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
         wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
         # fill in TSrcI and TWeightI
-        # TODO handle bipolar inputs
-        if inp_is_bipolar or wt_is_bipolar:
-            raise Exception("VVAU node doesn't support bipolar values yet.")
-        else:
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Recast<Binary>"
+        elif inp_is_bipolar and (not wt_is_bipolar):
+            ret["TSrcI"] = "Recast<Binary>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and (not wt_is_bipolar):
             ret["TSrcI"] = "Slice<%s>" % inp_hls_str
             ret["TWeightI"] = "Identity"
 
@@ -299,6 +358,13 @@ class VectorVectorActivation(HLSCustomOp):
         return ret
 
     def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0
+        * for bipolar weights&inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         tmem = self.calc_tmem()
@@ -308,14 +374,33 @@ class VectorVectorActivation(HLSCustomOp):
         ), """Threshold matrix dimension is
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        if inp_is_bipolar and wt_is_bipolar:
+            # ensure all thresholds are nonnegative
+            assert (orig_thres_matrix >= 0).all()
+            # ensure all thresholds are integer
+            assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
         # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0:
+        if ret[0][0] == 0 and n_thres_steps == 1:
             ret = np.copy(ret)
             ret[0][0] = 1
             warnings.warn(
                 "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
             )
+        # ensure channels = mh , duplicating if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (ch, 1))
+        assert (
+            ret.shape[0] == ch
+        ), "Channels of threshold matrix are not as expected (ch)"
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         assert (
@@ -332,44 +417,175 @@ class VectorVectorActivation(HLSCustomOp):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
-    def generate_params(self, model, path):
-        # weights
-        weights = model.get_initializer(self.onnx_node.input[1])
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
+          decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+        """
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
-        wdt = self.get_weight_datatype()
-        code_gen_dir = path
-
-        """Saves weights into params.h"""
-        weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True)
-        # write weights into params.h
-        f_weights = open("{}/params.h".format(code_gen_dir), "w")
-
-        if wdt.bitwidth() != 1:
-            f_weights.write(
-                "const FixedPointWeights<{},{},{},{}> weights = ".format(
-                    self.get_nodeattr("SIMD"),
-                    wdt.get_hls_datatype_str(),
-                    self.get_nodeattr("PE"),
-                    self.calc_wmem(),
+        export_wdt = self.get_weight_datatype()
+        # we have converted bipolar weights to binary for export,
+        # so use it as such for weight generation
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            export_wdt = DataType["BINARY"]
+        if weight_file_mode == "hls_header":
+            weight_hls_code = numpy_to_hls_code(
+                weight_tensor, export_wdt, "weights", True, True
+            )
+            # write weights into C++ header file as dictated by finn-hlslib
+            f_weights = open(weight_file_name, "w")
+            if export_wdt.bitwidth() != 1:
+                f_weights.write(
+                    "const FixedPointWeights<{},{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        export_wdt.get_hls_datatype_str(),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
                 )
+            else:
+                f_weights.write(
+                    "const BinaryWeights<{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
+                )
+            f_weights.write(weight_hls_code)
+            f_weights.close()
+        elif "decoupled" in weight_file_mode:
+            # create a weight stream for various flavors of decoupled mode:
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = 1
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
             )
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
+            # flipped
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                # add zeroes to pad out file to 1024 entries
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown weight_file_mode")
+
         else:
-            f_weights.write(
-                "const BinaryWeights<{},{},{}> weights = ".format(
-                    self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), self.calc_wmem()
+            raise Exception("Unknown weight_file_mode")
+
+    def generate_params(self, model, path):
+        mem_mode = self.get_nodeattr("mem_mode")
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode == "const":
+            # save hlslib-compatible weights in params.h
+            weight_filename = "{}/params.h".format(code_gen_dir)
+            self.make_weight_file(weights, "hls_header", weight_filename)
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
+            # save decoupled weights for cppsim
+            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                # note that we provide two different .dat files, one for synth
+                # and one for synthesis. this is because URAM-based weights always
+                # need zero weights for synthesis, otherwise they get inferred
+                # as BRAM
+                weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(
+                    code_gen_dir
                 )
+                weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
+                # sim weights are always the true weights
+                self.make_weight_file(
+                    weights, "decoupled_verilog_dat", weight_filename_rtl_sim
+                )
+                ram_style = self.get_nodeattr("ram_style")
+                if ram_style == "ultra":
+                    # UltraRAM must have no memory initializer, or only zeroes
+                    # otherwise BRAM will be inferred instead of URAM
+                    # as a workaround we provide a zero-weight init here
+                    synth_weights = np.zeros_like(weights, dtype=np.float32)
+                else:
+                    synth_weights = weights
+                self.make_weight_file(
+                    synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
+                )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
-        f_weights.write(weight_hls_code)
-        f_weights.close()
 
         # save thresholds in thresh.h
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
             if thresholds is not None:
                 threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                # use UINT32 threshold export for bipolar times bipolar
+                inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+                wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+                # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
+                inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+                wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+                bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+                inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+                wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
                 # get computed threshold datatype from attribute
                 tdt = DataType[self.get_nodeattr("accDataType")]
+
                 assert np.vectorize(tdt.allowed)(
                     threshold_tensor
                 ).all(), "Thresholds in %s can't be expressed with type %s" % (
@@ -382,8 +598,11 @@ class VectorVectorActivation(HLSCustomOp):
                 # write thresholds into thresh.h
                 f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
                 tdt_hls = tdt.get_hls_datatype_str()
-                odt = self.get_output_datatype()
-                odt_hls = odt.get_hls_datatype_str()
+                # use binary to export bipolar activations
+                export_odt = self.get_output_datatype()
+                if self.get_output_datatype() == DataType["BIPOLAR"]:
+                    export_odt = DataType["BINARY"]
+                odt_hls = export_odt.get_hls_datatype_str()
                 f_thresh.write(
                     "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
                     = ".format(
@@ -401,6 +620,7 @@ class VectorVectorActivation(HLSCustomOp):
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
         # TODO ensure codegen dir exists
@@ -454,7 +674,28 @@ class VectorVectorActivation(HLSCustomOp):
             inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits)
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+
+            if mem_mode == "external" or mem_mode == "decoupled":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType["BIPOLAR"]:
+                    export_wdt = DataType["BINARY"]
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                dim_h, dim_w = self.get_nodeattr("Dim")
+                num_w_reps = dim_h * dim_w
+
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
@@ -480,6 +721,12 @@ class VectorVectorActivation(HLSCustomOp):
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
         self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode not in ["const", "decoupled", "external"]:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
         if self.calc_tmem() != 0:
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
@@ -488,6 +735,8 @@ class VectorVectorActivation(HLSCustomOp):
         numReps = 1 * dim_h * dim_w
         k_h, k_w = self.get_nodeattr("Kernel")
         innerProdDim = k_h * k_w
+        mem_mode = self.get_nodeattr("mem_mode")
+
         self.code_gen_dict["$DEFINES$"] = [
             """#define Channels1 {}\n #define InnerProdDim {}\n
             #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format(
@@ -498,6 +747,11 @@ class VectorVectorActivation(HLSCustomOp):
                 numReps,
             )
         ]
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define WP1 {}\n".format(wdt.bitwidth())
+            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -515,7 +769,23 @@ class VectorVectorActivation(HLSCustomOp):
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
 
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            elem_bits = wdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = wdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/weights.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            )
+
     def strm_decl(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
@@ -523,8 +793,15 @@ class VectorVectorActivation(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
+        if mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights ("weights");'.format(
+                    self.get_weightstream_width()
+                )
+            )
 
     def docompute(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         map_to_hls_mult_style = {
             "auto": "ap_resource_dflt()",
             "lut": "ap_resource_lut()",
@@ -536,16 +813,42 @@ class VectorVectorActivation(HLSCustomOp):
             threshs = "PassThroughActivation<%s>()" % odtype_hls_str
         else:
             threshs = "threshs"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
-            (in0, out, weights, {}, numReps, {});""".format(
-                tmpl_args["TSrcI"],
-                tmpl_args["TDstI"],
-                tmpl_args["TWeightI"],
-                threshs,
-                map_to_hls_mult_style[self.get_nodeattr("resType")],
+
+        if mem_mode == "const":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
+                (in0, out, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            if wdt == DataType["BIPOLAR"]:
+                export_wdt = DataType["BINARY"]
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}>
+                (in0, out, weights, {}, numReps, {});""".format(
+                    "Vector_Vector_Activate_Stream_Batch",
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
-        ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -576,25 +879,49 @@ class VectorVectorActivation(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-            hls::stream<ap_uint<{}>> &out
-            )""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out
+                )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(
+                    hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &weights,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_weightstream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                    parameter value is supported!"""
             )
-        ]
 
     def pragmas(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         self.code_gen_dict["$PRAGMAS$"] = [
             "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
         ]
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
         )
-        in_fifo_depth = self.get_nodeattr("inFIFODepth")
-        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # TODO can we deprecate this entirely? this looks like legacy code
+        # that does not really serve a purpose - FIFO sizes are not typically
+        # allocated at this point; at best they are set to 2 as the default
+        in_fifo_depth = 2
+        out_fifo_depth = 2
         # insert depth pragmas only if specified
         if in_fifo_depth != 0:
             self.code_gen_dict["$PRAGMAS$"].append(
@@ -608,12 +935,30 @@ class VectorVectorActivation(HLSCustomOp):
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
 
-        self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
-        # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
-        # partition for parallel access along the PE dimension (dim 1)
-        self.code_gen_dict["$PRAGMAS$"].append(
-            ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
-        )
+        if mem_mode == "const":
+            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+            # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
+            # partition for parallel access along the PE dimension (dim 1)
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=weights.m_weights "
+                    "complete dim=1"
+                )
+            )
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=weights name=weights_"
+                + self.hls_sname()
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=8 variable=weights"
+            )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or external,
+                currently no other parameter value is supported!"""
+            )
+
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
             self.code_gen_dict["$PRAGMAS$"].append(
@@ -629,6 +974,157 @@ class VectorVectorActivation(HLSCustomOp):
                 )
             )
 
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        sname = self.hls_sname()
+        if mem_mode == "external":
+            intf_names["s_axis"].append(
+                ("weights_" + sname, self.get_weightstream_width_padded())
+            )
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if runtime_writable:
+                intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def code_generation_ipi(self):
+        cmd = []
+        # add streamer if needed
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if self.get_nodeattr("ram_style") == "ultra":
+                assert (
+                    runtime_writable == 1
+                ), "Layer with URAM weights must have runtime_writeable_weights=1"
+            node_name = self.onnx_node.name
+            sname = self.hls_sname()
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+            )
+            # instantiate a streamer and connect it to the HLS IP
+            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_inst = node_name + "_wstrm"
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
+            )
+            cmd.append(
+                "set_property -dict [list "
+                "CONFIG.NSTREAMS {1} "
+                "CONFIG.MEM_DEPTH {%d} "
+                "CONFIG.MEM_WIDTH {%d} "
+                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.RAM_STYLE {%s} "
+                "CONFIG.STRM0_DEPTH {%d} "
+                "CONFIG.STRM0_WIDTH {%d} "
+                "CONFIG.STRM0_OFFSET {0} "
+                "] [get_bd_cells /%s/%s]"
+                % (
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("ram_style"),
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    node_name,
+                    strm_inst,
+                )
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+                "[get_bd_intf_pins %s/%s/weights_%s]"
+                % (node_name, strm_inst, node_name, node_name, sname)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                % (node_name, rst_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                % (node_name, clk_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, rst_name, node_name, node_name, rst_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk_name, node_name, node_name, clk_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, din_name, node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, dout_name, node_name, node_name, dout_name)
+            )
+            if runtime_writable:
+                # expose axi lite interface for writeable weights
+                axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
+                cmd.append(
+                    "create_bd_intf_pin -mode Slave "
+                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                    % (node_name, axilite_name)
+                )
+                cmd.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                    "[get_bd_intf_pins %s/%s/%s]"
+                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
+                )
+                # TODO calculate and pass in segment size here
+                cmd.append("assign_bd_address")
+            cmd.append("save_bd_design")
+        elif mem_mode == "const" or mem_mode == "external":
+            # base class impl sufficient for const/external modes
+            return super().code_generation_ipi()
+        else:
+            raise Exception("Unrecognized mem_mode for VectorVectorActivation")
+        return cmd
+
+    def uram_estimation(self):
+        P = self.get_nodeattr("PE")
+        Q = 1
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        mem_width = Q * W * P
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (
+            (mmode == "decoupled" and mstyle != "ultra")
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
+        ):
+            return 0
+        width_multiplier = math.ceil(mem_width / 72)
+        depth_multiplier = math.ceil(omega / 4096)
+        return width_multiplier * depth_multiplier
+
     def bram_estimation(self):
         """Calculates resource estimation for BRAM"""
         # TODO add in/out FIFO contributions
@@ -639,7 +1135,13 @@ class VectorVectorActivation(HLSCustomOp):
         # assuming SDP mode RAMB18s (see UG573 Table 1-10)
         # since this is HLS memory, not using the full width of a BRAM
         # assuming memories up to 128 deep get implemented in LUTs
-        if self.calc_wmem() <= 128:
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (
+            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
+        ):
             return 0
 
         if W == 1:
@@ -686,8 +1188,12 @@ class VectorVectorActivation(HLSCustomOp):
         c0 = 300
         c1 = 1.1
         c2 = 0
-        if self.calc_wmem() <= 128:
-            c2 = P * W * math.ceil(self.calc_wmem() / 64)
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle == "distributed") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            c2 = (P * W) * math.ceil(self.calc_wmem() / 64)
 
         # multiplication
         res_type = self.get_nodeattr("resType")
@@ -725,6 +1231,25 @@ class VectorVectorActivation(HLSCustomOp):
             mult_dsp = 0
         return int(mult_dsp)
 
+    def get_weightstream_width(self):
+        """Returns weight stream width. Used only in decoupled mode."""
+        if (
+            self.get_nodeattr("mem_mode") == "decoupled"
+            or self.get_nodeattr("mem_mode") == "external"
+        ):
+            pe = self.get_nodeattr("PE")
+            wp = self.get_weight_datatype().bitwidth()
+            w_width = pe * wp
+            return w_width
+        else:
+            return 0
+
+    def get_weightstream_width_padded(self):
+        """Returns weight stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec. Used in decoupled mode."""
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
+
     def get_op_and_param_counts(self):
         k_h, k_w = self.get_nodeattr("Kernel")
         fm = self.get_nodeattr("Channels")
@@ -748,3 +1273,20 @@ class VectorVectorActivation(HLSCustomOp):
             thres_count = fm
             ret_dict[thres_param_type] = thres_count
         return ret_dict
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/qnn-data/build_dataflow/expected_output.npy b/src/finn/qnn-data/build_dataflow/expected_output.npy
index a8d09384633791b7e3760dc8a2d1ba88a05d526d..98037351bb4ee49985a98631750f18e9b86965b1 100644
Binary files a/src/finn/qnn-data/build_dataflow/expected_output.npy and b/src/finn/qnn-data/build_dataflow/expected_output.npy differ
diff --git a/src/finn/qnn-data/build_dataflow/input.npy b/src/finn/qnn-data/build_dataflow/input.npy
index edd24de05a33a15ebc330cdab31f3d77d2c47196..8bece67b7daf5b7668ff5e7515f15a891146b00b 100644
Binary files a/src/finn/qnn-data/build_dataflow/input.npy and b/src/finn/qnn-data/build_dataflow/input.npy differ
diff --git a/src/finn/qnn-data/testcase/residual_testcase.onnx b/src/finn/qnn-data/testcase/residual_testcase.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..c96e8c694e3a39cdb9e5d984e1c069ceb55b3f2a
Binary files /dev/null and b/src/finn/qnn-data/testcase/residual_testcase.onnx differ
diff --git a/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh
new file mode 100644
index 0000000000000000000000000000000000000000..1c8b6403e8628e3647810ca5fca65ca1122eaf9d
--- /dev/null
+++ b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh
@@ -0,0 +1,346 @@
+//  (c) Copyright 2011-2013 Xilinx, Inc. All rights reserved.
+//
+//  This file contains confidential and proprietary information
+//  of Xilinx, Inc. and is protected under U.S. and
+//  international copyright and other intellectual property
+//  laws.
+//
+//  DISCLAIMER
+//  This disclaimer is not a license and does not grant any
+//  rights to the materials distributed herewith. Except as
+//  otherwise provided in a valid license issued to you by
+//  Xilinx, and to the maximum extent permitted by applicable
+//  law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+//  WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+//  AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+//  BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+//  INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+//  (2) Xilinx shall not be liable (whether in contract or tort,
+//  including negligence, or under any other theory of
+//  liability) for any loss or damage of any kind or nature
+//  related to, arising under or in connection with these
+//  materials, including for any direct, or any indirect,
+//  special, incidental, or consequential loss or damage
+//  (including loss of data, profits, goodwill, or any type of
+//  loss or damage suffered as a result of any action brought
+//  by a third party) even if such damage or loss was
+//  reasonably foreseeable or Xilinx had been advised of the
+//  possibility of the same.
+//
+//  CRITICAL APPLICATIONS
+//  Xilinx products are not designed or intended to be fail-
+//  safe, or for use in any application requiring fail-safe
+//  performance, such as life-support or safety devices or
+//  systems, Class III medical devices, nuclear facilities,
+//  applications related to the deployment of airbags, or any
+//  other applications that could lead to death, personal
+//  injury, or severe property or environmental damage
+//  (individually and collectively, "Critical
+//  Applications"). Customer assumes the sole risk and
+//  liability of any use of Xilinx products in Critical
+//  Applications, subject only to applicable laws and
+//  regulations governing limitations on product liability.
+//
+//  THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
+//  PART OF THIS FILE AT ALL TIMES.
+//-----------------------------------------------------------------------------
+//
+// Generic Functions used by AXIS-Interconnect and Infrastrucutre Modules
+//
+// Verilog-standard:  Verilog 2001
+//--------------------------------------------------------------------------
+// Global Parameters:
+//
+// Functions:
+//   f_clogb2
+//   f_gcd
+//   f_lcm
+//   f_get_tdata_indx
+//   f_get_tstrb_indx
+//   f_get_tkeep_indx
+//   f_get_tlast_indx
+//   f_get_tid_indx
+//   f_get_tdest_indx
+//   f_get_tuser_indx
+//   f_payload_width
+// Tasks:
+//   t_display_tdata_error
+//--------------------------------------------------------------------------
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Global Parameters
+///////////////////////////////////////////////////////////////////////////////
+// Define Signal Set indices
+localparam G_INDX_SS_TREADY = 0;
+localparam G_INDX_SS_TDATA  = 1;
+localparam G_INDX_SS_TSTRB  = 2;
+localparam G_INDX_SS_TKEEP  = 3;
+localparam G_INDX_SS_TLAST  = 4;
+localparam G_INDX_SS_TID    = 5;
+localparam G_INDX_SS_TDEST  = 6;
+localparam G_INDX_SS_TUSER  = 7;
+localparam G_MASK_SS_TREADY = 32'h1 << G_INDX_SS_TREADY;
+localparam G_MASK_SS_TDATA  = 32'h1 << G_INDX_SS_TDATA;
+localparam G_MASK_SS_TSTRB  = 32'h1 << G_INDX_SS_TSTRB;
+localparam G_MASK_SS_TKEEP  = 32'h1 << G_INDX_SS_TKEEP;
+localparam G_MASK_SS_TLAST  = 32'h1 << G_INDX_SS_TLAST;
+localparam G_MASK_SS_TID    = 32'h1 << G_INDX_SS_TID  ;
+localparam G_MASK_SS_TDEST  = 32'h1 << G_INDX_SS_TDEST;
+localparam G_MASK_SS_TUSER  = 32'h1 << G_INDX_SS_TUSER;
+
+// Task DRC error levels
+localparam G_TASK_SEVERITY_ERR   = 2;
+localparam G_TASK_SEVERITY_WARNING = 1;
+localparam G_TASK_SEVERITY_INFO    = 0;
+
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Functions
+///////////////////////////////////////////////////////////////////////////////
+// ceiling logb2
+  function integer f_clogb2 (input integer size);
+    integer s;
+    begin
+      s = size;
+      s = s - 1;
+      for (f_clogb2=1; s>1; f_clogb2=f_clogb2+1)
+            s = s >> 1;
+    end
+  endfunction // clogb2
+
+  // Calculates the Greatest Common Divisor between two integers using the
+  // euclidean algorithm.
+  function automatic integer f_gcd (
+    input integer a,
+    input integer b
+    );
+    begin : main
+      integer A, B, done, swap;
+      A = a;
+      B = b;
+      done = 0;
+      while(!done)
+      begin
+        if (A < B ) begin
+          swap = A;
+          A = B;
+          B = swap;
+        end else if ( B != 0 ) begin
+          A = A - B;
+        end else begin
+          done = 1;
+        end
+      end
+
+      f_gcd = A;
+    end
+  endfunction
+
+
+  // Calculates the Lowest Common Denominator between two integers
+  function integer f_lcm (
+    input integer a,
+    input integer b
+    );
+    begin : main
+      f_lcm = ( a / f_gcd(a, b)) * b;
+    end
+  endfunction
+
+  // Returns back the index to the TDATA portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tdata_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      f_get_tdata_indx = 0;
+    end
+  endfunction
+
+  // Returns back the index to the tstrb portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tstrb_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tdata_indx(DAW, IDW, DEW, USW, SST);
+      // If TDATA exists, then add its width to its base to get the tstrb index
+      f_get_tstrb_indx = SST[G_INDX_SS_TDATA] ? cur_indx + DAW : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tkeep portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tkeep_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tstrb_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tkeep_indx = SST[G_INDX_SS_TSTRB] ? cur_indx + DAW/8 : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tlast portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tlast_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tkeep_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tlast_indx = SST[G_INDX_SS_TKEEP] ? cur_indx + DAW/8 : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tid portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tid_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tlast_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tid_indx = SST[G_INDX_SS_TLAST] ? cur_indx + 1 : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tdest portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tdest_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tid_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tdest_indx = SST[G_INDX_SS_TID] ? cur_indx + IDW : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tuser portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tuser_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tdest_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tuser_indx = SST[G_INDX_SS_TDEST] ? cur_indx + DEW : cur_indx;
+    end
+  endfunction
+
+  // Payload is the sum of all the AXIS signals present except for
+  // TREADY/TVALID
+  function integer f_payload_width (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tuser_indx(DAW, IDW, DEW, USW, SST);
+      f_payload_width = SST[G_INDX_SS_TUSER] ? cur_indx + USW : cur_indx;
+      // Ensure that the return value is never less than 1
+      f_payload_width = (f_payload_width < 1) ? 1 : f_payload_width;
+    end
+  endfunction
+
+  task t_check_tdata_width(
+    input  integer    data_width,
+    input  [8*80-1:0] var_name,
+    input  [8*80-1:0] inst_name,
+    input  integer    severity_lvl,
+    output integer    ret_val
+  );
+    // Severity levels:
+    // 0 = INFO
+    // 1 = WARNING
+    // 2 = ERROR
+    begin : t_check_tdata_width
+      if (data_width%8 != 0) begin
+        //       000       1          2         3         4         5         6         7         8
+        //       012       0          0         0         0         0         0         0         0
+        if (severity_lvl >= 2) begin
+        $display("ERROR: %m::%s", inst_name);
+        end else if (severity_lvl == 1) begin
+        $display("WARNING: %m::%s", inst_name);
+        end else begin
+        $display("INFO: %m::%s", inst_name);
+        end
+        $display("       Parameter %s (%2d) must be a multiple of 8.", var_name, data_width);
+        $display("       AXI4-Stream data width is only defined for byte multiples. See the ");
+        $display("       AMBA4 AXI4-Stream Protocol Specification v1.0 Section 2.1 for more");
+        $display("       information.");
+        ret_val = 1;
+      end else begin
+        ret_val = 0;
+      end
+    end
+  endtask
+
+  task t_check_tuser_width(
+    input  integer    tuser_width,
+    input  [8*80-1:0] tuser_name,
+    input  integer    tdata_width,
+    input  [8*80-1:0] tdata_name,
+    input  [8*80-1:0] inst_name,
+    input  integer    severity_lvl,
+    output integer    ret_val
+  );
+    // Severity levels:
+    // 0 = INFO
+    // 1 = WARNING
+    // 2 = ERROR
+    begin : t_check_tuser_width
+      integer tdata_bytes;
+      tdata_bytes = tdata_width/8;
+      if ((tuser_width%tdata_bytes) != 0) begin
+        //       000       1          2         3         4         5         6         7         8
+        //       012       0          0         0         0         0         0         0         0
+        if (severity_lvl >= 2) begin
+        $display("ERROR: %m::%s", inst_name);
+        end else if (severity_lvl == 1) begin
+        $display("WARNING: %m::%s", inst_name);
+        end else begin
+        $display("INFO: %m::%s", inst_name);
+        end
+        $display("       Parameter %s == %2d is not the recommended value of 'an integer ", tuser_name, tuser_width);
+        $display("       multiple of the width of the interface (%s == %2d) in bytes.'  AXI4-Stream", tdata_name, tdata_width);
+        $display("       TUSER width in this module is only defined when the TUSER is the");
+        $display("       recommended value.  See the AMBA4 AXI4-Stream Protocol Specification v1.0");
+        $display("       Section 2.1, 2.3.3 and 2.8 for more information.  ");
+        ret_val = 1;
+      end else begin
+        ret_val = 0;
+      end
+    end
+  endtask
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 429bc34ffc59b5d98bb559f36ac557de4dbba92f..7e4ab34af79c52a08e737f57b2fc8f017940bcf5 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -48,6 +48,10 @@ from finn.transformation.fpgadataflow.minimize_accumulator_width import (
 class InferConvInpGen(Transformation):
     """Convert Im2Col layers to ConvolutionInputGenerator layers."""
 
+    def __init__(self, use_rtl_variant=False):
+        super().__init__()
+        self.use_rtl_variant = use_rtl_variant
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -128,105 +132,144 @@ class InferConvInpGen(Transformation):
                     )
                     graph.node.insert(node_ind, padding_node)
 
-                # Ensure that only supported HLS nodes are inserted
+                is_kernel_pointwise = k_h == 1 and k_w == 1
                 is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w
                 is_square_kernel = k_h == k_w
-                is_kernel_pointwise = k_h == 1 and k_w == 1
                 is_equal_stride = stride_h == stride_w
                 is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or (
                     k_h > 1 and k_w == 1 and ifm_dim_w == 1
                 )
 
-                if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
-                    assert is_square_image, (
-                        "%s : DownSampler currently only supports square input images."
-                        % n.name
-                    )
-                    assert is_equal_stride, (
-                        """%s : DownSampler currently only supports equal stride value
-                        along different axes."""
-                        % n.name
-                    )
-                    ConvInpGen_idim = ConvInpGen_idim_h
-                    stride = stride_h
-                    # create DownSampler node
+                # Ensure that RTL variant is not inserted for unsupported configuration
+                is_rtl_variant_compatible = True
+                if is_kernel_pointwise:
+                    is_rtl_variant_compatible = False
+                    if self.use_rtl_variant:
+                        warnings.warn(
+                            """%s : RTL ConvInpGen requested for unsupported
+                                configuration. Falling back to HLS implementation."""
+                            % n.name
+                        )
+
+                if self.use_rtl_variant and is_rtl_variant_compatible:
+
                     ConvInpGen_node = helper.make_node(
-                        "DownSampler",
+                        "ConvolutionInputGenerator_rtl",
                         [ConvInpGen_input],
                         [i2c_output],
                         domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
-                        ImgDim=ConvInpGen_idim,
-                        NumChannels=ifm_ch,
+                        ConvKernelDim=[k_h, k_w],
+                        IFMChannels=ifm_ch,
+                        IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                        OFMDim=[ofm_dim_h, ofm_dim_w],
                         SIMD=ifm_ch,
-                        Stride=stride,
+                        M=1,
+                        parallel_window=0,
+                        Stride=[stride_h, stride_w],
+                        Dilation=[dilation_h, dilation_w],
                         inputDataType=dt.name,
-                        name="DownSampler_" + n.name,
+                        outputDataType=dt.name,
+                        depthwise=depthwise,
+                        name="ConvolutionInputGenerator_rtl_" + n.name,
                     )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 else:
-                    # create equivalent ConvolutionInputGenerator node
-                    if (
-                        is_square_image and is_square_kernel
-                    ):  # square images and square kernels
-                        assert is_equal_stride, (
-                            """%s: Non-equal strides along different axes is not supported
-                            for (non-)square convolutions"""
-                            % n.name
-                        )
-                        assert dilation_h == 1 and dilation_w == 1, (
-                            """%s: Dilation value != 1 is not supported
-                            for square convolutions"""
-                            % n.name
+                    # Ensure that only supported HLS nodes are inserted
+                    if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
+                        downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1)
+                        is1D_unitx = ifm_dim_w == 1
+                        downsample_2D = (
+                            (not downsample_1D) and is_square_image and is_equal_stride
                         )
+                        if not (downsample_1D or downsample_2D):
+                            warnings.warn(
+                                f"Couldn't infer Downsample from {n.name},check config."
+                            )
+                            continue
+                        ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w)
+                        stride = max(stride_h, stride_w)
+                        # create DownSampler node
                         ConvInpGen_node = helper.make_node(
-                            "ConvolutionInputGenerator",
+                            "DownSampler",
                             [ConvInpGen_input],
                             [i2c_output],
                             domain="finn.custom_op.fpgadataflow",
                             backend="fpgadataflow",
-                            ConvKernelDim=[k_h, k_w],
-                            IFMChannels=ifm_ch,
-                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
-                            OFMDim=[ofm_dim_h, ofm_dim_w],
+                            ImgDim=ConvInpGen_idim,
+                            NumChannels=ifm_ch,
                             SIMD=ifm_ch,
-                            Stride=[stride_h, stride_w],
-                            Dilation=[dilation_h, dilation_w],
+                            Stride=stride,
                             inputDataType=dt.name,
-                            outputDataType=dt.name,
-                            depthwise=depthwise,
-                            name="ConvolutionInputGenerator_" + n.name,
+                            name="DownSampler_" + n.name,
+                            is1D=downsample_1D,
+                            is1D_unitx=is1D_unitx,
                         )
-                    else:  # 1D images and/or kernels
-                        assert is_1d_convolution, (
-                            "%s: ConvolutionInputGenerator1D works only for 1D convs"
-                            % n.name
-                        )
-                        if dilation_h > 1 or dilation_w > 1:
-                            assert depthwise == 1, (
-                                """%s: Dilation value > 1 is only supported for
-                                1D depthwise separable convolutions"""
+                        graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                    else:
+                        # create equivalent ConvolutionInputGenerator node
+                        if (
+                            is_square_image and is_square_kernel
+                        ):  # square images and square kernels
+                            assert is_equal_stride, (
+                                """%s: Non-equal strides along different axes is not supported
+                                for (non-)square convolutions"""
                                 % n.name
                             )
-                        ConvInpGen_node = helper.make_node(
-                            "ConvolutionInputGenerator1D",
-                            [ConvInpGen_input],
-                            [i2c_output],
-                            domain="finn.custom_op.fpgadataflow",
-                            backend="fpgadataflow",
-                            ConvKernelDim=[k_h, k_w],
-                            IFMChannels=ifm_ch,
-                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
-                            OFMDim=[ofm_dim_h, ofm_dim_w],
-                            SIMD=ifm_ch,
-                            Stride=[stride_h, stride_w],
-                            Dilation=[dilation_h, dilation_w],
-                            inputDataType=dt.name,
-                            outputDataType=dt.name,
-                            depthwise=depthwise,
-                            name="ConvolutionInputGenerator1D_" + n.name,
-                        )
-                    graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                            assert dilation_h == 1 and dilation_w == 1, (
+                                """%s: Dilation value != 1 is not supported
+                                for square convolutions"""
+                                % n.name
+                            )
+                            ConvInpGen_node = helper.make_node(
+                                "ConvolutionInputGenerator",
+                                [ConvInpGen_input],
+                                [i2c_output],
+                                domain="finn.custom_op.fpgadataflow",
+                                backend="fpgadataflow",
+                                ConvKernelDim=[k_h, k_w],
+                                IFMChannels=ifm_ch,
+                                IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                                OFMDim=[ofm_dim_h, ofm_dim_w],
+                                SIMD=ifm_ch,
+                                Stride=[stride_h, stride_w],
+                                Dilation=[dilation_h, dilation_w],
+                                inputDataType=dt.name,
+                                outputDataType=dt.name,
+                                depthwise=depthwise,
+                                name="ConvolutionInputGenerator_" + n.name,
+                            )
+                        else:  # 1D images and/or kernels
+                            assert is_1d_convolution, (
+                                """%s: ConvolutionInputGenerator1D works only
+                                for 1D convs"""
+                                % n.name
+                            )
+                            if dilation_h > 1 or dilation_w > 1:
+                                assert depthwise == 1, (
+                                    """%s: Dilation value > 1 is only supported for
+                                    1D depthwise separable convolutions"""
+                                    % n.name
+                                )
+                            ConvInpGen_node = helper.make_node(
+                                "ConvolutionInputGenerator1D",
+                                [ConvInpGen_input],
+                                [i2c_output],
+                                domain="finn.custom_op.fpgadataflow",
+                                backend="fpgadataflow",
+                                ConvKernelDim=[k_h, k_w],
+                                IFMChannels=ifm_ch,
+                                IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                                OFMDim=[ofm_dim_h, ofm_dim_w],
+                                SIMD=ifm_ch,
+                                Stride=[stride_h, stride_w],
+                                Dilation=[dilation_h, dilation_w],
+                                inputDataType=dt.name,
+                                outputDataType=dt.name,
+                                depthwise=depthwise,
+                                name="ConvolutionInputGenerator1D_" + n.name,
+                            )
+                        graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
@@ -870,6 +913,10 @@ class InferVectorVectorActivation(Transformation):
     a depthwise convolution. Any immediately following MultiThreshold
     layers will also be absorbed into the VVAU."""
 
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -970,6 +1017,7 @@ class InferVectorVectorActivation(Transformation):
                             ActVal=actval,
                             noActivation=0,
                             name="VectorVectorActivation_" + n.name,
+                            mem_mode=self.mem_mode,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old nodes
@@ -1234,6 +1282,7 @@ class InferDuplicateStreamsLayer(Transformation):
                     inputDataType=dt.name,
                     numInputVectors=vecs,
                     NumOutputStreams=n_outputs,
+                    outFIFODepths=[2] * n_outputs,
                     name="DuplicateStreams_Batch_" + node.name,
                 )
 
@@ -1661,6 +1710,7 @@ class InferConcatLayer(Transformation):
                     ElemsPerStream=elems_per_stream,
                     inputDataType=dt0.name,
                     numInputVectors=inp_vec,
+                    inFIFODepths=[2] * len(node.input),
                 )
                 graph.node.insert(node_ind, new_node)
                 # remove old node
@@ -1671,3 +1721,95 @@ class InferConcatLayer(Transformation):
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferStreamingEltwise(Transformation):
+    """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer
+    with SubEltwise or AbsDiffEltwise op."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Sub":
+                in0 = node.input[0]
+                in1 = node.input[1]
+                result = node.output[0]
+                in0_shape = model.get_tensor_shape(in0)
+                in1_shape = model.get_tensor_shape(in1)
+
+                # skip if different shapes on inputs
+                if in0_shape != in1_shape:
+                    continue
+
+                idt0 = model.get_tensor_datatype(in0)
+                idt1 = model.get_tensor_datatype(in1)
+
+                # skip conversion for layers with float input
+                if not (idt0.is_integer() and idt1.is_integer()):
+                    continue
+
+                eltwiseOp = "Sub"
+                nodes_to_remove = [node]
+                # look for a downstream Abs node
+                res_consumer = model.find_consumer(result)
+                if (res_consumer is not None) and (res_consumer.op_type == "Abs"):
+                    eltwiseOp = "AbsDiff"
+                    result = res_consumer.output[0]
+                    nodes_to_remove.append(res_consumer)
+
+                # check layout and convert if necessary
+                in0_layout = model.get_tensor_layout(in0)
+                in1_layout = model.get_tensor_layout(in1)
+                result_layout = model.get_tensor_layout(result)
+
+                if in0_layout == DataLayout.NCHW:
+                    in0 = nchw_to_nhwc(in0, model, node_ind)
+                    node_ind += 1
+                    in0_shape = model.get_tensor_shape(in0)
+
+                if in1_layout == DataLayout.NCHW:
+                    in1 = nchw_to_nhwc(in1, model, node_ind)
+                    node_ind += 1
+                    in1_shape = model.get_tensor_shape(in1)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                if result_layout == DataLayout.NCHW:
+                    result = nchw_to_nhwc(result, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # now safe to assume num_channels is size of last dimension
+                num_channels = int(in0_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+
+                # create and insert new Eltwise node
+                new_node = helper.make_node(
+                    "StreamingEltwise",
+                    [in0, in1],
+                    [result],
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    NumChannels=num_channels,
+                    PE=pe,
+                    inputDataType0=idt0.name,
+                    inputDataType1=idt1.name,
+                    eltwiseOp=eltwiseOp,
+                    numInputVectors=in0_shape[:-1],
+                    name="StreamingEltwise_" + node.name,
+                )
+                graph.node.insert(insert_point, new_node)
+                # remove old nodes
+                for nd in nodes_to_remove:
+                    graph.node.remove(nd)
+                graph_modified = True
+
+        # if graph_modified:
+        # model = model.transform(InferShapes())
+        # model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 892ab09fdf41947f86e2bf122e057e94585dfa8c..52e4e88b409766f0764d3ce7666dbf1971713575 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -228,6 +228,22 @@ class CreateStitchedIP(Transformation):
             )
             self.s_axis_idx += 1
 
+    def connect_ap_none_external(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"]
+        # make external
+        for i in range(len(input_intf_names)):
+            input_intf_name = input_intf_names[i]
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, input_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name %s [get_bd_ports %s_0]"
+                % (input_intf_name, input_intf_name)
+            )
+
     def insert_signature(self, checksum_count):
         signature_vlnv = "AMD:user:axi_info_top:1.0"
         signature_name = "axi_info_top0"
@@ -275,7 +291,7 @@ class CreateStitchedIP(Transformation):
             "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name
         )
         self.connect_cmds.append(
-            "set_property name s_axis_info [get_bd_intf_ports s_axi_0]"
+            "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]"
         )
         self.connect_cmds.append("assign_bd_address")
 
@@ -305,6 +321,7 @@ class CreateStitchedIP(Transformation):
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
             self.connect_clk_rst(node)
+            self.connect_ap_none_external(node)
             self.connect_axi(node)
             for i in range(len(node.input)):
                 if not is_external_input(model, node, i):
@@ -387,6 +404,7 @@ class CreateStitchedIP(Transformation):
         wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
         tcl.append("add_files -norecurse %s" % wrapper_filename)
         model.set_metadata_prop("wrapper_filename", wrapper_filename)
+        tcl.append("set_property top finn_design_wrapper [current_fileset]")
         # synthesize to DCP and export stub, DCP and constraints
         if self.vitis:
             tcl.append(
@@ -565,6 +583,10 @@ class CreateStitchedIP(Transformation):
             if os.path.isfile(wrapper_filename_alt):
                 model.set_metadata_prop("wrapper_filename", wrapper_filename_alt)
             else:
-                raise Exception("CreateStitchedIP failed, no wrapper HDL found.")
+                raise Exception(
+                    """CreateStitchedIP failed, no wrapper HDL found under %s or %s.
+                    Please check logs under the parent directory."""
+                    % (wrapper_filename, wrapper_filename_alt)
+                )
 
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py
new file mode 100644
index 0000000000000000000000000000000000000000..822679721036c7832241db4642911ff804fb9dff
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2022, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import qonnx.custom_op.registry as registry
+import warnings
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.base import NodeLocalTransformation
+
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+class DeriveCharacteristic(NodeLocalTransformation):
+    """For each node in the graph, run rtlsim to obtain the i/o
+    characteristic function for FIFO sizing and set the attribute.
+    It is assumed that the PrepareRTLSim transformation was already
+    called on the graph.
+
+    This transformation performs rtlsim for each node, so it will run for
+    some time (minutes to hours depending on configuration).
+
+    * period (int) desired period over which the characteristic function
+      will be derived.
+
+    * num_workers (int or None) number of parallel workers, see documentation in
+      NodeLocalTransformation for more details.
+    """
+
+    def __init__(self, period, num_workers=None, manual_bypass=False):
+        super().__init__(num_workers=num_workers)
+        self.period = period
+        self.manual_bypass = manual_bypass
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                inst = registry.getCustomOp(node)
+                inst.derive_characteristic_fxns(period=self.period)
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
+
+    def apply(self, model: ModelWrapper):
+        (model, run_again) = super().apply(model)
+        if not self.manual_bypass:
+            return (model, run_again)
+        # apply manual fix for DuplicateStreams and AddStreams for
+        # simple residual reconvergent paths with bypass
+        addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch")
+        for addstrm_node in addstrm_nodes:
+            # we currently only support the case where one branch is
+            # a bypass
+            b0 = model.find_producer(addstrm_node.input[0])
+            b1 = model.find_producer(addstrm_node.input[1])
+            if (b0 is None) or (b1 is None):
+                warnings.warn("Found unsupported AddStreams, skipping")
+                return (model, run_again)
+            b0_is_bypass = b0.op_type == "DuplicateStreams_Batch"
+            b1_is_bypass = b1.op_type == "DuplicateStreams_Batch"
+            if (not b0_is_bypass) and (not b1_is_bypass):
+                warnings.warn("Found unsupported AddStreams, skipping")
+                return (model, run_again)
+            ds_node = b0 if b0_is_bypass else b1
+            comp_branch_last = b1 if b0_is_bypass else b0
+
+            ds_comp_bout = ds_node.output[0] if b0_is_bypass else ds_node.output[1]
+            comp_branch_first = model.find_consumer(ds_comp_bout)
+            if comp_branch_first is None or comp_branch_last is None:
+                warnings.warn("Found unsupported DuplicateStreams, skipping")
+                return (model, run_again)
+            comp_branch_last = registry.getCustomOp(comp_branch_last)
+            comp_branch_first = registry.getCustomOp(comp_branch_first)
+            # for DuplicateStreams, use comp_branch_first's input characterization
+            # for AddStreams, use comp_branch_last's output characterization
+            period = comp_branch_first.get_nodeattr("io_chrc_period")
+            comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[
+                : 2 * period
+            ]
+            comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[
+                2 * period :
+            ]
+            ds_node_inst = registry.getCustomOp(ds_node)
+            addstrm_node_inst = registry.getCustomOp(addstrm_node)
+            ds_node_inst.set_nodeattr("io_chrc_period", period)
+            ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2)
+            addstrm_node_inst.set_nodeattr("io_chrc_period", period)
+            addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2)
+            warnings.warn(
+                f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}"
+            )
+            warnings.warn(
+                f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}"
+            )
+        return (model, run_again)
+
+
+class DeriveFIFOSizes(NodeLocalTransformation):
+    """Prerequisite: DeriveCharacteristic already called on graph.
+    For each node in the graph, use the accumulated I/O characteristic function
+    to perform FIFO sizing, setting the in/outFIFODepth attributes of HLSCustomOp
+    nodes.
+
+    * num_workers (int or None) number of parallel workers, see documentation in
+      NodeLocalTransformation for more details.
+    """
+
+    def __init__(self, num_workers=None):
+        super().__init__(num_workers=num_workers)
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                prod = registry.getCustomOp(node)
+                assert op_type != "StreamingFIFO", "Found existing FIFOs"
+                period = prod.get_nodeattr("io_chrc_period")
+                prod_chrc = prod.get_nodeattr("io_chrc_out")[0]
+                assert (
+                    len(prod_chrc) == 2 * period
+                ), "Found unexpected characterization attribute"
+                if any([x > 2 for x in prod.get_nodeattr("outFIFODepths")]):
+                    # FIFO depth already set, can skip this node
+                    return (node, False)
+
+                # find consumers
+                model = self.ref_input_model
+                out_fifo_depths = []
+                for output_name in node.output:
+                    cons_node = model.find_consumer(output_name)
+                    if cons_node is None:
+                        # could be final node, will be overridden if so
+                        # need an entry in the list anyway
+                        out_fifo_depths.append(2)
+                        continue
+                    cons = registry.getCustomOp(cons_node)
+                    cons_chrc = cons.get_nodeattr("io_chrc_in")[0]
+                    # find minimum phase shift satisfying the constraint
+                    pshift_min = period - 1
+                    for pshift_cand in range(period):
+                        prod_chrc_part = prod_chrc[pshift_cand:period]
+                        cons_chrc_part = cons_chrc[: period - pshift_cand]
+                        if (prod_chrc_part >= cons_chrc_part).all():
+                            pshift_min = pshift_cand
+                            break
+                    prod_chrc_part = prod_chrc[pshift_min : (pshift_min + period)]
+                    cons_chrc_part = cons_chrc[:period]
+                    fifo_depth = int((prod_chrc_part - cons_chrc_part).max())
+                    out_fifo_depths.append(fifo_depth)
+                # set output FIFO depth for this (producing) node
+                # InsertFIFO looks at the max of (outFIFODepth, inFIFODepth)
+                # for each tensor
+                prod.set_nodeattr("outFIFODepths", out_fifo_depths)
+
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..efc179923545eb06e4d173c683b0941887f8bb79 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -81,6 +81,12 @@ class InsertDWC(Transformation):
                             dwc_in_width = n0.get_outstream_width()
                             # determine dwc outwidth
                             dwc_out_width = n1.get_instream_width()
+                            larger_width = max(dwc_in_width, dwc_out_width)
+                            smaller_width = min(dwc_in_width, dwc_out_width)
+                            if larger_width % smaller_width == 0:
+                                impl_style = "hls"
+                            else:
+                                impl_style = "vivado"
 
                             # determine shape for dwc
                             dwc_shape = n0.get_normal_output_shape()
@@ -105,6 +111,7 @@ class InsertDWC(Transformation):
                                 inWidth=dwc_in_width,
                                 outWidth=dwc_out_width,
                                 dataType=str(dtype.name),
+                                impl_style=impl_style,
                             )
                             # insert dwc
                             graph.node.insert(node_ind + 1, dwc_node)
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 78200b280960ad53e3e84d44394c10296c432ba5..79bd717a5d96e7a9839740d73254db53e5133e13 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -70,16 +70,26 @@ class InsertFIFO(Transformation):
     node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth'
     of the subsequent node. max() of these two values sets the FIFO depth.
 
-    Normally, shallow-depth (<=2) FIFOs won't be created since HLS streaming
-    interfaces already have a degree of buffering. You can set
-    create_shallow_fifos=True to override this default behavior.
+    Constructor arguments:
+    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
+                       Verilog FIFOs (Q_srl.v)
+    - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
+                          large FIFOs implemented by Vivado
+    - create_shallow_fifos : Normally, shallow-depth (<=2) FIFOs won't be created since
+                            HLS streaming interfaces already have a degree of buffering.
+                            Override with this parameter.
+
 
     The other node attributes necessary to create a FIFO node are taken from the
     node the FIFO node is inserted after: 'folded_shape' and 'dtype'"""
 
-    def __init__(self, create_shallow_fifos=False):
+    def __init__(
+        self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto"
+    ):
         super().__init__()
         self.create_shallow_fifos = create_shallow_fifos
+        self.max_qsrl_depth = max_qsrl_depth
+        self.vivado_ram_style = vivado_ram_style
 
     def apply(self, model):
         graph = model.graph
@@ -88,8 +98,8 @@ class InsertFIFO(Transformation):
         for first_node in graph.node:
             node_ind += 1
             if _suitable_node(first_node):
-                for n_output in first_node.output:
-                    consumers = model.find_consumers(n_output)
+                for idx_out, output_name in enumerate(first_node.output):
+                    consumers = model.find_consumers(output_name)
                     if consumers == []:
                         continue
                     if len(consumers) > 1:
@@ -108,11 +118,9 @@ class InsertFIFO(Transformation):
                         # input of the second node is equal
                         n1 = getCustomOp(consumer)
                         for idx, inp in enumerate(consumer.input):
-                            if inp == n_output:
-                                if idx == 0:
-                                    fld_shape_2 = n1.get_folded_input_shape()
-                                else:
-                                    fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                            if inp == output_name:
+                                fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                                idx_inp = idx
                         assert _suitable_folded_shapes(
                             fld_shape, fld_shape_2
                         ), """The
@@ -122,12 +130,10 @@ class InsertFIFO(Transformation):
 
                         # check if outFIFOdepth attribute of first node
                         # and inFIFOdepth attribute of consumer node is equal
-                        n0_depth = n0.get_nodeattr("outFIFODepth")
-                        n1_depth = n1.get_nodeattr("inFIFODepth")
-                        if n0_depth == n1_depth:
-                            fifo_depth = n0_depth
-                        elif n0_depth != n1_depth:
-                            fifo_depth = max(n0_depth, n1_depth)
+                        n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out]
+                        n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp]
+
+                        fifo_depth = max(n0_depth, n1_depth)
 
                         if fifo_depth > 2 or self.create_shallow_fifos:
                             # assumption: HLS streaming components already have
@@ -143,25 +149,40 @@ class InsertFIFO(Transformation):
                             graph.value_info.append(fifo_output_tensor)
                             model.set_tensor_datatype(fifo_output_tensor.name, dtype)
 
+                            if (
+                                self.max_qsrl_depth is None
+                                or fifo_depth <= self.max_qsrl_depth
+                            ):
+                                impl_style = "rtl"
+                            else:
+                                impl_style = "vivado"
+
                             fifo_node = oh.make_node(
                                 "StreamingFIFO",
-                                [n_output],
+                                [output_name],
                                 [fifo_output_tensor.name],
                                 domain="finn.custom_op.fpgadataflow",
                                 backend="fpgadataflow",
                                 depth=fifo_depth,
                                 folded_shape=fld_shape,
                                 dataType=str(dtype.name),
+                                impl_style=impl_style,
+                                ram_style=self.vivado_ram_style,
                             )
                             # insert fifo
                             graph.node.insert(node_ind + 1, fifo_node)
                             # set fifo output tensor as new input tensor of second node
                             for idx, inp in enumerate(consumer.input):
-                                if inp == n_output:
+                                if inp == output_name:
                                     consumer.input[idx] = fifo_output_tensor.name
                             # ensure created FIFO depth is reflected on both sides
-                            n0.set_nodeattr("outFIFODepth", fifo_depth)
-                            n1.set_nodeattr("inFIFODepth", fifo_depth)
+                            odepths = n0.get_nodeattr("outFIFODepths")
+                            odepths[idx_out] = fifo_depth
+                            n0.set_nodeattr("outFIFODepths", odepths)
+                            idepths = n1.get_nodeattr("inFIFODepths")
+                            idepths[idx_inp] = fifo_depth
+                            n1.set_nodeattr("inFIFODepths", idepths)
+
                             graph_modified = True
 
         if graph_modified is False:
@@ -177,13 +198,9 @@ class InsertFIFO(Transformation):
                     n_input = first_node.input[inp_ind]
                     n0 = getCustomOp(first_node)
                     # determine fifo node attributes
-                    if inp_ind == 0:
-                        fld_shape = n0.get_folded_input_shape()
-                        dtype = n0.get_input_datatype()
-                    else:
-                        fld_shape = n0.get_folded_input_shape(inp_ind)
-                        dtype = n0.get_input_datatype(inp_ind)
-                    fifo_depth = n0.get_nodeattr("inFIFODepth")
+                    fld_shape = n0.get_folded_input_shape(inp_ind)
+                    dtype = n0.get_input_datatype(inp_ind)
+                    fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind]
 
                     if fifo_depth <= 2:
                         warnings.warn("Overriding input FIFO depth to 32")
@@ -198,6 +215,11 @@ class InsertFIFO(Transformation):
                     graph.value_info.append(fifo_output_tensor)
                     model.set_tensor_datatype(fifo_output_tensor.name, dtype)
 
+                    if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth:
+                        impl_style = "rtl"
+                    else:
+                        impl_style = "vivado"
+
                     fifo_node = oh.make_node(
                         "StreamingFIFO",
                         [n_input],
@@ -207,6 +229,8 @@ class InsertFIFO(Transformation):
                         depth=fifo_depth,
                         folded_shape=fld_shape,
                         dataType=str(dtype.name),
+                        impl_style=impl_style,
+                        ram_style=self.vivado_ram_style,
                     )
                     # insert fifo
                     graph.node.insert(0, fifo_node)
@@ -227,10 +251,11 @@ class InsertFIFO(Transformation):
                     ), """Insert tlast marker should be done
                         after inserting the FIFOs"""
                     n0 = getCustomOp(final_node)
+                    out_ind = list(final_node.output).index(graph_out_name)
                     # determine fifo node attributes
-                    fld_shape = n0.get_folded_output_shape()
-                    dtype = n0.get_output_datatype()
-                    fifo_depth = n0.get_nodeattr("outFIFODepth")
+                    fld_shape = n0.get_folded_output_shape(out_ind)
+                    dtype = n0.get_output_datatype(out_ind)
+                    fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind]
 
                     if fifo_depth <= 2:
                         warnings.warn("Overriding output FIFO depth to 32")
@@ -245,6 +270,11 @@ class InsertFIFO(Transformation):
                     graph.value_info.append(fifo_input_tensor)
                     model.set_tensor_datatype(fifo_input_tensor.name, dtype)
 
+                    if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth:
+                        impl_style = "rtl"
+                    else:
+                        impl_style = "vivado"
+
                     fifo_node = oh.make_node(
                         "StreamingFIFO",
                         [fifo_input_tensor.name],
@@ -254,6 +284,8 @@ class InsertFIFO(Transformation):
                         depth=fifo_depth,
                         folded_shape=fld_shape,
                         dataType=str(dtype.name),
+                        impl_style=impl_style,
+                        ram_style=self.vivado_ram_style,
                     )
                     # insert fifo
                     graph.node.append(fifo_node)
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index a589cb039c825ff97c11df7ffa57109df27f3fd0..f48566326e576f4d39d81359fe7f28a12645a635 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -45,7 +45,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.util.basic import make_build_dir, pynq_part_map
+from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map
 
 from . import templates
 
@@ -320,6 +320,7 @@ class ZynqBuild(Transformation):
     ):
         super().__init__()
         self.fpga_part = pynq_part_map[platform]
+        self.axi_port_width = pynq_native_port_width[platform]
         self.period_ns = period_ns
         self.platform = platform
         self.enable_debug = enable_debug
@@ -330,7 +331,7 @@ class ZynqBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            InsertIODMA(64),
+            InsertIODMA(self.axi_port_width),
             InsertDWC(),
             Floorplan(),
             CreateDataflowPartition(partition_model_dir=self.partition_model_dir),
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..f715aaeffb6d4d00f2e14c5fb25ec931443d5d97 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -192,10 +192,11 @@ class InsertAndSetFIFODepths(Transformation):
     - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
                        Verilog FIFOs (Q_srl.v)
     - max_depth : how deep the "max"-sized FIFOs initially inserted will be
+                   if set to None, use the tensor size as the depth
     - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
                         smaller where appropriate
     - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
-                          large FIFOs implemented by Vivado
+                          large FIFOs implemented by Vivado afterwards
 
     Assumed input graph properties:
     - all nodes are fpgadataflow nodes
@@ -210,7 +211,7 @@ class InsertAndSetFIFODepths(Transformation):
     necessary to insert FIFOs between them to prevent stalls due to bursty
     behavior. The sizes of those FIFOs are hard to predict analytically, so
     we do the following:
-    - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes
+    - insert deep (=tensor size) FIFOs between all fpgadataflow nodes
     - create stitched design
     - run through rtlsim with stream of multiple random input images (to fill pipeline)
     - keep track of observed maximum occupancy for each FIFO during rtlsim
@@ -223,7 +224,7 @@ class InsertAndSetFIFODepths(Transformation):
         fpgapart,
         clk_ns=10.0,
         max_qsrl_depth=256,
-        max_depth=2**14,
+        max_depth=None,
         swg_exception=True,
         vivado_ram_style="auto",
     ):
@@ -236,6 +237,9 @@ class InsertAndSetFIFODepths(Transformation):
         self.vivado_ram_style = vivado_ram_style
 
     def apply(self, model):
+        # these optypes may potentially use external weights
+        # we'll temporarily change them to use decoupled mode for FIFO sizing
+        extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"]
         # change external to decoupled and warn user
         # this way we are sure we have exactly one input/output
         modified_fc_nodes = []
@@ -246,9 +250,22 @@ class InsertAndSetFIFODepths(Transformation):
             )
             assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
             node = getCustomOp(node)
-            node.set_nodeattr("inFIFODepth", self.max_depth)
-            node.set_nodeattr("outFIFODepth", self.max_depth)
-            if node.onnx_node.op_type == "MatrixVectorActivation":
+            ifd = node.get_nodeattr("inFIFODepths")
+            ofd = node.get_nodeattr("outFIFODepths")
+            if self.max_depth is not None:
+                ifd = [self.max_depth] * len(ifd)
+                ofd = [self.max_depth] * len(ofd)
+            else:
+                # set each FIFO to its tensor size
+                # (except stream width hence the :-1)
+                for i in range(len(ifd)):
+                    ifd[i] = np.prod(node.get_folded_input_shape(i)[:-1])
+                for o in range(len(ofd)):
+                    ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1])
+            node.set_nodeattr("inFIFODepths", ifd)
+            node.set_nodeattr("outFIFODepths", ofd)
+
+            if node.onnx_node.op_type in extw_optypes:
                 mmode = node.get_nodeattr("mem_mode")
                 if mmode == "external":
                     modified_fc_nodes.append(node.onnx_node.name)
@@ -267,13 +284,17 @@ class InsertAndSetFIFODepths(Transformation):
 
         # gather FIFO names, check they are of expected depth
         fifos = {}
-        for node in model.graph.node:
-            if node.op_type == "StreamingFIFO":
-                fifos[node.name] = 0
-                node = getCustomOp(node)
-                # check depths and fix as necessary
-                if node.get_nodeattr("depth") != self.max_depth:
-                    node.set_nodeattr("depth", self.max_depth)
+        fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+        for node in fifo_nodes:
+            fifos[node.name] = 0
+            node = getCustomOp(node)
+            node.set_nodeattr("depth_monitor", 1)
+            node.set_nodeattr("impl_style", "rtl")
+            # check depths and fix as necessary
+            if (self.max_depth is not None) and (
+                node.get_nodeattr("depth") != self.max_depth
+            ):
+                node.set_nodeattr("depth", self.max_depth)
 
         # insert FIFOs and do all transformations for RTLsim
         model = model.transform(AnnotateCycles())
@@ -324,21 +345,6 @@ class InsertAndSetFIFODepths(Transformation):
             else:
                 set_signal(sim, "tvalid", 0)
 
-            # check/update all fifo counts
-            for key in fifos:
-                current_state = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["state"]
-                current_addr = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["addr"]
-                if current_state == 2:
-                    current_count = current_addr + 2
-                else:
-                    current_count = current_state
-                if current_count > fifos[key]:
-                    fifos[key] = current_count
-
             # since latency estimation is very pessimistic, detect first output
             # and fast-forward the sim
             if get_signal(sim, "tvalid") != 0 and not output_detected:
@@ -352,6 +358,12 @@ class InsertAndSetFIFODepths(Transformation):
                 "No output detected, calculated FIFO depths may not be correct"
             )
 
+        for ind, node in enumerate(fifo_nodes):
+            maxcount_name = "maxcount_%d" % ind
+            if ind == 0:
+                maxcount_name = "maxcount"
+            fifos[node.name] = sim[maxcount_name]
+
         # Apply depths back into the model;
         # also set in/outFIFODepth to zero for non-FIFO
         # nodes, preventing further FIFO insertion
@@ -364,6 +376,7 @@ class InsertAndSetFIFODepths(Transformation):
                 depth = optimize_depth(fifos[node.name])
                 node_inst = getCustomOp(node)
                 node_inst.set_nodeattr("depth", depth)
+                node_inst.set_nodeattr("depth_monitor", 0)
                 # Set FIFO implementation/ram styles
                 if depth > self.max_qsrl_depth:
                     node_inst.set_nodeattr("impl_style", "vivado")
@@ -374,11 +387,14 @@ class InsertAndSetFIFODepths(Transformation):
                 reset_implementation(node_inst)
                 del fifos[node.name]
             else:
-                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
-                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
-                # for every FC node we changed from external to decoupled,
+                inst = getCustomOp(node)
+                ifd = inst.get_nodeattr("inFIFODepths")
+                ofd = inst.get_nodeattr("outFIFODepths")
+                inst.set_nodeattr("inFIFODepths", [0] * len(ifd))
+                inst.set_nodeattr("outFIFODepths", [0] * len(ofd))
+                # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
-                if node.op_type == "MatrixVectorActivation":
+                if node.op_type in extw_optypes:
                     if node.name in modified_fc_nodes:
                         node_inst = getCustomOp(node)
                         node_inst.set_nodeattr("mem_mode", "external")
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 23943084ab99d6ab880a69975e0b4a49756905a7..e24e24f1f8ebb2873c81617884cd333311d8aea9 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -109,6 +109,7 @@ class SetFolding(Transformation):
             "FMPadding_Batch",
             "ConvolutionInputGenerator",
             "ConvolutionInputGenerator1D",
+            "ConvolutionInputGenerator_rtl",
         ]
         # these ops are preceded by depthwise SWG and have special behavior,
         # as explained in the SetFolding docstring
@@ -171,10 +172,7 @@ class SetFolding(Transformation):
                             "Expected SWU on DW op input, found " + swu_node.op_type
                         )
             elif op_type in simd_ops:
-                if op_type in [
-                    "ConvolutionInputGenerator",
-                    "ConvolutionInputGenerator1D",
-                ]:
+                if op_type.startswith("ConvolutionInputGenerator"):
                     depthwise = node_inst.get_nodeattr("depthwise")
                     if depthwise == 0:
                         max_simd = node_inst.get_nodeattr("IFMChannels")
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 3e815c1537353cc2be970a2068d4ded30cc48bc8..29eefacc32370598ddcd39283d022f5eb61f3f0c 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -725,6 +725,77 @@ class MakeMaxPoolNHWC(Transformation):
         return (model, graph_modified)
 
 
+class MakeScaleResizeNHWC(Transformation):
+    """
+    Converts the inputs and outputs for all scales Resize and Upsample nodes
+    from NCHW to NHWC.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                if model.get_tensor_layout(n.input[0]) != DataLayout.NCHW:
+                    warnings.warn(
+                        "%s: Input not NCHW. Can't operate transformation on node."
+                        % n.name
+                    )
+                    continue
+                consumer = model.find_consumer(n.output[0])
+                producer = model.find_producer(n.input[0])
+                if n.op_type == "Upsample":
+                    scales_ind = 1
+                else:
+                    scales_ind = 2
+                if producer is not None and producer.op_type == "Transpose":
+                    perms = list(get_by_name(producer.attribute, "perm").ints)
+                    if perms == [0, 3, 1, 2]:
+                        old_value = model.get_initializer(n.input[scales_ind])
+                        new_value = np.array(
+                            [old_value[idx] for idx in (0, 2, 3, 1)],
+                            dtype=np.dtype("float32"),
+                        )
+                        model.set_initializer(n.input[scales_ind], new_value)
+                        start_name = producer.input[0]
+                        mid_name = n.input[0]
+                        end_name = n.output[0]
+                        (b, hi, wi, c) = model.get_tensor_shape(start_name)
+                        (b, c, ho, wo) = model.get_tensor_shape(end_name)
+                        producer.input[0] = mid_name
+                        producer.output[0] = end_name
+                        n.input[0] = start_name
+                        n.output[0] = mid_name
+                        model.set_tensor_shape(mid_name, (b, ho, wo, c))
+                        model.set_tensor_shape(end_name, (b, c, ho, wo))
+                        graph.node.remove(producer)
+                        graph.node.insert(node_ind, producer)
+                elif consumer is not None and consumer.op_type == "Transpose":
+                    perms = list(get_by_name(consumer.attribute, "perm").ints)
+                    if perms == [0, 2, 3, 1]:
+                        old_value = model.get_initializer(n.input[scales_ind])
+                        new_value = np.array(
+                            [old_value[idx] for idx in (0, 2, 3, 1)],
+                            dtype=np.dtype("float32"),
+                        )
+                        model.set_initializer(n.input[scales_ind], new_value)
+                        start_name = n.input[0]
+                        mid_name = consumer.input[0]
+                        end_name = consumer.output[0]
+                        (b, c, hi, wi) = model.get_tensor_shape(start_name)
+                        (b, c, ho, wo) = model.get_tensor_shape(mid_name)
+                        consumer.input[0] = start_name
+                        consumer.output[0] = mid_name
+                        n.input[0] = mid_name
+                        n.output[0] = end_name
+                        model.set_tensor_shape(mid_name, (b, hi, wi, c))
+                        model.set_tensor_shape(end_name, (b, ho, wo, c))
+                        graph.node.remove(consumer)
+                        graph.node.insert(node_ind - 1, consumer)
+        return (model, False)
+
+
 class MoveOpPastFork(Transformation):
     """Move node operations past graph forks. Used when a node before a fork
     can be merged with nodes in the branches
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index f6a51da8e44ea60ae5693cdd033b39bdf51376ac..d7ed3e261fe024b7f054382f12184628d3f3e94c 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -26,7 +26,10 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pkg_resources as pk
+
 import os
+import shutil
 from pyverilator import PyVerilator
 
 from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
@@ -74,14 +77,35 @@ def pyverilate_stitched_ip(
     # are identical but in multiple directories (regslice_core.v)
 
     # remove duplicates from list by doing list -> set -> list
-    all_verilog_files = list(
-        set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs))
+    src_exts = [".v", ".sv"]
+
+    all_verilog_src_files = list(
+        set(
+            filter(
+                lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs
+            )
+        )
+    )
+
+    verilog_header_dir = make_build_dir("pyverilator_vh_")
+    # use custom version of axis infrastructure vh
+    # to enable Verilator to simulate AMD/Xilinx components (e.g DWC)
+    custom_vh = pk.resource_filename(
+        "finn.qnn-data", "verilog/custom_axis_infrastructure.vh"
     )
+    shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh")
+    for fn in all_verilog_srcs:
+        if fn.endswith(".vh"):
+            if "axis_infrastructure_v1_1_0.vh" in fn:
+                # skip, we use a custom version for this file without recursive gcd
+                continue
+            else:
+                shutil.copy(fn, verilog_header_dir)
 
     # remove all but one instances of regslice_core.v
     filtered_verilog_files = []
     remove_entry = False
-    for vfile in all_verilog_files:
+    for vfile in all_verilog_src_files:
         if "regslice_core" in vfile:
             if not remove_entry:
                 filtered_verilog_files.append(vfile)
@@ -94,7 +118,12 @@ def pyverilate_stitched_ip(
         for vfile in filtered_verilog_files:
             with open(vfile) as rf:
                 wf.write("//Added from " + vfile + "\n\n")
-                wf.write(rf.read())
+                lines = rf.read()
+                for line in lines.split("\n"):
+                    # break down too-long lines, Verilator complains otherwise
+                    if len(line) > 20000:
+                        line = line.replace("&", "\n&")
+                    wf.write("\n" + line)
 
     verilator_args = []
     # disable common verilator warnings that should be harmless but commonly occur
@@ -108,10 +137,20 @@ def pyverilate_stitched_ip(
     # force inlining of all submodules to ensure we can read internal signals properly
     if read_internal_signals:
         verilator_args += ["--inline-mult", "0"]
+    # add defines to make certain XPM src files work with Verilator
+    verilator_args.append("-DDISABLE_XPM_ASSERTIONS")
+    verilator_args.append("-DOBSOLETE")
+    verilator_args.append("-DONESPIN")
+    verilator_args.append("--bbox-unsup")
+    vivado_path = os.environ["VIVADO_PATH"]
+    # additional SystemVerilog modules to make XPMs work with Verilator
+    xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv"
+    xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv"
+    xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv"
 
     sim = PyVerilator.build(
-        top_module_file_name,
-        verilog_path=[vivado_stitch_proj_dir],
+        [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc],
+        verilog_path=[vivado_stitch_proj_dir, verilog_header_dir],
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index f5d3b1c30b8b7b439eae1c684ad84b33a3401c7c..bfe4aa0bb826c73f6a7c67f025e24764da8c36cc 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -180,6 +180,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=Fa
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", child_path)
+    sdp_node.set_nodeattr("return_full_exec_context", 1 if return_full_ctx else 0)
     ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
     if return_full_ctx:
         return ret
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 103f18b514c23c4e1ad35a85d020dc0481aa9c47..5f787d1f889645d04884aed9b89a0b1c91d1f418 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -569,8 +569,8 @@ class TestEnd2End:
         for node in hls_layers:
             if node.op_type != "StreamingFIFO":
                 op_inst = getCustomOp(node)
-                assert op_inst.get_nodeattr("inFIFODepth") == 0
-                assert op_inst.get_nodeattr("outFIFODepth") == 0
+                assert op_inst.get_nodeattr("inFIFODepths") == [0]
+                assert op_inst.get_nodeattr("outFIFODepths") == [0]
         model.save(
             get_checkpoint_name(
                 topology, wbits, abits, QONNX_export, "fifodepth_" + kind
diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
index 5bbaefac2d3e5f800fbb9471df6469235271c2f3..7b3e20616410f54e4718290baec9a510a0d49c0d 100644
--- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
@@ -66,11 +66,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
     ],
 )
 @pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_rtl_swg", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
+def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode):
     pad, kernel_size, stride, dilation = conv_config
     np.random.seed(0)
     idt = DataType["UINT4"]
@@ -84,6 +85,9 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     pad_h = pad[0] + pad[2]
     pad_w = pad[1] + pad[3]
 
+    if use_rtl_swg and exec_mode == "cppsim":
+        pytest.skip("cppsim not supported for RTL SWG")
+
     if depthwise is True:
         group = out_chn = in_chn
         conv_param_shape = [out_chn, 1, k_h, k_w]
@@ -139,7 +143,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     model = model.transform(InferDataTypes())
 
     new_model = model.transform(LowerConvsToMatMul())
-    new_model = new_model.transform(to_hls.InferConvInpGen())
+    new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg))
     if depthwise is True:
         new_model = new_model.transform(to_hls.InferVectorVectorActivation())
     else:
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index 55dc77cafb898ead28a7cbb9641e0b40db276919..8c9f110c315089ec03354863bf2213963197217a 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -57,11 +57,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
     "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)]
 )
 @pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_rtl_swg", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
+def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode):
     kernel_size, stride, pad = conv_config
     np.random.seed(0)
     idt = DataType["UINT4"]
@@ -69,6 +70,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     in_feature_dim = 7
     in_chn = 16
 
+    if use_rtl_swg and exec_mode == "cppsim":
+        pytest.skip("cppsim not supported for RTL SWG")
+
+    if use_rtl_swg and kernel_size == 1:
+        pytest.skip("1x1 kernel not supported by current RTL SWG")
+
     if depthwise is True:
         group = out_chn = in_chn
         conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
@@ -122,7 +129,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     model = model.transform(InferDataTypes())
 
     new_model = model.transform(LowerConvsToMatMul())
-    new_model = new_model.transform(to_hls.InferConvInpGen())
+    new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg))
     if depthwise is True:
         new_model = new_model.transform(to_hls.InferVectorVectorActivation())
     else:
@@ -156,6 +163,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     x = gen_finn_dt_tensor(idt, input_shape)
     inp_dict = {model.graph.input[0].name: x}
     assert oxe.compare_execution(model, new_model, inp_dict)
+
     if kernel_size == 1 and stride > 1 and pad == 0:
         assert new_model.graph.node[1].op_type == "DownSampler"
         if exec_mode == "rtlsim":
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fd1439bd055782692bac404622137e166ef5e07
--- /dev/null
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2022 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pytest
+
+import json
+import shutil
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    BrevitasONNXManager.export(model, ishape, chkpt_name)
+    return tmp_output_dir
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fifosizing_linear():
+    tmp_output_dir = fetch_test_model("tfc")
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=True,
+        auto_fifo_strategy="characterize",
+        target_fps=10000,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED,
+    )
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
+    assert (
+        float(sim_data["throughput[images/s]"])
+        / float(est_data["estimated_throughput_fps"])
+        > 0.9
+    )
+    shutil.rmtree(tmp_output_dir)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
new file mode 100755
index 0000000000000000000000000000000000000000..007360a5fd0b74ee49d54c84f332061dd5f3a114
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
@@ -0,0 +1,260 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    im2col_node = helper.make_node(
+        "Im2Col",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.general",
+        stride=[stride_h, stride_w],
+        kernel_size=[k_h, k_w],
+        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
+        dilations=[dilation_h, dilation_w],
+        pad_amount=[0, 0, 0, 0],
+        pad_value=0,
+    )
+    graph = helper.make_graph(
+        nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="im2col-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def make_single_slidingwindow_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0
+):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    SlidingWindow_node = helper.make_node(
+        "ConvolutionInputGenerator_rtl",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ConvKernelDim=[k_h, k_w],
+        IFMChannels=ifm_ch,
+        IFMDim=[ifm_dim_h, ifm_dim_w],
+        OFMDim=[ofm_dim_h, ofm_dim_w],
+        SIMD=simd,
+        M=m,
+        parallel_window=parallel_window,
+        Stride=[stride_h, stride_w],
+        Dilation=[dilation_h, dilation_w],
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+        depthwise=dw,
+    )
+    graph = helper.make_graph(
+        nodes=[SlidingWindow_node],
+        name="slidingwindow_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="slidingwindow-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["UINT4"]])
+# kernel size
+@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]])
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [6])
+# Stride
+@pytest.mark.parametrize("stride", [[1, 1], [2, 2]])
+# Dilation
+@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+# input channel parallelism ("SIMD")
+@pytest.mark.parametrize("simd", [1, 2, 3, 6])
+# parallel_window enable (MMV_out = M*K)
+@pytest.mark.parametrize("parallel_window", [0])
+# in/out MMV ("M")
+@pytest.mark.parametrize("m", [1])
+# Flip dimensions
+@pytest.mark.parametrize("flip", [False])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_slidingwindow_rtl(
+    idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip
+):
+    if flip:
+        if (
+            ifm_dim[0] == ifm_dim[1]
+            and k[0] == k[1]
+            and stride[0] == stride[1]
+            and dilation[0] == dilation[1]
+        ):
+            pytest.skip("Dimension flip would have no effect")
+        k = k[::-1]
+        ifm_dim = ifm_dim[::-1]
+        stride = stride[::-1]
+        dilation = dilation[::-1]
+
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+
+    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
+
+    if simd > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
+    if ifm_ch % simd != 0:
+        pytest.skip("SIMD must divide number of input channels")
+    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
+        k_w == 1 and (stride_w != 1 or dilation_w != 1)
+    ):
+        pytest.skip(
+            """Illegal convolution configuration:
+            stride or dilation defined for unitary kernel dim"""
+        )
+    if k_h == 1 and k_w == 1 and simd != ifm_ch:
+        pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)")
+    if parallel_window and simd != ifm_ch:
+        pytest.skip("Parallel window requires SIMD=C")
+
+    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+    ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
+    model = make_single_slidingwindow_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        simd=simd,
+        m=m,
+        parallel_window=parallel_window,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+        dw=dw,
+    )
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(PrepareRTLSim())
+
+    # prepare input data
+    input_dict = prepare_inputs(x)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    golden = make_single_im2col_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+    )
+    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
+    if dw == 0:
+        assert (y_produced == y_expected).all()
+    else:
+        y_expected = y_expected.reshape(
+            1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
+        )
+        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+        y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
+        assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..64da0a2368a69d6037c681d88391eef2844dae2c
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2022, Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import onnx.parser as oprs
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False):
+    np.random.seed(0)
+    out_dim = compute_conv_output_dim(in_dim, k, stride, 2 * pad_half)
+    ifm = 8
+    ofm = 16
+    if is_1d:
+        if flip_1d:
+            shape_in = [1, ifm, 1, in_dim]
+            shape_out = [1, ofm, 1, out_dim]
+            shape_k = [1, k]
+            shape_s = [1, stride]
+            shape_p = [0, pad_half, 0, pad_half]
+        else:
+            shape_in = [1, ifm, in_dim, 1]
+            shape_out = [1, ofm, out_dim, 1]
+            shape_k = [k, 1]
+            shape_s = [stride, 1]
+            shape_p = [pad_half, 0, pad_half, 0]
+    else:
+        shape_in = [1, ifm, in_dim, in_dim]
+        shape_out = [1, ofm, out_dim, out_dim]
+        shape_k = [k, k]
+        shape_s = [stride, stride]
+        shape_p = [pad_half, pad_half, pad_half, pad_half]
+    shape_w = [ofm, ifm] + shape_k
+
+    sstr_in = str(shape_in)
+    sstr_out = str(shape_out)
+    sstr_k = str(shape_k)
+    sstr_s = str(shape_s)
+    sstr_p = str(shape_p)
+    sstr_w = str(shape_w)
+
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{sstr_in} in0) => (float{sstr_out} out0)
+    <
+        float{sstr_w} param_w_conv0
+    >
+    {{
+        out0 = Conv<kernel_shape={sstr_k}, group=1, pads={sstr_p},
+                    strides={sstr_s}>(in0, param_w_conv0)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("in0", dt_in)
+    model.set_tensor_datatype("param_w_conv0", dt_w)
+    model.set_initializer("param_w_conv0", gen_finn_dt_tensor(dt_w, shape_w))
+    model = model.transform(InferShapes())
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(InferShapes())
+    return model
+
+
+@pytest.mark.parametrize("is_1d", [True, False])
+@pytest.mark.parametrize("flip_1d", [True, False])
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode):
+    if flip_1d and not is_1d:
+        pytest.skip("flip_1d only applicable for is_1d")
+    in_dim = 32
+    k = 1
+    stride = 2
+    dt_in = DataType["UINT8"]
+    dt_w = DataType["INT2"]
+    model = build_model(
+        is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d
+    )
+    inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0"))
+    idict = {"in0": inp}
+    y_expected = execute_onnx(model, idict)["out0"]
+    model = model.transform(to_hls.InferConvInpGen())
+    assert len(model.get_nodes_by_op_type("DownSampler")) == 1
+    if exec_mode == "cppsim":
+        model = model.transform(SetExecMode("cppsim"))
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+    y_produced = execute_onnx(model, idict)["out0"]
+    assert (y_produced == y_expected).all()
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("DownSampler")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        # small adjustment for 2D testcase due to how rtlsim works:
+        # output is finished before all pixels are read, since last
+        # row is dropped (rtlsim finishes based on # of expected
+        # pixels)
+        if not is_1d:
+            exp_cycles = exp_cycles - in_dim
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..6028a9b9f0fb4a04d0f53fd8c4fae3aac3ae686e
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2022, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import onnx.parser as oprs
+import qonnx.core.data_layout as dl
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+def build_model(shp, dt0, dt1, do_abs):
+    np.random.seed(0)
+    shp_str = str(shp)
+    if do_abs:
+        graph = """
+        sub_out = Sub(in0, in1)
+        out0 = Abs(sub_out)
+        """
+    else:
+        graph = "out0 = Sub(in0, in1)"
+
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0, float{shp_str} in1) => (float{shp_str} out0)
+    {{
+        {graph}
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("in0", dt0)
+    model.set_tensor_datatype("in1", dt1)
+    model.set_tensor_layout("in0", dl.NHWC)
+    model.set_tensor_layout("in1", dl.NHWC)
+    model = model.transform(InferShapes())
+    return model
+
+
+# input datatype for one operand
+@pytest.mark.parametrize("dt0", [DataType["UINT4"], DataType["UINT7"]])
+# channels
+@pytest.mark.parametrize("ch", [1, 64])
+# folding
+@pytest.mark.parametrize("fold", [-1, 2, 1])
+# include Abs output node or not
+@pytest.mark.parametrize("do_abs", [True, False])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode):
+    if fold == -1:
+        pe = 1
+    else:
+        pe = max(1, ch // fold)
+    assert ch % pe == 0
+    dt1 = DataType["UINT8"]
+    shp = [1, 4, 2, ch]
+    model = build_model(shp, dt0, dt1, do_abs)
+    in0 = gen_finn_dt_tensor(dt0, shp)
+    in1 = gen_finn_dt_tensor(dt1, shp)
+    idict = {"in0": in0, "in1": in1}
+    y_expected = execute_onnx(model, idict)["out0"]
+    model = model.transform(to_hls.InferStreamingEltwise())
+    assert len(model.graph.node) == 1
+    assert model.graph.node[0].op_type == "StreamingEltwise"
+    getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe)
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+    y_produced = execute_onnx(model, idict)["out0"]
+    assert (y_produced == y_expected).all(), exec_mode + " failed"
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("StreamingEltwise")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 2e2da0da7a217091d76d0a59a2a36a8e6a28af8e..34928ce45be0fd96d27b153ae28e2128bf306bb5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -53,12 +53,11 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt):
     pad_h = padding[0] + padding[2]
     pad_w = padding[1] + padding[3]
     idim_h, idim_w = idim
 
-    assert pad_style == 2, "only pad_style == 2 supported in hlslib"
     assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim"
     odim_h = idim_h + pad_h
     odim_w = idim_w + pad_w
@@ -80,7 +79,6 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
         Padding=padding,
         NumChannels=num_ch,
         inputDataType=str(idt.name),
-        PaddingStyle=pad_style,
         numInputVectors=1,
         SIMD=simd,
     )
@@ -101,13 +99,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 # input image dimension
 @pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
 # number of rows and number of cols to add
-@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]])
+@pytest.mark.parametrize(
+    "pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]]
+)
 # number of channels
 @pytest.mark.parametrize("num_ch", [2, 4])
 # Input parallelism
 @pytest.mark.parametrize("simd", [1, 2])
-# PaddingStyle: selects behavior when (odim-idim)%2 != 0
-@pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
 @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
 # execution mode
@@ -115,7 +113,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode):
     if num_ch % simd != 0:
         pytest.skip(" num_ch % simd != 0, skipping")
 
@@ -123,19 +121,13 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     pad_h = pad[0] + pad[2]
     pad_w = pad[1] + pad[3]
 
-    if idim_h == idim_w and pad_h != pad_w:
-        pytest.skip(
-            """Only equal padding along the dimensions for square images
-            is supported, skipping"""
-        )
-
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch])
     input_dict = {"inp": x}
     odim_h = idim_h + pad_h
     odim_w = idim_w + pad_w
 
-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
@@ -150,26 +142,8 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     expected_oshape = (1, odim_h, odim_w, num_ch)
     assert y_produced.shape == expected_oshape
 
-    # calculate reference
-    # calculate correct pad according to parameters
-    if pad_style == 2:
-        if pad_h % 2 == 0:
-            pad_up = pad_h // 2
-        else:
-            pad_up = pad_h // 2 + 1
-        if pad_w % 2 == 0:
-            pad_left = pad_w // 2
-        else:
-            pad_left = pad_w // 2 + 1
-    else:
-        pad_up = pad_h // 2
-        pad_left = pad_w // 2
-
-    pad_down = pad_h - pad_up
-    pad_right = pad_w - pad_left
-
     y_expected = np.pad(
-        x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
+        x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant"
     )
 
     assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index d1895a12675dce69070d280381a9982060e20c21..a7e7eba7ee8de81ec5eebe3e270e8e1d28564a00 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -42,6 +42,7 @@ import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
@@ -417,3 +418,67 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
     exp_cycles = exp_cycles_dict[node.name]
     assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
     assert exp_cycles != 0
+
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [8])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [8])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [32])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [32])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+
+    # no activation, produce accumulators
+    T = None
+    tdt = None
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        odt = DataType["UINT32"]
+    else:
+        odt = DataType["INT32"]
+
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+    total_fold = nf * sf
+    exp_total_cycles = total_fold + 10
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    model = model.transform(DeriveCharacteristic(exp_total_cycles))
+    node_inst = getCustomOp(model.graph.node[0])
+    period_attr = node_inst.get_nodeattr("io_chrc_period")
+    assert period_attr == exp_total_cycles
+    chrc_in = node_inst.get_nodeattr("io_chrc_in")
+    chrc_out = node_inst.get_nodeattr("io_chrc_out")
+    assert chrc_in.shape == (1, 2 * exp_total_cycles)
+    assert chrc_out.shape == (1, 2 * exp_total_cycles)
+    # first sf cycles should read input continuously
+    assert (chrc_in[0, :sf] == range(1, sf + 1)).all()
+    # all outputs should be produced within the exp n of cycles
+    assert chrc_out[0, exp_total_cycles] == nf
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index f854c997ff2b9d355933e4e2636c0280a47ea762..c54284dee9a99ed1625e7ac803958ec4ca7b4edc 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -33,12 +33,14 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
-from qonnx.custom_op.registry import getCustomOp
+
+# from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.util.basic import gen_finn_dt_tensor
 
 import finn.core.onnx_exec as oxe
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+
+# from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
@@ -75,7 +77,20 @@ def _calculate_dot_prod_range(dt_a, dt_b, len):
 
 
 def _make_single_vvau_modelwrapper(
-    W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None
+    W,
+    pe,
+    simd,
+    k_h,
+    k_w,
+    channels,
+    dim_h,
+    dim_w,
+    wdt,
+    idt,
+    odt,
+    T=None,
+    tdt=None,
+    mem_mode="const",
 ):
     in_shape = [1, dim_h, dim_w, k_h * k_w * channels]  # [N, H, W, K*K*CH]
     out_shape = [
@@ -114,6 +129,7 @@ def _make_single_vvau_modelwrapper(
         weightDataType=wdt.name,
         outputDataType=odt.name,
         noActivation=no_act,
+        mem_mode=mem_mode,
     )
 
     graph = helper.make_graph(
@@ -141,7 +157,7 @@ def prepare_inputs(input_tensor):
     return {"inp": input_tensor}
 
 
-# mem_mode: const or decoupled
+# input datatype
 @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
 # weight datatype
 @pytest.mark.parametrize("wdt", [DataType["INT4"]])
@@ -159,13 +175,15 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("k_w", [3, 1])
 # Number of input and output channels
 @pytest.mark.parametrize("channels", [3, 4])
+# memory mode
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_vvau(
-    idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, exec_mode
+    idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
 ):
     if pe == "channels":
         pe = channels
@@ -201,7 +219,7 @@ def test_fpgadataflow_vvau(
         tdt = DataType["INT32"]
 
     model = _make_single_vvau_modelwrapper(
-        W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt
+        W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
     )
 
     if exec_mode == "cppsim":
@@ -242,11 +260,11 @@ def test_fpgadataflow_vvau(
 
     assert (y_produced == y_expected).all(), "incorrect result"
 
-    if exec_mode == "rtlsim":
-        node = model.get_nodes_by_op_type("VectorVectorActivation")[0]
-        inst = getCustomOp(node)
-        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
-        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
-        exp_cycles = exp_cycles_dict[node.name]
-        # assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
-        # assert exp_cycles != 0
+    # if exec_mode == "rtlsim":
+    # node = model.get_nodes_by_op_type("VectorVectorActivation")[0]
+    # inst = getCustomOp(node)
+    # cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+    # exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+    # exp_cycles = exp_cycles_dict[node.name]
+    # assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+    # assert exp_cycles != 0
diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py b/tests/transformation/streamline/test_scale_resize_nhwc.py
new file mode 100644
index 0000000000000000000000000000000000000000..f10930f4e7d5aeb98a60630e7e4f48adfc371d59
--- /dev/null
+++ b/tests/transformation/streamline/test_scale_resize_nhwc.py
@@ -0,0 +1,293 @@
+import pytest
+
+import numpy as np
+import onnx
+import onnx.helper as oh
+import qonnx.core.data_layout as DataLayout
+from onnx import TensorProto
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.streamline.reorder import MakeScaleResizeNHWC
+
+
+def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp_up = oh.make_tensor_value_info(
+        "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["inp", "roi", "scales"],
+        outputs=["outp_up"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    transpose_node = onnx.helper.make_node(
+        "Transpose",
+        inputs=["outp_up"],
+        outputs=["outp"],
+        name="Transpose1",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[resize_node, transpose_node],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_up, param, roi],
+    )
+
+    model = oh.make_model(graph, producer_name="resize_model1")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+
+    model.set_tensor_layout("inp", DataLayout.NCHW)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt):
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp_tr = oh.make_tensor_value_info(
+        "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    transpose_node = onnx.helper.make_node(
+        "Transpose",
+        inputs=["inp"],
+        outputs=["outp_tr"],
+        name="Transpose1",
+        perm=[0, 3, 1, 2],
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["outp_tr", "roi", "scales"],
+        outputs=["outp"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    graph = oh.make_graph(
+        nodes=[transpose_node, resize_node],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_tr, param, roi],
+    )
+
+    model = oh.make_model(graph, producer_name="resize_model2")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+    model.set_tensor_layout("inp", DataLayout.NHWC)
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, scales)
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp_tr = oh.make_tensor_value_info(
+        "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    outp_up = oh.make_tensor_value_info(
+        "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    transpose_node1 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["inp"],
+        outputs=["outp_tr"],
+        name="Transpose1",
+        perm=[0, 3, 1, 2],
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["outp_tr", "roi", "scales"],
+        outputs=["outp_up"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    transpose_node2 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["outp_up"],
+        outputs=["outp"],
+        name="Transpose2",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[transpose_node1, resize_node, transpose_node2],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_up, outp_tr, param, roi],
+    )
+
+    model = oh.make_model(graph, producer_name="resize_model3")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+    model.set_tensor_layout("inp", DataLayout.NHWC)
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def check_transform(model):
+    graph = model.graph
+    node_ind = 0
+    for n in graph.node:
+        node_ind += 1
+        if n.op_type == "Upsample" or n.op_type == "Resize":
+            if model.get_tensor_layout(n.output[0]) == DataLayout.NHWC:
+                return True
+    return False
+
+
+@pytest.mark.streamline
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[2**i, 2**i] for i in range(3, 6)])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [3])
+# scales
+@pytest.mark.parametrize(
+    "scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)]
+)
+# mode
+@pytest.mark.parametrize("mode", ["nearest"])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt):
+    # create models
+    resize_model1 = create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt)
+    resize_model2 = create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt)
+    resize_model3 = create_transpose_resize_transpose(
+        ifm_dim, ifm_ch, scales, mode, idt
+    )
+
+    # set initializers
+    resize_model1.set_initializer("scales", np.array(scales, dtype=np.float32))
+    resize_model2.set_initializer("scales", np.array(scales, dtype=np.float32))
+    resize_model3.set_initializer("scales", np.array(scales, dtype=np.float32))
+
+    # generate input tensor for testing
+    input_tensor_nchw = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]])
+    input_tensor_nhwc = gen_finn_dt_tensor(idt, [1, ifm_dim[0], ifm_dim[1], ifm_ch])
+    input_dict_nchw = {"inp": input_tensor_nchw}
+    input_dict_nhwc = {"inp": input_tensor_nhwc}
+
+    # execute first model
+    output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw)
+    expected1 = output_dict1["outp"]
+
+    # transform Resize into ResizeNHWC
+    resize_model1 = resize_model1.transform(MakeScaleResizeNHWC())
+    resize_model1 = resize_model1.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name1 = resize_model1.graph.output[0].name
+    output_dict1 = oxe.execute_onnx(
+        resize_model1, input_dict_nchw, return_full_exec_context=False
+    )
+    output1 = output_dict1[output_node_name1]
+
+    # compare outputs
+    assert (expected1 == output1).all()
+    assert check_transform(resize_model1)
+
+    # execute second model
+    output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc)
+    expected2 = output_dict2["outp"]
+
+    # transform Resize into ResizeNHWC
+    resize_model2 = resize_model2.transform(MakeScaleResizeNHWC())
+    resize_model2 = resize_model2.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name2 = resize_model2.graph.output[0].name
+    output_dict2 = oxe.execute_onnx(
+        resize_model2, input_dict_nhwc, return_full_exec_context=False
+    )
+    output2 = output_dict2[output_node_name2]
+
+    # compare outputs
+    assert (expected2 == output2).all()
+    assert check_transform(resize_model2)
+
+    # execute third model
+    output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc)
+    expected3 = output_dict3["outp"]
+
+    # transform Resize into ResizeNHWC
+    resize_model3 = resize_model3.transform(MakeScaleResizeNHWC())
+    resize_model3 = resize_model3.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name3 = resize_model3.graph.output[0].name
+    output_dict3 = oxe.execute_onnx(
+        resize_model3, input_dict_nhwc, return_full_exec_context=False
+    )
+    output3 = output_dict3[output_node_name3]
+
+    # compare outputs
+    assert (expected3 == output3).all()
+    assert check_transform(resize_model3)
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
index cdf69aebddc4d6af2288774acbff5dd8a52512b3..39f0b0dc89e9388c54a013becb53d9afbfb2ce4e 100644
--- a/tests/util/test_build_dataflow.py
+++ b/tests/util/test_build_dataflow.py
@@ -30,6 +30,7 @@ import pkg_resources as pk
 
 import pytest
 
+import numpy as np
 import os
 from shutil import copytree
 
@@ -55,7 +56,6 @@ def test_end2end_build_dataflow_directory():
     assert os.path.isfile(output_dir + "/driver/driver.py")
     assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")
     assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json")
-    assert os.path.isfile(output_dir + "/report/verify_rtlsim.vcd")
     assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd")
     assert os.path.isfile(
         output_dir + "/report/estimate_layer_config_alternatives.json"
@@ -68,8 +68,19 @@ def test_end2end_build_dataflow_directory():
     assert os.path.isfile(output_dir + "/report/post_synth_resources.xml")
     assert os.path.isfile(output_dir + "/report/post_route_timing.rpt")
     # verification outputs
-    verify_out_dir = output_dir + "/verification_output"
-    assert os.path.isfile(verify_out_dir + "/verify_initial_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_streamlined_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_folded_hls_cppsim_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_stitched_ip_rtlsim_SUCCESS.npy")
+    verif_batchsize = np.load(target_dir + "/input.npy").shape[0]
+    for i in range(verif_batchsize):
+        verify_out_dir = output_dir + "/verification_output"
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd")