diff --git a/finn-rtllib/swg/swg_hdl_template.v b/finn-rtllib/swg/swg_hdl_template.v
index 98ea1cf9cdef64635f35a8d2473fc0943151be89..89ebb8da518ef7e01fa6cf3c04d372a7f51f217d 100755
--- a/finn-rtllib/swg/swg_hdl_template.v
+++ b/finn-rtllib/swg/swg_hdl_template.v
@@ -223,7 +223,7 @@ assign shift_out = out_reg;
 
 integer addr_w, addr_r; //todo: minimize width (as reg), make r addr depend on w
 
-(* ram_style = "block" *) reg [WIDTH-1:0] ram [DEPTH-1:0];
+$RAM_STYLE$ reg [WIDTH-1:0] ram [DEPTH-1:0];
 
 always @(posedge CLK) begin 
     if (RST == 1'b0) begin
@@ -291,7 +291,7 @@ end
 
 endmodule //window_buffer
 
-module $TOP_MODULE_NAME$ (
+module $TOP_MODULE_NAME$_impl (
         ap_clk,
         ap_rst_n,
         in0_V_V_TDATA,
@@ -315,11 +315,9 @@ parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$;
 //IO ports
 input   ap_clk;
 input   ap_rst_n;
-(* X_INTERFACE_PARAMETER = "FREQ_HZ 250000000.000000" *)
 input  [BUF_IN_WIDTH-1:0] in0_V_V_TDATA;
 input   in0_V_V_TVALID;
 output   in0_V_V_TREADY;
-(* X_INTERFACE_PARAMETER = "FREQ_HZ 250000000.000000" *)
 output  [BUF_OUT_WIDTH-1:0] out_V_V_TDATA;
 output   out_V_V_TVALID;
 input   out_V_V_TREADY;
@@ -404,4 +402,4 @@ always @ (posedge ap_clk) begin
     end
 end
 
-endmodule //ConvolutionInputGenerator1D_0_ConvolutionInputGenerator1D_0
+endmodule //TOP_MODULE_NAME_impl
diff --git a/finn-rtllib/swg/swg_hdl_template_mmv_1.v b/finn-rtllib/swg/swg_hdl_template_mmv_1.v
new file mode 100644
index 0000000000000000000000000000000000000000..670598d9a094ad686116d92f4d04fb16c6619d93
--- /dev/null
+++ b/finn-rtllib/swg/swg_hdl_template_mmv_1.v
@@ -0,0 +1,399 @@
+`timescale 1 ns / 1 ps
+
+module $TOP_MODULE_NAME$_controller
+(
+    CLK,
+    RST,
+    advance,
+    addr_incr,
+    tail_incr
+);
+
+input CLK;
+input RST;
+input advance;
+output [31:0] addr_incr; //todo: minimize width
+output [31:0] tail_incr; //todo: minimize width
+
+////code generation part:
+localparam LOOP_H_ITERATIONS = $LOOP_H_ITERATIONS$;
+localparam LOOP_W_ITERATIONS = $LOOP_W_ITERATIONS$;
+localparam LOOP_KH_ITERATIONS = $LOOP_KH_ITERATIONS$;
+localparam LOOP_KW_ITERATIONS = $LOOP_KW_ITERATIONS$;
+localparam LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$;
+localparam [31:0] ADDR_INCREMENT_MAP [0:5] = $ADDR_INCREMENT_MAP$; //todo: minimize width
+////
+
+//state and counters
+reg [2:0] state, state_next;
+parameter STATE_START = 0, STATE_LOOP_SIMD = 1, STATE_LOOP_KW = 2, STATE_LOOP_KH = 3, STATE_LOOP_W = 4, STATE_LOOP_H = 5;
+integer counter_loop_h; //todo: minimize width
+integer counter_loop_w;
+integer counter_loop_kh;
+integer counter_loop_kw;
+integer counter_loop_simd;
+
+assign addr_incr = ADDR_INCREMENT_MAP[state];
+
+//combinational logic for tail_incr generation
+$TAIL_INCR_GENERATION$
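+//note (illustrative): the generated block above drives tail_incr, the amount by
+//which the _impl module advances first_elem_next_window at the first fetch of
+//each window (k == 0 there)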
+
+//combinational next state logic
+always @ (state, counter_loop_simd, counter_loop_kw, counter_loop_kh, counter_loop_w, counter_loop_h) begin
+    state_next = state; //default
+    if (state == $INNERMOST_STATE$) begin
+        if (counter_loop_simd == 0)
+            if (counter_loop_kw != 0)
+                state_next = STATE_LOOP_KW;
+            else
+                if(counter_loop_kh != 0)
+                    state_next = STATE_LOOP_KH;
+                else
+                    if(counter_loop_w != 0)
+                        state_next = STATE_LOOP_W;
+                    else
+                        if(counter_loop_h != 0)
+                            state_next = STATE_LOOP_H;
+                        else
+                            state_next = STATE_START;
+    end else
+        state_next = $INNERMOST_STATE$;
+end
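+
+//illustrative note (not generated logic): the loops nest as SIMD -> KW -> KH ->
+//W -> H, innermost first. E.g., assuming a 3x3 kernel with SIMD=C
+//(LOOP_SIMD_ITERATIONS=0), the logic above jumps from the innermost state to the
+//innermost loop state whose counter has not yet expired, and returns to
+//$INNERMOST_STATE$ on the following transition.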
+
+//sequential logic
+always @ (posedge CLK) begin
+    if (RST == 1'b0) begin
+        counter_loop_h <= LOOP_H_ITERATIONS;
+        counter_loop_w <= LOOP_W_ITERATIONS;
+        counter_loop_kh <= LOOP_KH_ITERATIONS;
+        counter_loop_kw <= LOOP_KW_ITERATIONS;
+        counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+        state <= $INNERMOST_STATE$; //STATE_START; //debug: omit start state to fix timing, maybe omit during FM transition as well
+    end else begin
+        if (advance) begin
+            state <= state_next;
+
+            if (state == $INNERMOST_STATE$) begin
+                if (counter_loop_simd == 0) begin
+                    counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+                    if (counter_loop_kw == 0) begin
+                        counter_loop_kw <= LOOP_KW_ITERATIONS;
+                        if (counter_loop_kh == 0) begin
+                            counter_loop_kh <= LOOP_KH_ITERATIONS;
+                            if (counter_loop_w == 0) begin
+                                counter_loop_w <= LOOP_W_ITERATIONS;
+                                if (counter_loop_h == 0) begin
+                                    counter_loop_h <= LOOP_H_ITERATIONS;
+                                end else
+                                    counter_loop_h <= counter_loop_h-1;
+                            end else
+                                counter_loop_w <= counter_loop_w-1;
+                        end else
+                            counter_loop_kh <= counter_loop_kh-1;
+                    end else
+                        counter_loop_kw <= counter_loop_kw-1;
+                end else
+                    counter_loop_simd <= counter_loop_simd-1;
+            end
+        end
+    end
+end
+endmodule //controller
+
+module $TOP_MODULE_NAME$_cyclic_buffer_addressable
+#(
+    parameter WIDTH = 1,
+    parameter DEPTH = 1
+)
+(
+    CLK,
+    RST,
+    read_addr,
+    read_enable,
+    write_enable,
+    data_in,
+    data_out
+);
+
+input CLK, RST, read_enable, write_enable;
+input [$clog2(DEPTH)-1:0] read_addr; // absolute (!) read address of cyclic buffer
+input [WIDTH-1:0] data_in;
+output [WIDTH-1:0] data_out;
+
+integer addr_w; //todo: minimize width (as reg)
+
+$RAM_STYLE$ reg [WIDTH-1:0] ram [DEPTH-1:0];
+
+reg [WIDTH-1:0] out_reg;
+assign data_out = out_reg;
+
+always @(posedge CLK) begin 
+    if (RST == 1'b0) begin
+        addr_w <= 0;
+    end else begin
+        if (read_enable)
+            out_reg <= ram[read_addr];
+
+        if (write_enable) begin
+            ram[addr_w] <= data_in;
+            
+            if (addr_w == DEPTH-1)
+                addr_w <= 0;
+            else
+                addr_w <= addr_w + 1;
+        end
+    end
+end
+endmodule //cyclic_buffer_addressable
+
+module $TOP_MODULE_NAME$_impl (
+        ap_clk,
+        ap_rst_n,
+        in0_V_V_TDATA,
+        in0_V_V_TVALID,
+        in0_V_V_TREADY,
+        out_V_V_TDATA,
+        out_V_V_TVALID,
+        out_V_V_TREADY
+);
+
+parameter BIT_WIDTH = $BIT_WIDTH$;
+parameter SIMD = $SIMD$;
+parameter MMV_IN = $MMV_IN$;
+parameter MMV_OUT = $MMV_OUT$;
+parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+parameter BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
+parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+parameter LAST_READ_ELEM = $LAST_READ_ELEM$;
+parameter LAST_WRITE_ELEM = $LAST_WRITE_ELEM$;
+parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$;
+parameter ELEM_PER_WINDOW = $ELEM_PER_WINDOW$;
+
+//IO ports
+input   ap_clk;
+input   ap_rst_n;
+input  [BUF_IN_WIDTH-1:0] in0_V_V_TDATA;
+input   in0_V_V_TVALID;
+output   in0_V_V_TREADY;
+output  [BUF_OUT_WIDTH-1:0] out_V_V_TDATA;
+output   out_V_V_TVALID;
+input   out_V_V_TREADY;
+
+//main buffer instantiation
+wire [BUF_IN_WIDTH-1:0] window_buffer_in;
+wire [BUF_OUT_WIDTH-1:0] window_buffer_out;
+wire window_buffer_write_enable;
+wire window_buffer_read_enable;
+wire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr;
+$TOP_MODULE_NAME$_cyclic_buffer_addressable
+#(
+    .WIDTH(BUF_IN_WIDTH),
+    .DEPTH(BUF_ELEM_TOTAL)
+)
+window_buffer_inst
+(
+    .CLK(ap_clk),
+    .RST(ap_rst_n),
+    .read_addr(window_buffer_read_addr),
+    .read_enable(window_buffer_read_enable),
+    .write_enable(window_buffer_write_enable),
+    .data_in(window_buffer_in),
+    .data_out(window_buffer_out)
+);
+
+//counters to keep track when to read/write
+integer newest_buffered_elem; //todo: minimize width
+integer newest_buffered_elem_available; //todo: minimize width
+integer current_elem;
+integer current_elem_available;
+integer first_elem_next_window;
+integer k;
+
+reg [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr_reg;
+assign window_buffer_read_addr = window_buffer_read_addr_reg;
+
+//reg write_done; //keep track if W of current cycle was already completed, but we still wait on a R in the same cycle
+
+wire advance_controller;
+wire [31:0] addr_incr;
+wire [31:0] tail_incr;
+
+$TOP_MODULE_NAME$_controller
+controller_inst
+(
+    .CLK(ap_clk),
+    .RST(ap_rst_n),
+    .advance(advance_controller),
+    .addr_incr(addr_incr),
+    .tail_incr(tail_incr)
+);
+
+wire reading_done;
+assign reading_done = newest_buffered_elem == LAST_READ_ELEM;
+
+reg fetching_done;
+reg writing_done; //instead of a separate write cycle/element counter, trigger this flag once current_element reaches LAST_WRITE_ELEM
+//assign writing_done = current_elem == LAST_WRITE_ELEM;
+
+
+wire write_blocked;
+
+//reg write_prefetch_available; // stores if the write of prefetched data is still outstanding
+
+wire fetch_cmd;
+assign fetch_cmd = !(current_elem > newest_buffered_elem) && !write_blocked && !fetching_done;
+    
+    
+//determine whether to read/write in this cycle
+//wire write_cmd;
+//assign write_cmd = write_prefetch_available && !writing_done;
+reg write_cmd;                 
+
+
+
+wire read_cmd;
+assign read_cmd = 
+    (
+      (  
+          (newest_buffered_elem - BUF_ELEM_TOTAL+1) < first_elem_next_window
+        &&(newest_buffered_elem - BUF_ELEM_TOTAL+1) < current_elem
+      )  // (over-)write to buffer if oldest buffered element is no longer needed  
+      || fetching_done
+    )                                                      //or if fetching is done (e.g. for skipped rows at FM end due to stride)
+    && !reading_done;                                                    //and if there is still an input element left to read
+
+//todo: optimize (e.g. is < or != more efficient?)
+// ToDo: ideally this should point to the oldest elem of the next window,
+// to allow reading while still writing the remainder of the current window                 
+
+
+
+assign write_blocked = write_cmd && !out_V_V_TREADY; //&& !write_done;
+
+wire read_ok;
+// with transition to next cycle:
+//              want to read      can read       source is ready (waiting on VALID allowed)
+assign read_ok = read_cmd && !write_blocked && in0_V_V_TVALID;
+
+wire write_ok;
+// with transition to next cycle:
+//              output is VALID   sink is ready  sink has already read (we are waiting on source)
+//assign write_ok = write_cmd && (out_V_V_TREADY || write_done);
+assign write_ok = write_cmd && out_V_V_TREADY;
+
+//wire advance;
+//            includes waiting on W    if W-only cycle: wait only on W     no R/W to wait for
+//assign advance =      read_ok        ||   (!read_cmd && write_ok)    || (!read_cmd && !write_cmd);
+//todo: optimize/simplify advance logic for write_done generation
+
+//assign buffer control
+assign window_buffer_write_enable = read_ok;
+assign window_buffer_read_enable = fetch_cmd;
+assign advance_controller = fetch_cmd; //write_ok
+
+//assign I/O ports
+assign window_buffer_in = in0_V_V_TDATA;
+assign out_V_V_TDATA = window_buffer_out;
+assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+assign out_V_V_TVALID = ap_rst_n && write_cmd; //&& !write_done; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+//main process for advancing counters
+always @ (posedge ap_clk) begin
+    if (ap_rst_n == 1'b0) begin
+        newest_buffered_elem <= -1;
+        //newest_buffered_elem_available <= -1;
+        current_elem <= 0;
+        //current_elem_available <= 0;
+        first_elem_next_window <= 0;
+        k <= 0;
+        window_buffer_read_addr_reg <= 0;
+        fetching_done <= 0;
+        writing_done <= 0;
+        //write_prefetch_available <= 0;
+        write_cmd <= 0;
+    end else begin
+        if (read_ok) begin
+            //check if this is the last read cycle (reading_done will be true afterwards)
+            if ((newest_buffered_elem == LAST_READ_ELEM-1) && writing_done) begin
+                //start processing of next FM if writing is done already (possible due to unused input elements at the tail end)
+                //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM)
+                newest_buffered_elem <= -1;
+                current_elem <= 0;
+                first_elem_next_window <= 0;
+                writing_done <= 0;
+                fetching_done <= 0;
+            end
+            
+            newest_buffered_elem <= newest_buffered_elem+1;
+        end
+                 
+        if (fetch_cmd) begin
+            //count up to track which element index is about to be read from the buffer, and where it is located within the buffer
+            //use increment value calculated by controller
+
+            //keep track where we are within a window
+            if (k == ELEM_PER_WINDOW-1)
+                k <= 0;
+            else
+                k <= k+1;
+
+            //absolute buffer address always wraps around (in both directions for depthwise support)
+            if ($signed(window_buffer_read_addr_reg + addr_incr) > BUF_ELEM_TOTAL-1)
+                window_buffer_read_addr_reg <= window_buffer_read_addr_reg + addr_incr - BUF_ELEM_TOTAL;
+            else if ($signed(window_buffer_read_addr_reg + addr_incr) < 0)
+                window_buffer_read_addr_reg <= window_buffer_read_addr_reg + addr_incr + BUF_ELEM_TOTAL;
+            else
+                window_buffer_read_addr_reg <= window_buffer_read_addr_reg + addr_incr;
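+            //illustrative example (assumed values): with BUF_ELEM_TOTAL=13, an
+            //address of 12 plus increment +3 wraps forward to 2, while an address
+            //of 1 plus increment -11 wraps backward to 3; this is what permits
+            //the negative window/row increments computed by the generator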
+
+            //check if this is the last write cycle (writing_done will be true afterwards)
+            if (current_elem == LAST_WRITE_ELEM) begin
+                fetching_done <= 1;
+            end else begin
+                //current element index wraps around only at window boundary
+                //if (((current_elem + addr_incr) > BUF_ELEM_TOTAL-1) && (k == ELEM_PER_WINDOW-1))
+                
+                //if (k == ELEM_PER_WINDOW-1)
+                //    current_elem <= current_elem + addr_incr - BUF_ELEM_TOTAL;
+                //else
+                    current_elem <= current_elem + addr_incr;
+            end
+
+            if (k == 0)
+                first_elem_next_window <= first_elem_next_window + tail_incr;
+
+            // determine if prefetched data will be outstanding in the next cycle
+            // if we fetch in this cycle -> yes
+            // if we do not fetch nor write successfully -> do not change
+            // if we do not fetch but write -> clear outstanding data
+            //write_prefetch_available <= fetch_cmd;
+            write_cmd <= fetch_cmd;
+        end       
+
+        if (write_ok)
+            // determine if prefetched data will be outstanding in the next cycle
+            // if we fetch in this cycle -> yes
+            // if we do not fetch nor write successfully -> do not change
+            // if we do not fetch but write -> clear outstanding data
+            //write_prefetch_available <= fetch_cmd;
+            write_cmd <= fetch_cmd;
+
+        if (write_ok && fetching_done) begin
+            //check if this is the last write cycle (writing_done will be true afterwards)
+            if (reading_done || (read_ok && (newest_buffered_elem == LAST_READ_ELEM-1))) begin
+                //start processing of next FM if reading is done already, or completes in the same cycle
+                newest_buffered_elem <= -1;
+                current_elem <= 0;
+                first_elem_next_window <= 0;
+                fetching_done <= 0;
+            end else
+                writing_done <= 1;
+        end
+
+        //if (advance)
+        //    write_done <= 1'b0; //reset flag
+        //else if (write_ok) // successful W in this cycle, but R still outstanding
+        //    write_done <= 1'b1; //write can happen even if read is blocked, but only for the current cycle!
+    end
+end
+
+endmodule //TOP_MODULE_NAME_impl
diff --git a/finn-rtllib/swg/swg_hdl_template_wrapper.v b/finn-rtllib/swg/swg_hdl_template_wrapper.v
new file mode 100644
index 0000000000000000000000000000000000000000..db0556d940553d0c24fb94276f9a574c94880294
--- /dev/null
+++ b/finn-rtllib/swg/swg_hdl_template_wrapper.v
@@ -0,0 +1,46 @@
+`timescale 1 ns / 1 ps
+
+module $TOP_MODULE_NAME$ (
+        ap_clk,
+        ap_rst_n,
+        in0_V_V_TDATA,
+        in0_V_V_TVALID,
+        in0_V_V_TREADY,
+        out_V_V_TDATA,
+        out_V_V_TVALID,
+        out_V_V_TREADY
+);
+
+parameter BIT_WIDTH = $BIT_WIDTH$;
+parameter SIMD = $SIMD$;
+parameter MMV_IN = $MMV_IN$;
+parameter MMV_OUT = $MMV_OUT$;
+parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+input  ap_clk;
+input  ap_rst_n;
+(* X_INTERFACE_PARAMETER = "FREQ_HZ 100000000.000000" *) //todo: make configurable or set later
+input  [BUF_IN_WIDTH-1:0] in0_V_V_TDATA;
+input  in0_V_V_TVALID;
+output in0_V_V_TREADY;
+(* X_INTERFACE_PARAMETER = "FREQ_HZ 100000000.000000" *)
+output [BUF_OUT_WIDTH-1:0] out_V_V_TDATA;
+output out_V_V_TVALID;
+input  out_V_V_TREADY;
+
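+// note (illustrative): all $...$ placeholders are substituted at code generation
+// time, so the _impl core below is instantiated with an empty parameter override
+// list and receives the same substituted parameter defaults as this wrapper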
+$TOP_MODULE_NAME$_impl
+#()
+impl
+(
+    .ap_clk(ap_clk),
+    .ap_rst_n(ap_rst_n),
+    .in0_V_V_TDATA(in0_V_V_TDATA),
+    .in0_V_V_TVALID(in0_V_V_TVALID),
+    .in0_V_V_TREADY(in0_V_V_TREADY),
+    .out_V_V_TDATA(out_V_V_TDATA),
+    .out_V_V_TVALID(out_V_V_TVALID),
+    .out_V_V_TREADY(out_V_V_TREADY)
+);
+
+endmodule //TOP_MODULE_NAME
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
index cfd6572a8dd6e5b047c79e9d959f62a39663e166..4b31b7c97316b8eecc150e85e6504b21d3b1593e 100755
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
+from math import copysign
 import numpy as np
 import os
 
@@ -46,15 +47,6 @@ try:
 except ModuleNotFoundError:
     PyVerilator = None
 
-# This operation should only be used for 1D convolutions. Either the
-# IFMDim_H or IFMDim_W should be '1', which represents the so-called
-# dummy-dimension
-
-# ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D:
-# input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels)
-# output 0 is the output tensor, shape NHWC:
-#     = (1, OFMDim_H, OFMDim_W, (ConvKernelDim_H*ConvKernelDim_W)*IFMChannels)
-
 # note: the actual data layout produced by the hlslib kernels is different
 # for depthwise and non-depthwise ops.
 # * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD)
@@ -62,12 +54,9 @@ except ModuleNotFoundError:
 # see test_fpgadataflow_slidingwindow.py for an example of how to transform
 # between the two layouts
 
-
 class ConvolutionInputGenerator_rtl(HLSCustomOp):
-    """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator
-    (sliding window) function variants. Depending on the combination of
-    attributes (e.g. depthwise or not, whether dilation is 0) a different
-    variant will be picked for the actual HLS implementation."""
+    """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants! ... """
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
@@ -80,6 +69,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
             "SIMD": ("i", True, 0),
             "M": ("i", False, 1),
+            "parallel_window": ("i", False, 0, {0, 1}),
             "Stride": ("ints", True, []),  # [H, W] = [Y, X]
             "Dilation": ("ints", True, []),  # [H, W] = [Y, X]
             # FINN DataTypes for inputs, weights, outputs
@@ -87,14 +77,14 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             "outputDataType": ("s", True, ""),
             "depthwise": ("i", False, 0, {0, 1}),
             # FPGA resource type for ConvolutionInputGenerator input buffer
-            # auto -- let Vivado HLS decide
+            # auto -- let Vivado decide
             # block -- use BRAM
             # distributed -- use LUTRAM
             # ultra -- use URAM
             "ram_style": (
                 "s",
                 False,
-                "distributed",
+                "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
             "gen_top_module": ("s", False, ""),
@@ -147,7 +137,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
         ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        if self.use_parallel_window_output():
+        if (self.get_nodeattr("parallel_window")):
             wf = int((ifm_ch) // simd)
             #folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
             if ofm_dim_w == 1:
@@ -193,7 +183,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         return in_width
 
     def get_outstream_width(self):
-        if self.use_parallel_window_output():
+        if (self.get_nodeattr("parallel_window")):
             # feed all window pixels in parallel
             k_h, k_w = self.get_nodeattr("ConvKernelDim")
             return self.get_instream_width() * k_h * k_w
@@ -201,6 +191,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             # if parallel variant not in use: same width for output and input stream
             return self.get_instream_width()
 
+    def get_number_input_values(self):
+        folded_ishape = self.get_folded_input_shape()
+        num_input_elems = np.prod(folded_ishape[:-1])
+        return num_input_elems
+
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
         num_output_elems = np.prod(folded_oshape[:-1])
@@ -235,20 +230,6 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
 
         return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
 
-    def use_parallel_window_output(self):
-        # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
-        # feed window in parallel to the following layer, enabling full SIMD unfolding.
-        dilation = self.get_nodeattr("Dilation")
-        dilation_h, dilation_w = dilation
-
-        #todo: make this configurable via mmv_out instead of an automatic selection
-
-        if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
-            if self.get_nodeattr("depthwise") == 0:
-                    return True
-
-        return False
-
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
         (
@@ -268,7 +249,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        if self.use_parallel_window_output():
+        if (self.get_nodeattr("parallel_window")):
             exp_cycles = ifm_dim_w + 1
         else:
             cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
@@ -467,8 +448,6 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
     def generate_hdl(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         f_debug = open(os.path.join(code_gen_dir, "swg_hdl_debuginfo.log"), "w")
-        #debug:
-        #f_debug = open(os.path.join("/workspace/finn/finn-rtllib/swg/", "swg_hdl_debuginfo.log"), "w")
         code_gen_dict = {}
 
         #--------------------
@@ -480,26 +459,38 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         ofm_dim = self.get_nodeattr("OFMDim")
         stride = self.get_nodeattr("Stride")
         dilation = self.get_nodeattr("Dilation")
+        depthwise = self.get_nodeattr("depthwise")
 
         n = 1
         h, w = ifm_dim
         c = 1 # ifm_ch not considered atm (always parallelize across c)
         k_h, k_w = k
-        pad = [0,0,0,0] # padding happens in separate padding node
+        pad = [0,0,0,0] # padding happens in separate padding node for now
         pad_val = 0
         stride_h, stride_w = stride
         dilation_h, dilation_w = dilation
         conv_c = 99
 
         # init folding config
-        M = self.get_nodeattr("M")
         simd = self.get_nodeattr("SIMD")
-        mmv_in = 1*M
-        mmv_out = k_h*k_w*M
+        M = self.get_nodeattr("M")
+        if (self.get_nodeattr("parallel_window")):
+            mmv_in = M*1
+            mmv_out = M*k_h*k_w
+            assert ifm_ch==simd, "Constraint violated: SIMD must be equal to C"
+        else:
+            mmv_in = 1
+            mmv_out = 1
+            assert ifm_ch%simd==0, "Constraint violated: SIMD must divide C"
 
-        assert simd==ifm_ch, "Constraint violated: SIMD = C"
-        assert mmv_in==1*M, "Constraint violated: MMV_IN = 1" # *M
-        assert mmv_out==k_h*k_w*M, "Constraint violated: mmv_out = K" # *M
+        # todo: check allowed hyperparams
+        # ToDo: move/duplicate these checks in corresponding convert_to_hls transformation
+
+        # choose implementation style
+        if (mmv_out > 1 or (k_h==1 and k_w==1)):
+            impl_style = "parallel"
+        else:
+            impl_style = "default"
 
         # how many "unused" registers are allowed between buffer positions that will be accessed in parallel
         # example:
@@ -579,218 +570,245 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         f_debug.write("\n"+"sequential pixel indices (shape %s" % str(idx_px.shape))
         f_debug.write("\n"+str(idx_px))
 
-        output_elem, output_cycles = idx_px.shape
+        k, cycles = idx_px.shape
+
+        output_elements = mmv_out
+        output_cycles = int(cycles/(mmv_out/k))
+
         # ToDo: what happens when output_cycles=OFMdim % M != 0
         # ...try to support IFMdim % M != 0 first, so we can work with the usual k=3 where OFMdim = IFMdim - -2
         # the additional garbage input elements that are read in the last cycle are not read by any window anyway
         idx_px = idx_px.transpose()
-        idx_px = idx_px.reshape((int(output_cycles/M), int(output_elem*M)))
+        idx_px = idx_px.reshape(output_cycles, output_elements)
         idx_px = idx_px.transpose()
 
+        # result: first dim is the number of parallel output elements; second dim
+        # is the index of the input element (a pixel when SIMD=C) that each output
+        # element emits per cycle
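+        # worked example (illustrative): for a 3x3 kernel emitting k=9 window
+        # pixels over 16 cycles, M=1 (mmv_out=9) leaves the (9, 16) shape
+        # unchanged, while M=2 (mmv_out=18) halves the cycle count to (18, 8)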
         f_debug.write("\n"+"sequential pixel indices, MMV_out grouping (shape %s" % str(idx_px.shape))
         f_debug.write("\n"+str(idx_px))
+        #f_debug.close()
 
         buffer = []
         buffer_max_size = 0
         # buffer schedule (write from input, read to output)
         schedule_write = []
         schedule_read = []
+        schedule_shift = []
+
 
         schedule = []
         schedule_prev = ''
 
         next_in_px = 0
+        oldest_px = 0
+        buffer_space_freed = False
 
         idx_px_relative = idx_px.copy()
+        idx_px_addr = idx_px.copy()
+        idx_px_addr_incr = idx_px.copy()
+        idx_px_addr_rel = idx_px.copy()
 
-        # compute schedule and buffer read pattern
+        # compute schedule and buffer read pattern (output driven)
         output_elem, output_cycles = idx_px_relative.shape
-        for x in range(output_cycles):
-            # load missing inputs into buffer
-            for y in range(output_elem):
-                while int(idx_px_relative[y,x]) not in buffer:
-                    # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later)
+
+        if (impl_style == "parallel"):
+            for x in range(output_cycles):
+                # load missing inputs into buffer
+                for y in range(output_elem):
+                    while int(idx_px_relative[y,x]) not in buffer:
+                        # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later)
+                        for m in range(M):
+                            buffer.append(next_in_px)
+                            next_in_px += 1
+                        schedule_write.append(1)
+                        schedule_read.append(0)
+                        if schedule_prev == 'w':
+                            count, cmd = schedule[-1]
+                            schedule[-1] = (count+1, cmd)
+                        else:
+                            schedule.append((1, 'w'))
+                            schedule_prev = 'w'
+                
+                # discard unused buffer elements
+                oldest_px = np.min(idx_px_relative[:,x:])
+                #check whether M elements can be shifted out, not just the single oldest one
+                # must this be "while" for MMV to work? using "while" breaks the mmv_out=1 case
+                #while all([buffer[i] < oldest_px for i in range(M)]):
+                if all([buffer[i] < oldest_px for i in range(M)]):
+                    # M buffer elements are shifted out at once
+                    for m in range(M):
+                        buffer.pop(0)
+        
+                # adjust relative buffer index of current x (according to last discarded buffer elements)
+                for y in range(output_elem):
+                    idx_px_relative[y,x] -= oldest_px
+
+                
+                # read from buffer    
+                # + simultaneously load next pixel(s) into buffer if there are any left
+                if (next_in_px > (h_padded*w_padded-1)):
+                    # read only (append above)
+                    schedule_read.append(1)
+                    schedule_write.append(0)
+                    if schedule_prev == 'r':
+                        count, cmd = schedule[-1]
+                        schedule[-1] = (count+1, cmd)
+                    else:
+                        schedule.append((1, 'r'))
+                        schedule_prev = 'r'
+                else:
+                    # load M inputs at once
                     for m in range(M):
                         buffer.append(next_in_px)
                         next_in_px += 1
+                    schedule_read.append(1)
                     schedule_write.append(1)
-                    schedule_read.append(0)
-                    if schedule_prev == 'w':
+                    if schedule_prev == 'wr':
                         count, cmd = schedule[-1]
                         schedule[-1] = (count+1, cmd)
                     else:
-                        schedule.append((1, 'w'))
-                        schedule_prev = 'w'
-            
-            # discard unused buffer elements (assumes in-order access)
-            oldest_px = min(idx_px_relative[:,x])
-            #while buffer[0] < oldest_px:
-            #check whether M elements can be shifted out, not just the single oldest one
-            while all([buffer[i] < oldest_px for i in range(M)]):
-                # M buffer elements are shifted out at once
-                for m in range(M):
-                    buffer.pop(0)
-                
-            # adjust relative buffer index
-            for y in range(output_elem):
-                idx_px_relative[y,x] -= oldest_px
-                
-            # record max needed buffer depth
-            if len(buffer) > buffer_max_size:
-                buffer_max_size = len(buffer)
-            
-            # read from buffer
-            schedule_read.append(1)
-            
-            # simultaneously load next pixel(s) into buffer if there are any left
-            if next_in_px > (h_padded*w_padded-1):
-                schedule_write.append(0)
-                if schedule_prev == 'r':
-                    count, cmd = schedule[-1]
-                    schedule[-1] = (count+1, cmd)
-                else:
-                    schedule.append((1, 'r'))
-                    schedule_prev = 'r'
-            else:
-                # load M inputs at once
-                for m in range(M):
-                    buffer.append(next_in_px)
-                    next_in_px += 1
+                        schedule.append((1, 'wr'))
+                        schedule_prev = 'wr'
+
+                # record max needed buffer depth
+                #f_debug.write("\n"+str(buffer))
+                if len(buffer) > buffer_max_size:
+                    buffer_max_size = len(buffer)
+
+            # insert dummy write operations for data at the input FM tail-end that is never read (e.g. in case of stride > 1)
+            while next_in_px <= (h_padded*w_padded-1):
+                next_in_px += 1
                 schedule_write.append(1)
-                if schedule_prev == 'wr':
+                schedule_read.append(0)
+                if schedule_prev == 'w':
                     count, cmd = schedule[-1]
                     schedule[-1] = (count+1, cmd)
                 else:
-                    schedule.append((1, 'wr'))
-                    schedule_prev = 'wr'
+                    schedule.append((1, 'w'))
+                    schedule_prev = 'w'
 
+            # find buffer access patterns
+            buffer_access_patterns = []
+            for x in range(output_cycles):
+                if idx_px_relative[:,x].tolist() not in buffer_access_patterns:
+                    buffer_access_patterns.append(idx_px_relative[:,x].tolist())
 
-        # find buffer access patterns
-        buffer_access_patterns = []
-        for x in range(output_cycles):
-            if idx_px_relative[:,x].tolist() not in buffer_access_patterns:
-                buffer_access_patterns.append(idx_px_relative[:,x].tolist())
-                
-        # from itertools import groupby
-        # schedule_write_compressed = ''.join('(' + str(k) + ',' + str(sum(1 for x in g)) + '),' for k, g in groupby(schedule_write))
-        # schedule_read_compressed = ''.join('(' + str(k) + ',' + str(sum(1 for x in g)) + '),' for k, g in groupby(schedule_read))
-
-        # analyse schedule
-        # class sched_gen:
-        #     start_counter = 0
-        #     start_val = 0
-
-        #     end_last_sequence_counter = 0
-        #     end_sequence = []
-
-        #     outer_counter = 0
-        #     outer_sequence_counter = 0
-        #     outer_sequence_val = 0
-
-        #     inner_counter = 0
-        #     inner_sequence = []
-
-        #     def __str__(self):
-        #         return "\nstart: %d x %d\n %d x\n   %d x %s + %d x %d\nend: %d x %s + %s\n" % (
-        #             self.start_counter,
-        #             self.start_val,
-        #             self.outer_counter,
-        #             self.inner_counter,
-        #             str(self.inner_sequence),
-        #             self.outer_sequence_counter,
-        #             self.outer_sequence_val,
-        #             self.end_last_sequence_counter,
-        #             str(self.inner_sequence),
-        #             self.end_sequence
-        #         )
 
-        
-        # def analyse_schedule(schedule):
-        #     generator = sched_gen()
-            
-        #     #determine start sequence
-        #     for i, v in enumerate(schedule):
-        #         if i > 0 and v != schedule[i-1]:
-        #             generator.start_counter = i
-        #             generator.start_val = schedule[i-1]
-        #             break
-
-        #     #determine inner loop/sequence
-        #     sequence_MAX = 10
-        #     schedule = schedule[generator.start_counter:] # cut off processed entries
-        #     sequence = []
-        #     repititions = 0
-        #     i = 0
-        #     while i < len(schedule):
-        #         if not sequence:
-        #             sequence.append(schedule[i])
-        #             i = i+1
-        #         else:
-        #             # is this a beginning of a repitition of the current sequence?
-        #             if i + len(sequence) < len(schedule) and all([schedule[i+offset] == sequence[offset] for offset in range(len(sequence))]):  
-        #                 repititions = repititions + 1
-        #                 i = i+len(sequence)
-        #             else:
-        #                 # did we already count repitions of the sequence?
-        #                 sequence_candidate = sequence + sequence * repititions
-        #                 sequence_candidate.append(schedule[i])
-        #                 if len(sequence_candidate) < sequence_MAX:
-        #                     sequence = sequence_candidate.copy()
-        #                     repititions = 0
-        #                     i = i+1
-        #                 else:
-        #                     schedule = schedule[i:] # cut off processed entries
-        #                     break
-        #     generator.inner_counter = repititions + 1
-        #     generator.inner_sequence = sequence
+        else:
+
+            # simulate the cyclic buffer, which is advanced on every write (as opposed to on every scheduled cycle)
+            #buffer_tail = 0
+            buffer_head = 0 #buffer_tail+1
+            # compute minimal buffer length (assuming it holds 1 complete window)
+            buffer_len = (k_h-1) * dilation_h * w + (k_w-1) * dilation_w + 1
+            buffer = [-1] * buffer_len
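+            # worked example (illustrative): for k=3x3, dilation=1x1, w=10 the
+            # minimal buffer spans (3-1)*1*10 + (3-1)*1 + 1 = 23 input elements,
+            # i.e. two full image rows plus the first three pixels of the third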
             
-        #     #determine outer sequence
-        #     for i, v in enumerate(schedule):
-        #         if i > 0 and v != schedule[i-1]:
-        #             generator.outer_sequence_counter = i
-        #             generator.outer_sequence_val = schedule[i-1]
-        #             break
-
-        #     schedule = schedule[generator.outer_sequence_counter:] # cut off processed entries
-
-        #     sequence_to_compare = generator.inner_sequence * generator.inner_counter + [generator.outer_sequence_val] * generator.outer_sequence_counter
-
-        #     generator.outer_counter = 1
-        #     i = 0
-        #     while i < len(schedule):
-        #         # is this a beginning of a repitition of the current sequence?
-        #         if i + len(sequence_to_compare) < len(schedule) and all([schedule[i+offset] == sequence_to_compare[offset] for offset in range(len(sequence_to_compare))]):
-        #             generator.outer_counter = generator.outer_counter + 1
-        #             i = i+len(sequence_to_compare)
-        #         else:
-        #             schedule = schedule[i:] # cut off processed entries
-        #             break
-
-        #     #determine end sequence
-        #     #for i, v in enumerate(schedule):
-        #     #    if i > 0 and v != schedule[i-1]:
-        #     #        generator.end_counter = i
-        #     #        generator.end_val = schedule[i-1]
-        #     #        break
-        
-        #     sequence = generator.inner_sequence
-        #     repititions = 0
-        #     i = 0
-        #     while i < len(schedule):
-        #         # is this a beginning of a repitition of the current sequence?
-        #         if i + len(sequence) < len(schedule) and all([schedule[i+offset] == sequence[offset] for offset in range(len(sequence))]):  
-        #             repititions = repititions + 1
-        #             i = i+len(sequence)
-        #         else:
-        #             schedule = schedule[i:] # cut off processed entries
-        #             break
-        #     generator.end_last_sequence_counter = repititions
-
-        #     #remainder
-        #     generator.end_sequence = schedule
-
-        #     return generator
+            # todo: remove this simulation; it is not needed and doesn't account for SIMD anyway
+            for x in range(output_cycles):
 
+                # load missing inputs into buffer
+                while int(idx_px_relative[0,x]) not in buffer:
+                    # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later)
+                    for m in range(M):
+                        #buffer.append(next_in_px)
+                        buffer[buffer_head] = next_in_px
+                        next_in_px += 1
+                    schedule_write.append(1)
+                    schedule_read.append(0)
+                    if schedule_prev == 'w':
+                        count, cmd = schedule[-1]
+                        schedule[-1] = (count+1, cmd)
+                    else:
+                        schedule.append((1, 'w'))
+                        schedule_prev = 'w'
+
+                    #try to advance/shift the buffer by one, discarding the oldest element
+                    #discard_oldest_elem = buffer[0] < np.min(idx_px_relative[0,x:])
+                    #if discard_oldest_elem:
+                    #    buffer.pop(0)
+                    #    schedule_shift.append(1)
+                    #else:
+                    #    schedule_shift.append(0)
+                    buffer_head += 1
+                    if buffer_head > buffer_len-1:
+                        buffer_head = 0
+
+                ### perform read ###
+
+                #try to advance/shift the buffer by one, discarding the oldest element
+                #discard_oldest_elem = buffer[0] < np.min(idx_px_relative[0,x:])
+                #if discard_oldest_elem:
+                #    buffer.pop(0)
+                #    schedule_shift.append(1)
+                #else:
+                #    schedule_shift.append(0)
+
+                # note current relative addr in buffer
+                idx_px_addr[0,x] = buffer.index(idx_px_relative[0,x])
+                if x > 0:
+                    idx_px_addr_incr[0,x] = idx_px_addr[0,x] - idx_px_addr[0,x-1]
+                    if idx_px_addr_incr[0,x] < 0:
+                        idx_px_addr_incr[0,x] += buffer_len
+                else:
+                    idx_px_addr_incr[0,x] = idx_px_addr[0,x]
+
+                idx_px_addr_rel[0,x] = buffer.index(idx_px_relative[0,x]) - buffer_head
+                if idx_px_addr_rel[0,x] < 0:
+                    idx_px_addr_rel[0,x] += buffer_len
+
+
+                #try to write a new input into the buffer simultaneously (during this read as opposed to before the next read) 
+                # assume in-order write into the buffer (oldest element is always at head+1)
+                discard_oldest_elem = np.min(buffer) < np.min(idx_px_relative[0,x:])
+                read_only = True
+                if not (next_in_px > (h_padded*w_padded-1)):
+                    # input data available
+                    #if (x < k_h*k_w) or discard_oldest_elem:
+                    if discard_oldest_elem:
+                        # buffer is not complete, as the first window has not been fully output
+                        # or we can discard one element from the buffer after this read, so there is space for a new one
+                        read_only = False
+
+    
+                # read from buffer    
+                # + simultaneously load next pixel(s) into buffer if there are any left
+                # if mmv_out = 1: addressable BRAM implementation style -> do not shift in while outputting K kernel elements to keep addressing consistent
+                #if (next_in_px > (h_padded*w_padded-1)) or ((x+1) % (k_h*k_w) != 0):
+                #if (next_in_px > (h_padded*w_padded-1)) or (x > 1 and (not buffer_space_freed)):
+                if read_only:
+                    # read only
+                    schedule_read.append(1)
+                    schedule_write.append(0)
+                    if schedule_prev == 'r':
+                        count, cmd = schedule[-1]
+                        schedule[-1] = (count+1, cmd)
+                    else:
+                        schedule.append((1, 'r'))
+                        schedule_prev = 'r'
+                else:
+                    # read + write
+                    #buffer.append(next_in_px)
+                    buffer[buffer_head] = next_in_px
+                    next_in_px += 1
+                    schedule_read.append(1)
+                    schedule_write.append(1)
+                    if schedule_prev == 'wr':
+                        count, cmd = schedule[-1]
+                        schedule[-1] = (count+1, cmd)
+                    else:
+                        schedule.append((1, 'wr'))
+                        schedule_prev = 'wr'
+
+                    # advance buffer
+                    buffer_head += 1
+                    if buffer_head > buffer_len-1:
+                        buffer_head = 0
+
+                # record max needed buffer depth
+                #f_debug.write("\n"+str(buffer))
+                if len(buffer) > buffer_max_size:
+                    buffer_max_size = len(buffer)
+
+        # ToDo: maybe replace with directly-computed schedule (similar to addr. buffer impl. style)
         def compact_schedule(schedule):
 
             # leave first sequence (pre-load) as is
@@ -852,6 +870,16 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
                 end_sequence = schedule[i]
                 i += 1
 
+            if i < len(schedule):
+                end_sequence = end_sequence + schedule[i]
+                i += 1
+
+            assert len(start_sequence) == 1*2, "ERROR: invalid start sequence"
+            assert len(loop_sequence_1) == 2*2, "ERROR: invalid loop 1 sequence"
+            if loop_sequence_2:
+                assert len(loop_sequence_2) <= 2*2, "ERROR: invalid loop 2 sequence"
+            if end_sequence:
+                assert len(end_sequence) <= 2*2, "ERROR: invalid end sequence"
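+            # note (illustrative): each schedule entry is a (count, cmd) pair, so
+            # the flattened sequences here hold 2 values per entry; the asserts
+            # bound the compacted schedule to one pre-load entry, a two-entry main
+            # loop, and at most two entries for each tail sequence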
             assert i == len(schedule), "ERROR: schedule could not be compacted %d / %d" %(i, len(schedule))
 
             return (
@@ -866,9 +894,12 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         f_debug.write("\n"+"max buffer size observed: %d" %(buffer_max_size))
         f_debug.write("\n"+"output vector elements: relative buffer indices")
         f_debug.write("\n"+str(idx_px_relative))
-        f_debug.write("\n"+"found %d buffer access patterns:" % len(buffer_access_patterns))
-        f_debug.write("\n"+str(buffer_access_patterns))
-        f_debug.write("\n"+"required parallel-access registers for mmv_out=k: %d" % len(sum(buffer_access_patterns,[])))
+        f_debug.write("\n"+"output vector elements: absolute buffer address")
+        f_debug.write("\n"+str(idx_px_addr))
+        f_debug.write("\n"+"output vector elements: absolute buffer address increment from last")
+        f_debug.write("\n"+str(idx_px_addr_incr))
+        f_debug.write("\n"+"output vector elements: relative buffer address (from head)")
+        f_debug.write("\n"+str(idx_px_addr_rel))
         f_debug.write("\n"+"buffer write schedule (%d cycles)" % len(schedule_write))
         f_debug.write("\n"+str(schedule_write))
         f_debug.write("\n"+"writing buffer in %d cycles" % schedule_write.count(1))
@@ -879,19 +910,112 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         f_debug.write("\n"+"buffer read schedule (%d cycles)" % len(schedule_read))
         f_debug.write("\n"+str(schedule_read))
         f_debug.write("\n"+"reading buffer in %d cycles" % schedule_read.count(1))
+
+        #f_debug.write("\n"+"buffer shift schedule (%d cycles)" % len(schedule_shift))
+        #f_debug.write("\n"+str(schedule_shift))
+        #f_debug.write("\n"+"shifting buffer in %d cycles" % schedule_shift.count(1))
         #f_debug.write("\n"+"buffer read schedule COMPRESSED")
         #f_debug.write("\n"+str(schedule_read_compressed))
         #f_debug.write("\n"+"buffer read schedule ANALYZED")
         #f_debug.write("\n"+str(analyse_schedule(schedule_read)))
-        f_debug.write("\n"+"buffer rw schedule NEW")
-        f_debug.write("\n"+str(schedule))
-        f_debug.write("\n"+"buffer rw schedule NEW compacted")
-        f_debug.write("\n"+"\nstart_sequence: %s\nloop_counter: %s\nloop_sequence_1_counter: %s\nloop_sequence_1: %s\nloop_sequence_2: %s\nend_sequence: %s\n" % compact_schedule(schedule))
 
-        assert len(schedule_write) == len(schedule_read), "ERROR: Schedules have different lenghts"
-        cycles_total = len(schedule_write)
+        addr_incr_end_window_elem = 0
+        addr_incr_end_window_row = 0
+        addr_incr_end_window = 0
+        addr_incr_end_row = 0
+
+        if (impl_style == "default"):
+            f_debug.write("\n"+"mmv_out = 1: computing incremental addressing scheme directly:")
+            addressing_scheme = [[0]]
+
+            # compute index/address increments for each nested loop
+            channel_factor = int(ifm_ch/simd)
+
+            #todo: rename to (min) buffer len
+            buffer_max_size = buffer_max_size * channel_factor
+
+            kernel_width = (k_w-1)*dilation_w+1 # incl. dilation
+            addr_incr_end_simd = 1
+            addr_incr_end_window_elem = (dilation_w-1) * channel_factor + 1
+            
+            remaining_line = (w - kernel_width) * channel_factor
+            skip_lines = (dilation_h-1) * w * channel_factor
+            addr_incr_end_window_row = remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer
+            
+            #addr_incr_end_window = stride_w * channel_factor + 1 # 1 = wrap around of minimally sized buffer
+            addr_incr_end_window = -buffer_max_size + stride_w * channel_factor + 1 # 1 = wrap around of minimally sized buffer
+
+            # rows that are skipped due to imperfect stride<->W combination
+            skip_columns = w%(kernel_width + (out_dim_w-1)*stride_w)
+            remaining_line = (skip_columns + kernel_width) * channel_factor # increment from oldest buffer position (top left) to end of line
+            skip_lines = (stride_h-1) * w * channel_factor
+            #addr_incr_end_row = remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer
+            addr_incr_end_row = -buffer_max_size + remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer
+  
+            
+
+            if (depthwise):
+                addr_incr_end_window_elem = dilation_w * channel_factor
+                addr_incr_end_window_row = (channel_factor 
+                                            + (w - kernel_width) * channel_factor
+                                            + (dilation_h-1) * w * channel_factor
+                                           )
+                addr_incr_end_simd = -buffer_max_size + (channel_factor + 1)
+                #addr_incr_end_simd = channel_factor + 1
+                
+                # just for testing:
+                for i_windows_per_h in range(out_dim_h): # LOOP_H
+                    for i_windows_per_w in range(out_dim_w): # LOOP_W
+                        for i_simd_per_px in range(channel_factor): # LOOP_SIMD
+                            for i_px_per_window_h in range(k_h): # LOOP_KH
+                                for i_px_per_window_w in range(k_w-1): # LOOP_KW
+                                    addressing_scheme[0].append(addr_incr_end_window_elem)
+                                if i_px_per_window_h != k_h-1: # skip on last iteration
+                                    addressing_scheme[0].append(addr_incr_end_window_row)
+                            if i_simd_per_px != channel_factor-1: # skip on last iteration
+                                addressing_scheme[0].append(addr_incr_end_simd)
+                        if i_windows_per_w != out_dim_w-1: # skip on last iteration
+                            addressing_scheme[0].append(addr_incr_end_window)
+                    if i_windows_per_h != out_dim_h-1: # skip on last iteration
+                        addressing_scheme[0].append(addr_incr_end_row)
+            else:
+                # just for testing:
+                for i_windows_per_h in range(out_dim_h): # LOOP_H
+                    for i_windows_per_w in range(out_dim_w): # LOOP_W
+                        for i_px_per_window_h in range(k_h): # LOOP_KH
+                            for i_px_per_window_w in range(k_w): # LOOP_KW
+                                for i_simd_per_px in range(channel_factor-1): # LOOP_SIMD
+                                    addressing_scheme[0].append(addr_incr_end_simd)
+                                if i_px_per_window_w != k_w-1: # skip on last iteration
+                                    addressing_scheme[0].append(addr_incr_end_window_elem)
+                            if i_px_per_window_h != k_h-1: # skip on last iteration
+                                addressing_scheme[0].append(addr_incr_end_window_row)
+                        if i_windows_per_w != out_dim_w-1: # skip on last iteration
+                            addressing_scheme[0].append(addr_incr_end_window)
+                    if i_windows_per_h != out_dim_h-1: # skip on last iteration
+                        addressing_scheme[0].append(addr_incr_end_row)
+            
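+            # worked example (illustrative): k=3x3, dilation=1, stride=1, w=5 with
+            # SIMD=C (channel_factor=1) gives a minimal buffer of 2*5+2+1 = 13 and
+            # increments of +1 between window elements, +3 at the end of a window
+            # row (w-kernel_width = 2 skipped pixels plus the buffer wrap), and
+            # backward jumps of -11 / -9 at window / image-row boundaries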
+            f_debug.write("\n"+str(np.array(addressing_scheme)))
+            if simd == ifm_ch:
+                # simd < c currently not simulated
+                if (np.array(addressing_scheme) == idx_px_addr_incr).all():
+                    f_debug.write("\n"+"computed addressing matches simulated addressing")
+                else:
+                    f_debug.write("\n"+"ERROR")
+        else:
+            f_debug.write("\n"+"found %d buffer access patterns:" % len(buffer_access_patterns))
+            f_debug.write("\n"+str(buffer_access_patterns))
+            f_debug.write("\n"+"required parallel-access registers for mmv_out=k: %d" % len(sum(buffer_access_patterns,[])))
+            f_debug.write("\n"+"buffer rw schedule NEW")
+            f_debug.write("\n"+str(schedule))
+            f_debug.write("\n"+"buffer rw schedule NEW compacted")
+            f_debug.write("\n"+"\nstart_sequence: %s\nloop_counter: %s\nloop_sequence_1_counter: %s\nloop_sequence_1: %s\nloop_sequence_2: %s\nend_sequence: %s\n" % compact_schedule(schedule))
+            assert len(schedule_write) == len(schedule_read), "ERROR: Schedules have different lengths"
+            assert schedule_write.count(1) == self.get_number_input_values(), "ERROR: Writing buffer in fewer cycles than expected"
+            assert schedule_read.count(1) == self.get_number_output_values(), "ERROR: Reading buffer in fewer cycles than expected"
+            cycles_total = len(schedule_write)
+   
         
-        assert schedule_read.count(1) == self.get_number_output_values(), "ERROR: Reading buffer in fewer cycles than expected"
 
         code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
         #save top module name so we can refer to it even after this node has been renamed (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
@@ -900,216 +1024,297 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         code_gen_dict["$SIMD$"] = [str(simd)]
         code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
         code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]
-        code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)]
-        code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)]
         
-        # determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers)
-        # ToDo: this part doesn't fully account for M (2D buffer) yet
-        assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static"
-        buf_static_access_pattern = buffer_access_patterns[0]
-        reg_fifos = []
-        reg_fifos_depth = []
-        bram_fifos = []
-        bram_fifos_depth = []
-        current = []
-        for i in range(len(buf_static_access_pattern)):
-            access_idx = buf_static_access_pattern[i]
-            if len(current) == 0:
-                current.append(access_idx)
+
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "auto":
+            code_gen_dict["$RAM_STYLE$"]=[""]
+        else:
+            code_gen_dict["$RAM_STYLE$"]=["(* ram_style = \"{}\" *)".format(ram_style)]
+        
+        if (impl_style == "default"):
+            ### MMVout = 1: addressable buffer implementation style
+            f_debug.write("\n"+"Choosing implementation style: Addressable buffer due to mmv_out=1")
+
+            # add additional buffer space in case of stride > 1
+            # this minimizes cycle count, as it allows an earlier pre-load of skipped input elements
+            buffer_actual_size = (buffer_max_size + max(0,((stride_w-1)   - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor)
+                                                  + max(0,((stride_h-1)*w - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor))
+            code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)]
+
+            assert abs(addr_incr_end_window) <= buffer_actual_size, "ERROR: W increment > buffer size, wrap logic doesn't account for this"
+            assert abs(addr_incr_end_row) <= buffer_actual_size, "ERROR: H increment > buffer size, wrap logic doesn't account for this"
+
+            kernel_width = (k_w-1)*dilation_w+1 # incl. dilation
+            kernel_height = (k_h-1)*dilation_h+1 # incl. dilation
+            skip_columns = w%(kernel_width + (out_dim_w-1)*stride_w)
+            skip_rows = h%(kernel_height + (out_dim_h-1)*stride_h)
+            code_gen_dict["$LAST_READ_ELEM$"] = [str(h*w*channel_factor-1)]
+            code_gen_dict["$LAST_WRITE_ELEM$"] = [str(((h - skip_rows - 1) * w + (w - skip_columns))*channel_factor -1)]
+
+            loop_h_iterations = out_dim_h
+            loop_w_iterations = out_dim_w
+            loop_kh_iterations = k_h
+            loop_kw_iterations = k_w
+            loop_simd_iterations = channel_factor
+
+            if (depthwise and channel_factor > 1):
+                # re-arrange existing controller loop structure for depthwise convolutions
+                loop_kh_iterations = channel_factor
+                loop_kw_iterations = k_h
+                loop_simd_iterations = k_w
+                addr_incr_end_simd_ = addr_incr_end_simd
+                addr_incr_end_simd = addr_incr_end_window_elem
+                addr_incr_end_window_elem = addr_incr_end_window_row
+                addr_incr_end_window_row = addr_incr_end_simd_
+                elem_per_window = k_h*k_w
+
+                code_gen_dict["$TAIL_INCR_GENERATION$"] = ["""
+                always @ (counter_loop_kh, counter_loop_w, counter_loop_h) begin
+                         if (counter_loop_kh != 0)
+                             tail_incr = 1;
+                         else if (counter_loop_w != 0)
+                             tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_W]-{channel_factor}+{buffer_min_size};
+                         else // do not check for counter_loop_h to increment past LAST_WRITE_ELEM during last window
+                             tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_H]-{channel_factor}+{buffer_min_size};
+                end
+                """.format(channel_factor=channel_factor, buffer_min_size=buffer_max_size)]
             else:
-                # assume non-decreasing index order in access pattern
-                # ToDo: this assumption does not hold for M>1 case (2D buffer)
-                distance = access_idx - max(current)
-                if not (distance-1 > REG_BRAM_THRESHOLD):
-                    for i in range(distance-1):
-                        # insert dummy into REG FIFO (not read as part of window)
-                        current.append(-1)
-                    # assign this access to same REG FIFO as previous one
-                    current.append(access_idx)
+                # depthwise output format is equivalent to non-depthwise if SIMD=C
+                elem_per_window = k_h*k_w*channel_factor
+
+                code_gen_dict["$TAIL_INCR_GENERATION$"] = ["""
+                always @ (counter_loop_w, counter_loop_h) begin
+                        if (counter_loop_w != 0)
+                            tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_W]-1+{buffer_min_size};
+                        else // do not check for counter_loop_h to increment past LAST_WRITE_ELEM during last window
+                            tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_H]-1+{buffer_min_size};
+                end
+                """.format(buffer_min_size=buffer_max_size)]
+
+            # support SIMD = C and k_w = 1 cases
+            # for k = [k_h, k_w] = [1, k_w], no adjustment is needed
+            # for k = [k_h, k_w] = [1, 1], do not use this impl. style (mmv_out=K=1)
+            # innermost loop is executed at least once -> adjust if needed
+            if (loop_simd_iterations == 1):
+                # skip innermost SIMD loop completely
+                if (loop_kw_iterations == 1):
+                    # skip innermost KW loop completely
+                    code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_KH"]
+                    loop_kh_iterations -= 1  # -1 because state is initial state
                 else:
-                    # assign skipped accesses to new BRAM FIFO
-                    bram_fifos.append([-1]*(distance-1))
-                    bram_fifos_depth.append(math.ceil((distance-1)/M)) # really ceil?
-                    # start with new REG FIFO
-                    reg_fifos.append(current)
-                    #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) ToDo: fix for M again
-                    reg_fifos_depth.append(len(current))
-                    current = []
+                    code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_KW"]
+                    loop_kw_iterations -= 1 # -1 because state is initial state
+            else:
+                code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_SIMD"]
+                loop_simd_iterations -= 1 # -1 because state is initial state
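+            # e.g. for a non-depthwise k=[1,3] with channel_factor=1, the SIMD loop is skipped
+            # entirely and STATE_LOOP_KW becomes the innermost (and initial) FSM state.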
+            
+            code_gen_dict["$LOOP_H_ITERATIONS$"]=[str(loop_h_iterations-1)]
+            code_gen_dict["$LOOP_W_ITERATIONS$"]=[str(loop_w_iterations-1)]
+            code_gen_dict["$LOOP_KH_ITERATIONS$"]=[str(loop_kh_iterations-1)]
+            code_gen_dict["$LOOP_KW_ITERATIONS$"]=[str(loop_kw_iterations-1)]
+            code_gen_dict["$LOOP_SIMD_ITERATIONS$"]=[str(loop_simd_iterations-1)]
+
+            incr_bitwidth = 32 #ToDo: minimize; also avoid reusing 'w', which holds the image width
+            code_gen_dict["$ADDR_INCREMENT_MAP$"]=["'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(incr_bitwidth,
+                                                int(copysign(incr_bitwidth,addr_incr_end_simd)),abs(addr_incr_end_simd),
+                                                int(copysign(incr_bitwidth,addr_incr_end_window_elem)),abs(addr_incr_end_window_elem),
+                                                int(copysign(incr_bitwidth,addr_incr_end_window_row)),abs(addr_incr_end_window_row),
+                                                int(copysign(incr_bitwidth,addr_incr_end_window)),abs(addr_incr_end_window),
+                                                int(copysign(incr_bitwidth,addr_incr_end_row)),abs(addr_incr_end_row))]
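+            # Hypothetical rendering: addr_incr_end_simd=1 and addr_incr_end_window=-6 produce
+            # entries like "32'd1" and "-32'd6"; copysign() carries the sign into the width
+            # prefix, which SystemVerilog parses as a negated sized literal.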
+
+            code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)]
+
+            with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template_mmv_1.v", "r") as f:
+                template = f.read()
+        else:
+            f_debug.write("\n"+"Choosing implementation style: Parallel Registers (+ line buffers) due to mmv_out>1")
+            ### determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers)
+            # ToDo: this part doesn't fully account for M (2D buffer) yet
+
+            code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)]
+
+            assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static"
+            buf_static_access_pattern = buffer_access_patterns[0]
+            reg_fifos = []
+            reg_fifos_depth = []
+            bram_fifos = []
+            bram_fifos_depth = []
+            current = []
+            for i in range(len(buf_static_access_pattern)):
+                access_idx = buf_static_access_pattern[i]
+                if len(current) == 0:
                     current.append(access_idx)
-        reg_fifos.append(current)
-        #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) ToDo fix for M again
-        reg_fifos_depth.append(len(current))
-
-        f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD)
-        f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos))
-        f_debug.write("\n"+str(reg_fifos))
-        f_debug.write("\n"+"%d BRAM FIFOs (line buffers):" % len(bram_fifos))
-        f_debug.write("\n"+str(bram_fifos))
-
-        code_gen_dict["$GENERATE_REG_FIFOS$"] = []
-        for i in range(len(reg_fifos)):
-            code_gen_dict["$GENERATE_REG_FIFOS$"].append(
-                """
-                wire [IN_WIDTH-1:0] reg_fifo_{id}_in;
-                wire [IN_WIDTH-1:0] reg_fifo_{id}_out;
-                wire [IN_WIDTH*{len}-1:0] reg_fifo_{id};
-                {name}_reg_buffer
-                #(
-                .WIDTH(IN_WIDTH),
-                .DEPTH({len})
-                )
-                reg_buffer_inst_{id}
-                (
-                    .CLK(CLK),
-                    .shift_enable(shift_enable),
-                    .shift_in(reg_fifo_{id}_in),
-                    .shift_out(reg_fifo_{id}_out),
-                    .data_out(reg_fifo_{id})
-                );""".format(name=self.get_verilog_top_module_name(), id=i, len=reg_fifos_depth[i]))
-
-        code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
-        for i in range(len(bram_fifos)):
-            code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
-                """
-                wire [IN_WIDTH-1:0] bram_fifo_{id}_in;
-                wire [IN_WIDTH-1:0] bram_fifo_{id}_out;
-                {name}_ram_buffer
-                #(
-                .WIDTH(IN_WIDTH),
-                .DEPTH({len})
-                )
-                ram_buffer_inst_{id}
-                (
-                    .CLK(CLK),
-                    .RST(RST),
-                    .shift_enable(shift_enable),
-                    .shift_in(bram_fifo_{id}_in),
-                    .shift_out(bram_fifo_{id}_out)
-                );""".format(name=self.get_verilog_top_module_name(), id=i, len=bram_fifos_depth[i]))
-
-        code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
-        out_idx = mmv_out-1
-        for fifo_id, reg_fifo in enumerate(reg_fifos):
-            for fifo_idx, access_idx in enumerate(reg_fifo):
-                if(access_idx != -1):
-                    #code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
-                    #    "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format(
-                    #        out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx
-                    #    )
-                    #)
-                    code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
-                        "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format(
-                            out_idx=out_idx, fifo_id=fifo_id, 
-                            access_idx=reg_fifos_depth[fifo_id]-1-int((max(reg_fifo)-access_idx)/M), 
-                            mmv_idx=(max(reg_fifo)-access_idx)%M,
-                            mmv = M
-                        )
+                else:
+                    # assume non-decreasing index order in access pattern
+                    # ToDo: this assumption does not hold for M>1 case (2D buffer)
+                    distance = access_idx - max(current)
+                    if distance-1 <= REG_BRAM_THRESHOLD:
+                        for _ in range(distance-1):
+                            # insert dummy into REG FIFO (not read as part of window)
+                            current.append(-1)
+                        # assign this access to same REG FIFO as previous one
+                        current.append(access_idx)
+                    else:
+                        # assign skipped accesses to new BRAM FIFO
+                        bram_fifos.append([-1]*(distance-1))
+                        bram_fifos_depth.append(math.ceil((distance-1)/M)) # really ceil?
+                        # start with new REG FIFO
+                        reg_fifos.append(current)
+                        #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) #ToDo: fix for M again
+                        reg_fifos_depth.append(len(current))
+                        current = []
+                        current.append(access_idx)
+            reg_fifos.append(current)
+            #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) #ToDo fix for M again
+            reg_fifos_depth.append(len(current))
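+            # Hypothetical example (M=1): access pattern [0,1,2,30,31,32] with REG_BRAM_THRESHOLD=8
+            # yields reg_fifos=[[0,1,2],[30,31,32]] plus one 27-deep BRAM FIFO covering the gap.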
+
+            f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD)
+            f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos))
+            f_debug.write("\n"+str(reg_fifos))
+            f_debug.write("\n"+"%d BRAM FIFOs (line buffers):" % len(bram_fifos))
+            f_debug.write("\n"+str(bram_fifos))
+
+            code_gen_dict["$GENERATE_REG_FIFOS$"] = []
+            for i in range(len(reg_fifos)):
+                code_gen_dict["$GENERATE_REG_FIFOS$"].append(
+                    """
+                    wire [IN_WIDTH-1:0] reg_fifo_{id}_in;
+                    wire [IN_WIDTH-1:0] reg_fifo_{id}_out;
+                    wire [IN_WIDTH*{len}-1:0] reg_fifo_{id};
+                    {name}_reg_buffer
+                    #(
+                    .WIDTH(IN_WIDTH),
+                    .DEPTH({len})
                     )
-                    # reversal: out_idx=0 -> oldest buffer element -> highest access_idx
-                    out_idx = out_idx-1
-        assert out_idx==-1, "ERROR: Not all output vector elements connected"
-
-        code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = []
-        for i in range(len(reg_fifos)):
-            if i == 0:
-                # first FIFO containing newest elements -> input comes from input reg
-                code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
-                    """assign reg_fifo_{fifo_id}_in = reg_input;""".format(fifo_id=i,))
-            else:
-                # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer)
-                input_fifo_id = i-1
+                    reg_buffer_inst_{id}
+                    (
+                        .CLK(CLK),
+                        .shift_enable(shift_enable),
+                        .shift_in(reg_fifo_{id}_in),
+                        .shift_out(reg_fifo_{id}_out),
+                        .data_out(reg_fifo_{id})
+                    );""".format(name=self.get_verilog_top_module_name(), id=i, len=reg_fifos_depth[i]))
+
+            code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
+            for i in range(len(bram_fifos)):
+                code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
+                    """
+                    wire [IN_WIDTH-1:0] bram_fifo_{id}_in;
+                    wire [IN_WIDTH-1:0] bram_fifo_{id}_out;
+                    {name}_ram_buffer
+                    #(
+                    .WIDTH(IN_WIDTH),
+                    .DEPTH({len})
+                    )
+                    ram_buffer_inst_{id}
+                    (
+                        .CLK(CLK),
+                        .RST(RST),
+                        .shift_enable(shift_enable),
+                        .shift_in(bram_fifo_{id}_in),
+                        .shift_out(bram_fifo_{id}_out)
+                    );""".format(name=self.get_verilog_top_module_name(), id=i, len=bram_fifos_depth[i]))
+
+            code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
+            out_idx = mmv_out-1
+            for fifo_id, reg_fifo in enumerate(reg_fifos):
+                for fifo_idx, access_idx in enumerate(reg_fifo):
+                    if(access_idx != -1):
+                        code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
+                            "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format(
+                                out_idx=out_idx, fifo_id=fifo_id, 
+                                access_idx=reg_fifos_depth[fifo_id]-1-int((max(reg_fifo)-access_idx)/M), 
+                                mmv_idx=(max(reg_fifo)-access_idx)%M,
+                                mmv = M
+                            )
+                        )
+                        # reversal: out_idx=0 -> oldest buffer element -> highest access_idx
+                        out_idx = out_idx-1
+            assert out_idx==-1, "ERROR: Not all output vector elements connected"
+
+            code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = []
+            for i in range(len(reg_fifos)):
+                if i == 0:
+                    # first FIFO containing newest elements -> input comes from input reg
+                    code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
+                        """assign reg_fifo_{fifo_id}_in = reg_input;""".format(fifo_id=i,))
+                else:
+                    # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer)
+                    input_fifo_id = i-1
+                    code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
+                        """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
+            for i in range(len(bram_fifos)):
+                input_fifo_id = i
                 code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
-                    """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
-        for i in range(len(bram_fifos)):
-            input_fifo_id = i
-            code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
-                """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
-
-        # Generate read schedule (when data is read from input, written to buffer)
-        # code_gen_dict["$GENERATE_READ_SCHEDULE$"] = []
-        # schedule_as_string = ""
-        # #todo: change naming to swap write/read
-        # for i in schedule_write:
-        #     if i == 1:
-        #         schedule_as_string += "1'b1,"
-        #     else:
-        #         schedule_as_string += "1'b0,"
-        # schedule_as_string = schedule_as_string[:-1] # remove trailing ','
-        # code_gen_dict["$GENERATE_READ_SCHEDULE$"].append(
-        #     "localparam [0:{len}-1] READ_SCHEDULE = {{{str}}};".format(len=cycles_total, str=schedule_as_string)
-        # )
-        # code_gen_dict["$GENERATE_READ_SCHEDULE$"].append(
-        #     "assign read_state = READ_SCHEDULE[cycle];"
-        # )
-
-        # # Generate write schedule (when data is written to output, read from buffer)
-        # code_gen_dict["$GENERATE_WRITE_SCHEDULE$"] = []
-        # schedule_as_string = ""
-        # #todo: change naming to swap write/read
-        # for i in schedule_read:
-        #     if i == 1:
-        #         schedule_as_string += "1'b1,"
-        #     else:
-        #         schedule_as_string += "1'b0,"
-        # schedule_as_string = schedule_as_string[:-1] # remove trailing ','
-        # code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append(
-        #     "localparam [0:{len}-1] WRITE_SCHEDULE = {{{str}}};".format(len=cycles_total, str=schedule_as_string)
-        # )
-        # code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append(
-        #     "assign write_state = WRITE_SCHEDULE[cycle_last];"
-        # )
-
-        def convert_tuple(seq):
-            mapping = {'w': ("1'b1", "1'b0"),
-                        'r': ("1'b0", "1'b1"),
-                        'wr':("1'b1", "1'b1"),
-                        'n': ("1'b0", "1'b0")}
-            if seq:
-                if len(seq) == 2:
-                    return (seq[0], mapping[seq[1]], 0, mapping['n'])
-                if len(seq) == 4:
-                    return (seq[0], mapping[seq[1]], seq[2], mapping[seq[3]])
-            else:
-                return (0, mapping['n'], 0, mapping['n'])
+                    """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
+
+            def convert_tuple(seq):
+                mapping = {'w': ("1'b1", "1'b0"),
+                            'r': ("1'b0", "1'b1"),
+                            'wr':("1'b1", "1'b1"),
+                            'n': ("1'b0", "1'b0")}
+                if seq:
+                    if len(seq) == 2:
+                        return (seq[0], mapping[seq[1]], 0, mapping['n'])
+                    if len(seq) == 4:
+                        return (seq[0], mapping[seq[1]], seq[2], mapping[seq[3]])
+                else:
+                    return (0, mapping['n'], 0, mapping['n'])
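+            # Hedged usage example: convert_tuple((5, 'wr')) -> (5, ("1'b1","1'b1"), 0, ("1'b0","1'b0")),
+            # i.e. a 5-cycle phase with both command bits asserted, padded by an idle second phase.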
 
-        start_sequence,loop_counter,loop_sequence_1_counter,loop_sequence_1,loop_sequence_2,end_sequence = compact_schedule(schedule)
+            start_sequence,loop_counter,loop_sequence_1_counter,loop_sequence_1,loop_sequence_2,end_sequence = compact_schedule(schedule)
 
-        start_sequence = convert_tuple(start_sequence)
-        loop_sequence_1 = convert_tuple(loop_sequence_1)
-        loop_sequence_2 = convert_tuple(loop_sequence_2)
-        end_sequence = convert_tuple(end_sequence)
+            start_sequence = convert_tuple(start_sequence)
+            loop_sequence_1 = convert_tuple(loop_sequence_1)
+            loop_sequence_2 = convert_tuple(loop_sequence_2)
+            end_sequence = convert_tuple(end_sequence)
 
-        code_gen_dict["$START_COUNTER$"]=[str(start_sequence[0])]
-        code_gen_dict["$LOOP_MAIN_COUNTER$"]=[str(loop_sequence_1_counter)]
-        code_gen_dict["$LOOP_INTER_COUNTER$"]=[str(loop_counter)]
+            code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)]
 
-        code_gen_dict["$LOOP_MAIN_1_COUNTER$"]=[str(loop_sequence_1[0])]
-        code_gen_dict["$LOOP_MAIN_2_COUNTER$"]=[str(loop_sequence_1[2])]
+            code_gen_dict["$START_COUNTER$"]=[str(start_sequence[0])]
+            code_gen_dict["$LOOP_MAIN_COUNTER$"]=[str(loop_sequence_1_counter)]
+            code_gen_dict["$LOOP_INTER_COUNTER$"]=[str(loop_counter)]
 
-        code_gen_dict["$LOOP_INTER_1_COUNTER$"]=[str(loop_sequence_2[0])]
-        code_gen_dict["$LOOP_INTER_2_COUNTER$"]=[str(loop_sequence_2[2])]
+            code_gen_dict["$LOOP_MAIN_1_COUNTER$"]=[str(loop_sequence_1[0])]
+            code_gen_dict["$LOOP_MAIN_2_COUNTER$"]=[str(loop_sequence_1[2])]
 
-        code_gen_dict["$LOOP_END_1_COUNTER$"]=[str(end_sequence[0])]
-        code_gen_dict["$LOOP_END_2_COUNTER$"]=[str(end_sequence[2])]
+            code_gen_dict["$LOOP_INTER_1_COUNTER$"]=[str(loop_sequence_2[0])]
+            code_gen_dict["$LOOP_INTER_2_COUNTER$"]=[str(loop_sequence_2[2])]
 
-        code_gen_dict["$READ_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format(
-            start_sequence[1][0],loop_sequence_1[1][0],loop_sequence_1[3][0],loop_sequence_2[1][0],loop_sequence_2[3][0],end_sequence[1][0],end_sequence[3][0])]
-        code_gen_dict["$WRITE_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format(
-            start_sequence[1][1],loop_sequence_1[1][1],loop_sequence_1[3][1],loop_sequence_2[1][1],loop_sequence_2[3][1],end_sequence[1][1],end_sequence[3][1])]
+            code_gen_dict["$LOOP_END_1_COUNTER$"]=[str(end_sequence[0])]
+            code_gen_dict["$LOOP_END_2_COUNTER$"]=[str(end_sequence[2])]
 
-        with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template.v", "r") as f:
-            template = f.read()
+            code_gen_dict["$READ_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format(
+                start_sequence[1][0],loop_sequence_1[1][0],loop_sequence_1[3][0],loop_sequence_2[1][0],loop_sequence_2[3][0],end_sequence[1][0],end_sequence[3][0])]
+            code_gen_dict["$WRITE_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format(
+                start_sequence[1][1],loop_sequence_1[1][1],loop_sequence_1[3][1],loop_sequence_2[1][1],loop_sequence_2[3][1],end_sequence[1][1],end_sequence[3][1])]
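+            # READ_CMD_MAP/WRITE_CMD_MAP pack one command bit per schedule phase (start, two
+            # main-loop phases, two inter-loop phases, two end phases) into 7-entry array literals.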
+
+            with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template.v", "r") as f:
+                template = f.read()
+        
         
+        with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template_wrapper.v", "r") as f:
+            template_wrapper = f.read()
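+        # The wrapper stays plain Verilog while the implementation is SystemVerilog, presumably
+        # so Vivado's block-design module-reference flow can parse the top-level interface.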
+
         for key in code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(code_gen_dict[key])
             template = template.replace(key, code_gen_line)
+            template_wrapper = template_wrapper.replace(key, code_gen_line)
 
-        f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_hdl_gen.v"), "w")
-        #debug:
-        #f = open(os.path.join("/workspace/finn/finn-rtllib/swg/", "swg_hdl_generated.v"), "w")
+        f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"), "w")
         f.write(template)
         f.close()
+
+        f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w")
+        f.write(template_wrapper)
+        f.close()
+
         f_debug.close()
 
         #set ipgen_path and ip_path so that HLS-Synth transformation and stich_ip transformation do not complain
@@ -1127,10 +1332,9 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
 
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         verilog_paths = [code_gen_dir]    
-        verilog_files = [self.get_nodeattr("gen_top_module") + "_hdl_gen.v"]
-        #debug:
-        #verilog_paths = ["/workspace/finn/finn-rtllib/swg/"]
-        #verilog_files = ["swg_hdl_generated.v"]
+        verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper.v",
+                         self.get_nodeattr("gen_top_module") + "_impl.sv"]
+
         # build the Verilator emu library
         sim = PyVerilator.build(
             verilog_files,
@@ -1149,22 +1353,24 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         vlnv = self.get_nodeattr("ip_vlnv")
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
 
-        cmd = ["add_files -norecurse %s" % (os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_hdl_gen.v")),
+        cmd = ["add_files -norecurse %s" % (os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")),
+            "add_files -norecurse %s" % (os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv")),
             "create_bd_cell -type module -reference %s %s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)]
 
         return cmd
 
     def code_generation_ipgen(self, model, fpgapart, clk):
-        """Generates c++ code and tcl script for ip generation."""
+        """Normally: Generates c++ code and tcl script for ip generation.
+           Here: Generates (System-)Verilog code for ip generation."""
         self.generate_hdl()
 
     def ipgen_singlenode_code(self):
-        """Builds the bash script for ip generation using the CallHLS from
+        """Normally: Builds the bash script for ip generation using the CallHLS from
         finn.util.hls."""
         pass
 
     def code_generation_cppsim(self, model):
-        """Generates c++ code for simulation (cppsim)."""
+        """Normally: Generates c++ code for simulation (cppsim)."""
         pass
 
     def compile_singlenode_code(self):
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 113ccb93b839d6a3bd67e3bf8f23e477e86822c6..cd08bb46032ecb86861f26025bb48f26e8b98230 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -48,6 +48,11 @@ from finn.util.onnx import nchw_to_nhwc
 class InferConvInpGen(Transformation):
     """Convert Im2Col layers to ConvolutionInputGenerator layers."""
 
+    def __init__(self, use_rtl_variant=False):
+        super().__init__()
+        self.use_rtl_variant = use_rtl_variant
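+        # hypothetical usage: model = model.transform(InferConvInpGen(use_rtl_variant=True))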
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -128,105 +133,128 @@ class InferConvInpGen(Transformation):
                     )
                     graph.node.insert(node_ind, padding_node)
 
-                # Ensure that only supported HLS nodes are inserted
-                is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w
-                is_square_kernel = k_h == k_w
-                is_kernel_pointwise = k_h == 1 and k_w == 1
-                is_equal_stride = stride_h == stride_w
-                is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or (
-                    k_h > 1 and k_w == 1 and ifm_dim_w == 1
-                )
-
-                if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
-                    assert is_square_image, (
-                        "%s : DownSampler currently only supports square input images."
-                        % n.name
-                    )
-                    assert is_equal_stride, (
-                        """%s : DownSampler currently only supports equal stride value
-                        along different axes."""
-                        % n.name
-                    )
-                    ConvInpGen_idim = ConvInpGen_idim_h
-                    stride = stride_h
-                    # create DownSampler node
+                if (self.use_rtl_variant):
                     ConvInpGen_node = helper.make_node(
-                        "DownSampler",
+                        "ConvolutionInputGenerator_rtl",
                         [ConvInpGen_input],
                         [i2c_output],
                         domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
-                        ImgDim=ConvInpGen_idim,
-                        NumChannels=ifm_ch,
+                        ConvKernelDim=[k_h, k_w],
+                        IFMChannels=ifm_ch,
+                        IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                        OFMDim=[ofm_dim_h, ofm_dim_w],
                         SIMD=ifm_ch,
-                        Stride=stride,
+                        M=1,
+                        parallel_window=0,
+                        Stride=[stride_h, stride_w],
+                        Dilation=[dilation_h, dilation_w],
                         inputDataType=dt.name,
-                        name="DownSampler_" + n.name,
+                        outputDataType=dt.name,
+                        depthwise=depthwise,
+                        name="ConvolutionInputGenerator_rtl" + n.name,
                     )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 else:
-                    # create equivalent ConvolutionInputGenerator node
-                    if (
-                        is_square_image and is_square_kernel
-                    ):  # square images and square kernels
-                        assert is_equal_stride, (
-                            """%s: Non-equal strides along different axes is not supported
-                            for (non-)square convolutions"""
+                    # Ensure that only supported HLS nodes are inserted
+                    is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w
+                    is_square_kernel = k_h == k_w
+                    is_kernel_pointwise = k_h == 1 and k_w == 1
+                    is_equal_stride = stride_h == stride_w
+                    is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or (
+                        k_h > 1 and k_w == 1 and ifm_dim_w == 1
+                    )
+
+                    if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
+                        assert is_square_image, (
+                            "%s : DownSampler currently only supports square input images."
                             % n.name
                         )
-                        assert dilation_h == 1 and dilation_w == 1, (
-                            """%s: Dilation value != 1 is not supported
-                            for square convolutions"""
+                        assert is_equal_stride, (
+                            """%s : DownSampler currently only supports equal stride value
+                            along different axes."""
                             % n.name
                         )
+                        ConvInpGen_idim = ConvInpGen_idim_h
+                        stride = stride_h
+                        # create DownSampler node
                         ConvInpGen_node = helper.make_node(
-                            "ConvolutionInputGenerator",
+                            "DownSampler",
                             [ConvInpGen_input],
                             [i2c_output],
                             domain="finn.custom_op.fpgadataflow",
                             backend="fpgadataflow",
-                            ConvKernelDim=[k_h, k_w],
-                            IFMChannels=ifm_ch,
-                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
-                            OFMDim=[ofm_dim_h, ofm_dim_w],
+                            ImgDim=ConvInpGen_idim,
+                            NumChannels=ifm_ch,
                             SIMD=ifm_ch,
-                            Stride=[stride_h, stride_w],
-                            Dilation=[dilation_h, dilation_w],
+                            Stride=stride,
                             inputDataType=dt.name,
-                            outputDataType=dt.name,
-                            depthwise=depthwise,
-                            name="ConvolutionInputGenerator_" + n.name,
-                        )
-                    else:  # non-square images and/or kernels
-                        assert is_1d_convolution, (
-                            "%s: ConvolutionInputGenerator1D works only for 1D convs"
-                            % n.name
+                            name="DownSampler_" + n.name,
                         )
-                        if dilation_h > 1 or dilation_w > 1:
-                            assert stride_h == 1 and stride_w == 1, (
-                                """%s: Stride value of greater than 1 is not supported for convolutions
-                                with dilation value greater than 1"""
+                        graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                    else:
+                        # create equivalent ConvolutionInputGenerator node
+                        if (
+                            is_square_image and is_square_kernel
+                        ):  # square images and square kernels
+                            assert is_equal_stride, (
+                                """%s: Non-equal strides along different axes is not supported
+                                for (non-)square convolutions"""
                                 % n.name
                             )
-                        ConvInpGen_node = helper.make_node(
-                            "ConvolutionInputGenerator1D",
-                            [ConvInpGen_input],
-                            [i2c_output],
-                            domain="finn.custom_op.fpgadataflow",
-                            backend="fpgadataflow",
-                            ConvKernelDim=[k_h, k_w],
-                            IFMChannels=ifm_ch,
-                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
-                            OFMDim=[ofm_dim_h, ofm_dim_w],
-                            SIMD=ifm_ch,
-                            Stride=[stride_h, stride_w],
-                            Dilation=[dilation_h, dilation_w],
-                            inputDataType=dt.name,
-                            outputDataType=dt.name,
-                            depthwise=depthwise,
-                            name="ConvolutionInputGenerator1D_" + n.name,
-                        )
-                    graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                            assert dilation_h == 1 and dilation_w == 1, (
+                                """%s: Dilation value != 1 is not supported
+                                for square convolutions"""
+                                % n.name
+                            )
+                            ConvInpGen_node = helper.make_node(
+                                "ConvolutionInputGenerator",
+                                [ConvInpGen_input],
+                                [i2c_output],
+                                domain="finn.custom_op.fpgadataflow",
+                                backend="fpgadataflow",
+                                ConvKernelDim=[k_h, k_w],
+                                IFMChannels=ifm_ch,
+                                IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                                OFMDim=[ofm_dim_h, ofm_dim_w],
+                                SIMD=ifm_ch,
+                                Stride=[stride_h, stride_w],
+                                Dilation=[dilation_h, dilation_w],
+                                inputDataType=dt.name,
+                                outputDataType=dt.name,
+                                depthwise=depthwise,
+                                name="ConvolutionInputGenerator_" + n.name,
+                            )
+                        else:  # non-square images and/or kernels
+                            assert is_1d_convolution, (
+                                "%s: ConvolutionInputGenerator1D works only for 1D convs"
+                                % n.name
+                            )
+                            if dilation_h > 1 or dilation_w > 1:
+                                assert stride_h == 1 and stride_w == 1, (
+                                    """%s: Stride value of greater than 1 is not supported for convolutions
+                                    with dilation value greater than 1"""
+                                    % n.name
+                                )
+                            ConvInpGen_node = helper.make_node(
+                                "ConvolutionInputGenerator1D",
+                                [ConvInpGen_input],
+                                [i2c_output],
+                                domain="finn.custom_op.fpgadataflow",
+                                backend="fpgadataflow",
+                                ConvKernelDim=[k_h, k_w],
+                                IFMChannels=ifm_ch,
+                                IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                                OFMDim=[ofm_dim_h, ofm_dim_w],
+                                SIMD=ifm_ch,
+                                Stride=[stride_h, stride_w],
+                                Dilation=[dilation_h, dilation_w],
+                                inputDataType=dt.name,
+                                outputDataType=dt.name,
+                                depthwise=depthwise,
+                                name="ConvolutionInputGenerator1D_" + n.name,
+                            )
+                        graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
index ef1fda8e31eab93c8a79167a51d152400f317bc0..870f5593bfbe778802d7a3f4f057de5ff5d6e740 100755
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2022, Xilinx
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,18 +37,14 @@ from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.custom_op.registry import getCustomOp
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 
-
 def make_single_im2col_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
+    k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt
 ):
     k_h, k_w = k
     ifm_dim_h, ifm_dim_w = ifm_dim
@@ -90,7 +86,7 @@ def make_single_im2col_modelwrapper(
 
 
 def make_single_slidingwindow_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, m, stride, dilation, idt, dw=0
+    k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0
 ):
     k_h, k_w = k
     ifm_dim_h, ifm_dim_w = ifm_dim
@@ -118,6 +114,7 @@ def make_single_slidingwindow_modelwrapper(
         OFMDim=[ofm_dim_h, ofm_dim_w],
         SIMD=simd,
         M=m,
+        parallel_window=parallel_window,
         Stride=[stride_h, stride_w],
         Dilation=[dilation_h, dilation_w],
         inputDataType=idt.name,
@@ -150,31 +147,33 @@ def prepare_inputs(input_tensor):
 
 
 # input datatype
-@pytest.mark.parametrize("idt", [DataType["INT4"]])
+@pytest.mark.parametrize("idt", [DataType["UINT4"]])
 # kernel size
-@pytest.mark.parametrize("k", [[3, 1]])
+@pytest.mark.parametrize("k", [[3,3]])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [[10, 1]])
+@pytest.mark.parametrize("ifm_dim", [[24,24]])
 # input channels
-@pytest.mark.parametrize("ifm_ch", [2])
+@pytest.mark.parametrize("ifm_ch", [8])
 # Stride
-@pytest.mark.parametrize("stride", [[1, 1]])
+@pytest.mark.parametrize("stride", [[3,3],[6,6]])
 # Dilation
-@pytest.mark.parametrize("dilation", [[1, 1]])
-# execution mode
-@pytest.mark.parametrize("exec_mode", ["rtlsim"])
+@pytest.mark.parametrize("dilation", [[1,1],[2,2]])
+# depthwise
+@pytest.mark.parametrize("dw", [0,1])
+
 # input channel parallelism ("SIMD")
-@pytest.mark.parametrize("simd", [2])
+@pytest.mark.parametrize("simd", [1,2,8])
 # in/out MMV ("M")
-@pytest.mark.parametrize("m", [1, 2, 4])
-# depthwise
-@pytest.mark.parametrize("dw", [0])
+@pytest.mark.parametrize("m", [1])
+# parallel_window enable (MMV_out = M*K)
+@pytest.mark.parametrize("parallel_window", [0])
+
 # Flip dimensions
-@pytest.mark.parametrize("flip", [False])
+@pytest.mark.parametrize("flip", [False,True])
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_slidingwindow_rtl(
-    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, m, dw, flip
+    idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip
 ):
     if flip:
         k = k[::-1]
@@ -187,11 +186,6 @@ def test_fpgadataflow_slidingwindow_rtl(
     stride_h, stride_w = stride
     dilation_h, dilation_w = dilation
 
-    #if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1):
-    #    pytest.skip(
-    #        """Dilation value greater than 1 and stride greater than 1
-    #        currently not supported for 1D convolutions"""
-    #    )
     if simd > ifm_ch:
         pytest.skip("SIMD cannot be larger than number of input channels")
 
@@ -207,21 +201,17 @@ def test_fpgadataflow_slidingwindow_rtl(
         ofm_dim=ofm_dim,
         simd=simd,
         m=m,
+        parallel_window=parallel_window,
         stride=stride,
         dilation=dilation,
         idt=idt,
         dw=dw,
     )
 
-    if exec_mode == "cppsim":
-        raise Exception("cppsim not supported in test_fpgadataflow_slidingwindow_rtl")
-    elif exec_mode == "rtlsim":
-        model = model.transform(SetExecMode("rtlsim"))
-        model = model.transform(GiveUniqueNodeNames())
-        model = model.transform(PrepareIP("xc7z020clg400-1", 4))
-        model = model.transform(PrepareRTLSim())
-    else:
-        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow_rtl")
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(PrepareRTLSim())
 
     # prepare input data
     input_dict = prepare_inputs(x)
@@ -232,7 +222,6 @@ def test_fpgadataflow_slidingwindow_rtl(
         ifm_ch=ifm_ch,
         ifm_dim=ifm_dim,
         ofm_dim=ofm_dim,
-        simd=simd,
         stride=stride,
         dilation=dilation,
         idt=idt,
@@ -245,6 +234,11 @@ def test_fpgadataflow_slidingwindow_rtl(
     print("--------produced:")
     print(y_produced)
 
+    node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
+    inst = getCustomOp(node)
+    cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+    print("RTLSIM cycles: %d"%cycles_rtlsim)
+
     if dw == 0:
         assert (y_produced == y_expected).all()
     else:
@@ -255,12 +249,7 @@ def test_fpgadataflow_slidingwindow_rtl(
         y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
         assert (y_produced == y_expected).all()
 
-
-    # if exec_mode == "rtlsim":
-    #     node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
-    #     inst = getCustomOp(node)
-    #     cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
-    #     exp_cycles_dict = model.analysis(exp_cycles_per_layer)
-    #     exp_cycles = exp_cycles_dict[node.name]
-    #     assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
-    #     assert exp_cycles != 0
+#     exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+#     exp_cycles = exp_cycles_dict[node.name]
+#     assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+#     assert exp_cycles != 0