From f46e2d0a79a6a19cd09e4ee3d0503d81a42cc87e Mon Sep 17 00:00:00 2001
From: Felix Jentzsch <felix.jentzsch@upb.de>
Date: Thu, 25 Aug 2022 00:16:02 +0200
Subject: [PATCH] Restructure, basic resource estimation

---
 finn-rtllib/swg/swg_template_parallel.sv      |   74 +-
 .../convolutioninputgenerator_rtl.py          | 1474 +++++++++--------
 ...est_fpgadataflow_convinputgenerator_rtl.py |   81 +-
 3 files changed, 915 insertions(+), 714 deletions(-)

diff --git a/finn-rtllib/swg/swg_template_parallel.sv b/finn-rtllib/swg/swg_template_parallel.sv
index 7c1e04222..19638d8a1 100755
--- a/finn-rtllib/swg/swg_template_parallel.sv
+++ b/finn-rtllib/swg/swg_template_parallel.sv
@@ -3,13 +3,15 @@
 module $TOP_MODULE_NAME$_controller
 (
     CLK,
-    cycle,
+    RST,
+    advance,
     cmd_read,
     cmd_write
 );
 
 input CLK;
-input [31:0] cycle; //todo: minimize width or switch to single bit flag
+input RST;
+input advance;
 output cmd_read;
 output cmd_write;
 
@@ -39,10 +41,6 @@ integer counter_loop_inter;
 assign cmd_read = READ_CMD_MAP[state_next]; //read command indicates read in *upcoming* cycle, due to how schedule is constructed
 assign cmd_write = WRITE_CMD_MAP[state];
 
-reg cycle_last;
-wire cycle_advance;
-assign cycle_advance = !(cycle == cycle_last);
-
 //combinational next state logic
 always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin
     state_next = state; //default
@@ -67,7 +65,7 @@ always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin
                         if (LOOP_END_1_COUNTER != 0)
                             state_next = STATE_END_1;
                         else
-                            state_next = STATE_START;
+                            state_next = STATE_LOOP_MAIN_2; //wait in current state until reset
                     end
                 end
             end
@@ -91,49 +89,46 @@ always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin
                 if (LOOP_END_2_COUNTER != 0)
                     state_next = STATE_END_2;
                 else
-                    state_next = STATE_START;
+                    state_next = STATE_END_1; //wait in current state until reset
             end
         end
 
         STATE_END_2:
             if (counter_current == LOOP_END_2_COUNTER-1)
-                state_next = STATE_START;
+                state_next = STATE_END_2; //wait in current state until reset
     endcase
 end
 
 //sequential logic
 always @ (posedge CLK) begin
-    if (cycle == 0) begin
-        counter_current <= 0;
+    if (RST) begin
+        counter_current <= -1;
         counter_loop_main <= 0;
         counter_loop_inter <= 0;
-        cycle_last <= 0;
         state <= STATE_START;
     end else begin
-        cycle_last <= cycle;
-        state <= state_next;
-
-        if (cycle_advance) begin
+        if (advance) begin
             counter_current <= counter_current+1;
-        end
+            state <= state_next;
 
-        if (state != state_next) begin
-            counter_current <= 0;
+            if (state != state_next) begin
+                counter_current <= 0;
 
-            //count up main loop upon re-entering this loop (not on first enter from start)
-            if ((state_next == STATE_LOOP_MAIN_1) && (state != STATE_START)) begin
-                if (counter_loop_main == LOOP_MAIN_COUNTER-1) begin
-                    counter_loop_main <= 0;
-                end else begin
-                    counter_loop_main <= counter_loop_main+1;
+                //count up main loop upon re-entering this loop (not on first enter from start)
+                if ((state_next == STATE_LOOP_MAIN_1) && (state != STATE_START)) begin
+                    if (counter_loop_main == LOOP_MAIN_COUNTER-1) begin
+                        counter_loop_main <= 0;
+                    end else begin
+                        counter_loop_main <= counter_loop_main+1;
+                    end
                 end
-            end
 
-            if (state_next == STATE_LOOP_INTER_1) begin
-                if (counter_loop_inter == LOOP_INTER_COUNTER) begin //no -1 because this counter marks the currently active iteration, not finished iterations
-                    counter_loop_inter <= 0;
-                end else begin
-                    counter_loop_inter <= counter_loop_inter+1;
+                if (state_next == STATE_LOOP_INTER_1) begin
+                    if (counter_loop_inter == LOOP_INTER_COUNTER) begin //no -1 because this counter marks the currently active iteration, not finished iterations
+                        counter_loop_inter <= 0;
+                    end else begin
+                        counter_loop_inter <= counter_loop_inter+1;
+                    end
                 end
             end
         end
@@ -169,8 +164,8 @@ output [WIDTH*DEPTH-1:0] data_out;
 // File: shift_registers_1.v
 //
 //module shift_registers_1 (clk, clken, SI, SO);
-//parameter WIDTH = 32; 
-//input clk, clken, SI; 
+//parameter WIDTH = 32;
+//input clk, clken, SI;
 //output SO;
 //reg [WIDTH-1:0] shreg;
 //
@@ -181,7 +176,7 @@ output [WIDTH*DEPTH-1:0] data_out;
 //    begin
 //    for (i = 0; i < WIDTH-1; i = i+1)
 //        shreg[i+1] <= shreg[i];
-//      shreg[0] <= SI; 
+//      shreg[0] <= SI;
 //    end
 //end
 //assign SO = shreg[WIDTH-1];
@@ -227,7 +222,7 @@ integer addr_w, addr_r; //todo: minimize width (as reg), make r addr depend on w
 
 $RAM_STYLE$ reg [WIDTH-1:0] ram [DEPTH-1:0];
 
-always @(posedge CLK) begin 
+always @(posedge CLK) begin
     if (RST == 1'b0) begin
         addr_w <= 0;
         addr_r <= 1;
@@ -349,11 +344,15 @@ wire read_cmd;
 wire write_cmd;
 reg write_done; //keep track if W of current cycle was already completed, but we still wait on a R in the same cycle
 
+wire controller_reset;
+wire controller_advance;
+
 $TOP_MODULE_NAME$_controller
 controller_inst
 (
     .CLK(ap_clk),
-    .cycle(cycle),
+    .RST(controller_reset),
+    .advance(controller_advance),
     .cmd_read(read_cmd),
     .cmd_write(write_cmd)
 );
@@ -379,6 +378,9 @@ assign advance =      read_ok        ||   (!read_cmd && write_ok)    || (!read_c
 //todo: if mmv_out < k: might not shift and/or write for multiple read_cmd cycles
 assign window_buffer_shift_enable = advance;
 
+assign controller_reset = !ap_rst_n || ((cycle == CYCLES_TOTAL-1) && advance);
+assign controller_advance = advance;
+
 //assign I/O ports
 assign window_buffer_in = in0_V_V_TDATA;
 assign out_V_V_TDATA = window_buffer_out;
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
index 936954258..f1e0f53a7 100755
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -27,21 +27,17 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
-from math import copysign
 import numpy as np
 import os
-
+from math import copysign
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.general import im2col
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
+
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
-from finn.util.basic import (
-    get_rtlsim_trace_depth,
-    make_build_dir,
-)
-
 try:
     from pyverilator import PyVerilator
 except ModuleNotFoundError:
@@ -57,9 +53,124 @@ except ModuleNotFoundError:
 # * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD)
 # * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD)
 
+
+# helper functions for parallel mode buffer scheduling (to be superseded by improved implementation):
+
+
+def schedule_append(schedule, op):
+    if len(schedule) > 0 and schedule[-1][1] == op:
+        count, op_ = schedule[-1]
+        schedule[-1] = (count + 1, op_)
+    else:
+        schedule.append((1, op))
+    return schedule
+
+
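For reference, schedule_append run-length encodes the cycle-by-cycle buffer operations: consecutive identical ops collapse into (count, op) tuples. A minimal usage sketch using the helper defined above:

    sched = []
    for op in ["w", "w", "w", "wr", "r", "r"]:
        sched = schedule_append(sched, op)
    assert sched == [(3, "w"), (1, "wr"), (2, "r")]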
+def schedule_map_cmds(seq):
+    mapping = {
+        "w": ("1'b1", "1'b0"),
+        "r": ("1'b0", "1'b1"),
+        "wr": ("1'b1", "1'b1"),
+        "n": ("1'b0", "1'b0"),
+    }
+    if seq:
+        if len(seq) == 2:
+            return (seq[0], mapping[seq[1]], 0, mapping["n"])
+        if len(seq) == 4:
+            return (seq[0], mapping[seq[1]], seq[2], mapping[seq[3]])
+    else:
+        return (0, mapping["n"], 0, mapping["n"])
+
+
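schedule_map_cmds then translates such run-length-encoded pairs into the write/read enable bit pairs expected by the RTL template; a compacted sequence of one or two (count, op) pairs yields (count_0, cmds_0, count_1, cmds_1). A small sketch using the helper defined above:

    # one "write+read" cycle followed by one "write-only" cycle
    assert schedule_map_cmds((1, "wr", 1, "w")) == (1, ("1'b1", "1'b1"), 1, ("1'b1", "1'b0"))
    # a lone pre-load sequence of 4 writes; the second slot is padded with "no-op"
    assert schedule_map_cmds((4, "w")) == (4, ("1'b1", "1'b0"), 0, ("1'b0", "1'b0"))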
+def schedule_map_controller(schedule):
+    # Experimental implementation that maps the fixed controller loop structure onto the
+    # R/W schedule by analyzing the access pattern given by im2col, rather than computing it directly.
+    # TODO: Probably replace this with a directly-computed schedule, similar to the default implementation style.
+
+    # leave first sequence (pre-load) as is
+    start_sequence = schedule[0]
+    loop_sequence_1_counter = 1
+    loop_sequence_1 = schedule[1]
+    loop_counter = 0
+    loop_sequence_2 = None
+    end_sequence = None
+
+    i = 2
+    if i < len(schedule):
+        loop_sequence_1 += schedule[i]
+        i += 1
+    while i + 1 < len(schedule):
+        candidate = schedule[i] + schedule[i + 1]
+        if candidate == loop_sequence_1:
+            loop_sequence_1_counter += 1
+            i += 2
+        else:
+            break
+
+    if i < len(schedule):
+        loop_sequence_2 = schedule[i]
+        i += 1
+    if i + 1 < len(schedule):
+        candidate = schedule[i] + schedule[i + 1]
+        if candidate != loop_sequence_1:
+            loop_sequence_2 += schedule[i]
+        i -= 1
+        loop_sequence_total_len = (
+            int(len(loop_sequence_2) / 2)
+        ) + loop_sequence_1_counter * (int(len(loop_sequence_1) / 2))
+        loop_sequence_total = (
+            loop_sequence_2 + loop_sequence_1_counter * loop_sequence_1
+        )
+        while i + loop_sequence_total_len < len(schedule):
+            candidate = schedule[i]
+            for x in range(i + 1, i + loop_sequence_total_len):
+                candidate += schedule[x]
+
+            if candidate == loop_sequence_total:
+                loop_counter += 1
+                i += loop_sequence_total_len
+            else:
+                break
+    else:
+        if i < len(schedule):
+            end_sequence = loop_sequence_2 + schedule[i]
+            i += 1
+            loop_sequence_2 = None
+        else:
+            end_sequence = loop_sequence_2
+            loop_sequence_2 = None
+
+    if i < len(schedule):
+        end_sequence = schedule[i]
+        i += 1
+    if i < len(schedule):
+        end_sequence = end_sequence + schedule[i]
+        i += 1
+
+    assert len(start_sequence) == 1 * 2, "ERROR: invalid start sequence"
+    assert len(loop_sequence_1) == 2 * 2, "ERROR: invalid loop 1 sequence"
+    if loop_sequence_2:
+        assert len(loop_sequence_2) <= 2 * 2, "ERROR: invalid loop 2 sequence"
+    if end_sequence:
+        assert len(end_sequence) <= 2 * 2, "ERROR: invalid end sequence"
+    assert i == len(schedule), "ERROR: schedule could not be compacted %d / %d" % (
+        i,
+        len(schedule),
+    )
+
+    return (
+        start_sequence,
+        loop_counter,
+        loop_sequence_1_counter,
+        loop_sequence_1,
+        loop_sequence_2,
+        end_sequence,
+    )
+
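As a worked example of the compaction (a sketch with a hand-constructed schedule, not taken from a real layer): a 4-cycle pre-load, three iterations of "write+read then write", and a tail of one "write+read" plus two reads fold into a start sequence, one inner loop with its repeat count, and an end sequence:

    sched = [(4, "w"), (1, "wr"), (1, "w"), (1, "wr"), (1, "w"),
             (1, "wr"), (1, "w"), (1, "wr"), (2, "r")]
    (start, loop_counter, loop1_counter, loop1, loop2, end) = schedule_map_controller(sched)
    assert start == (4, "w")                            # pre-load
    assert (loop1_counter, loop1) == (3, (1, "wr", 1, "w"))
    assert loop_counter == 0 and loop2 is None
    assert end == (1, "wr", 2, "r")                     # tail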
+
 class ConvolutionInputGenerator_rtl(HLSCustomOp):
     """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator
-    (sliding window) function variants! ... """
+    (sliding window) function variants! ..."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
@@ -108,12 +219,12 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         M = self.get_nodeattr("M")
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
         wf = int(ifm_ch / simd)
-        #folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
-        #round up to support ifm_dim % M != 0
+        # folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
+        # round up to support ifm_dim % M != 0
         if ifm_dim_w == 1:
-            folded_ishape = (1, math.ceil(ifm_dim_h/M), ifm_dim_w, wf, int(simd*M))
+            folded_ishape = (1, math.ceil(ifm_dim_h / M), ifm_dim_w, wf, int(simd * M))
         else:
-            folded_ishape = (1, ifm_dim_h, math.ceil(ifm_dim_w/M), wf, int(simd*M))
+            folded_ishape = (1, ifm_dim_h, math.ceil(ifm_dim_w / M), wf, int(simd * M))
         return folded_ishape
 
     def get_normal_output_shape(self):
@@ -140,13 +251,25 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
         ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        if (self.get_nodeattr("parallel_window")):
+        if self.get_nodeattr("parallel_window"):
             wf = int((ifm_ch) // simd)
-            #folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+            # folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
             if ofm_dim_w == 1:
-                folded_oshape = (1, int(ofm_dim_h/M), ofm_dim_w, wf, k_h * k_w * int(simd*M))
+                folded_oshape = (
+                    1,
+                    int(ofm_dim_h / M),
+                    ofm_dim_w,
+                    wf,
+                    k_h * k_w * int(simd * M),
+                )
             else:
-                folded_oshape = (1, ofm_dim_h, int(ofm_dim_w/M), wf, k_h * k_w * int(simd*M))
+                folded_oshape = (
+                    1,
+                    ofm_dim_h,
+                    int(ofm_dim_w / M),
+                    wf,
+                    k_h * k_w * int(simd * M),
+                )
         else:
             wf = int((k_h * k_w * ifm_ch) // simd)
             folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
@@ -186,7 +309,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         return in_width
 
     def get_outstream_width(self):
-        if (self.get_nodeattr("parallel_window")):
+        if self.get_nodeattr("parallel_window"):
             # feed all window pixels in parallel
             k_h, k_w = self.get_nodeattr("ConvKernelDim")
             return self.get_instream_width() * k_h * k_w
@@ -205,25 +328,31 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         return num_output_elems
 
     def get_exp_cycles(self):
-        # TODO: update
         simd = self.get_nodeattr("SIMD")
+        m = self.get_nodeattr("M")
         ifm_ch = self.get_nodeattr("IFMChannels")
         k = self.get_nodeattr("ConvKernelDim")
         ifm_dim = self.get_nodeattr("IFMDim")
         ofm_dim = self.get_nodeattr("OFMDim")
         stride = self.get_nodeattr("Stride")
         dilation = self.get_nodeattr("Dilation")
+        depthwise = self.get_nodeattr("depthwise")
         ifm_dim_h, ifm_dim_w = ifm_dim
         ofm_dim_h, ofm_dim_w = ofm_dim
         k_h, k_w = k
         stride_h, stride_w = stride
         dilation_h, dilation_w = dilation
-        mmv = 1
 
-        if (self.get_nodeattr("parallel_window")):
-            exp_cycles = ifm_dim_w + 1
+        impl_style = self.select_impl_style()
+        if impl_style == "parallel":
+            exp_cycles = self.get_number_input_values() + 2
         else:
-            cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+            # based on 2D HLS SWG estimate
+            # FIXME: increase accuracy for newly supported parameter scenarios
+            cycles_write_block = ofm_dim_w * k_w * k_h * (ifm_ch / simd)
             cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
             max_cycles = max(cycles_write_block, cycles_read_block)
             exp_cycles = (
@@ -233,15 +362,21 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         return int(exp_cycles)
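For the parallel style the estimate is one cycle per folded input word plus two cycles of start-up/flush overhead. A back-of-the-envelope sketch, assuming get_number_input_values() returns the number of folded input words (product of the folded input shape without the innermost SIMD dimension):

    import numpy as np

    folded_ishape = (1, 32, 32, 1, 16)            # hypothetical (N, H, W, IFMCh/SIMD, SIMD*M)
    n_input_values = np.prod(folded_ishape[:-1])  # 1024 input words streamed in
    exp_cycles_parallel = n_input_values + 2      # = 1026 cycles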
 
     def bram_estimation(self):
-        # TODO: update
         simd = self.get_nodeattr("SIMD")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
-        k = np.prod(self.get_nodeattr("ConvKernelDim"))
-        stride = np.prod(self.get_nodeattr("Stride"))
         ram_style = self.get_nodeattr("ram_style")
+
+        impl_style = self.select_impl_style()
+        # call codegen preparation to populate self.buffer_depth
+        if impl_style == "default":
+            template_path, code_gen_dict = self.prepare_codegen_default()
+        elif impl_style == "parallel":
+            template_path, code_gen_dict = self.prepare_codegen_parallel()
+
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.buffer_depth
+
         if ram_style == "block" or ram_style == "auto":
-            ram_depth = ifm_dim * ifm_ch / simd
+            ram_depth = buffer_depth
             if ram_depth <= 512:
                 ram_width = 36
             elif ram_depth <= 1024:
@@ -254,57 +389,37 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
                 ram_width = 2
             else:
                 ram_width = 1
-            return int(
-                (k + stride)
-                * (
-                    math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width)
-                    * math.ceil(ifm_dim * ifm_ch / simd / ram_depth)
-                )
-            )
+
+            ram_cascade_depth = math.ceil(buffer_depth / 16384)
+            ram_cascade_width = math.ceil(buffer_width / ram_width)
+
+            return int(ram_cascade_depth * ram_cascade_width)
         else:
             return 0
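As a worked example of the new BRAM estimate (hypothetical configuration, using only the formulas above): with SIMD=16, an 8-bit input datatype and a generated buffer depth of 400, the buffer falls into the depth <= 512 branch (36-bit wide BRAM configuration), so only the width needs to be cascaded:

    import math

    buffer_width = 16 * 8                                    # simd * bitwidth = 128 bit
    buffer_depth = 400                                       # from prepare_codegen_*()
    ram_width = 36                                           # depth <= 512 branch
    ram_cascade_depth = math.ceil(buffer_depth / 16384)      # = 1
    ram_cascade_width = math.ceil(buffer_width / ram_width)  # = 4
    print(ram_cascade_depth * ram_cascade_width)             # 4 BRAMs estimated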
 
     def lut_estimation(self):
-        # TODO: update
-        # NOTE: not tested for correctness
         simd = self.get_nodeattr("SIMD")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
-        k = np.prod(self.get_nodeattr("ConvKernelDim"))
-        stride = np.prod(self.get_nodeattr("Stride"))
         ram_style = self.get_nodeattr("ram_style")
+
+        impl_style = self.select_impl_style()
+        # call codegen preparation to populate self.buffer_depth
+        if impl_style == "default":
+            template_path, code_gen_dict = self.prepare_codegen_default()
+        elif impl_style == "parallel":
+            template_path, code_gen_dict = self.prepare_codegen_parallel()
+
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.buffer_depth
+
         if ram_style == "distributed":
-            ram_luts = int(
-                (k + stride)
-                * (
-                    simd
-                    * self.get_input_datatype().bitwidth()
-                    * math.ceil(ifm_dim * ifm_ch / simd / 64)
-                )
-            )
+            ram_luts = int(buffer_width * math.ceil(buffer_depth / 32))
         else:
             ram_luts = 0
         return 300 + ram_luts
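For the distributed (LUTRAM) case the estimate charges buffer_width LUTs per 32 elements of depth, plus a fixed 300-LUT control overhead; a quick worked example with the same hypothetical SIMD=16 / 8-bit / depth-400 configuration:

    import math

    buffer_width = 16 * 8                                        # 128 bit
    buffer_depth = 400
    ram_luts = int(buffer_width * math.ceil(buffer_depth / 32))  # 128 * 13 = 1664
    print(300 + ram_luts)                                        # 1964 LUTs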
 
     def uram_estimation(self):
-        # TODO: update
-        # NOTE: not tested for correctness
-        simd = self.get_nodeattr("SIMD")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
-        k = np.prod(self.get_nodeattr("ConvKernelDim"))
-        stride = np.prod(self.get_nodeattr("Stride"))
-        ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "ultra":
-            return int(
-                (k + stride)
-                * (
-                    math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
-                    * math.ceil(ifm_dim * ifm_ch / simd / 4096)
-                )
-            )
-        else:
-            return 0
+        # TODO: implement URAM estimation
+        return 0
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
@@ -314,14 +429,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
-        # TODO ensure codegen dir exists
         if mode == "cppsim":
-            #code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-            raise Exception(
-                """cppsim not possible for RTL SWG""".format(
-                    mode
-                )
-            )
+            raise Exception("cppsim not possible for RTL SWG")
         elif mode == "rtlsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         else:
@@ -335,10 +444,10 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         inp = context[node.input[0]]
         assert str(inp.dtype) == "float32", "Input datatype is not float32"
         # disable this check to allow for IFMdim % M != 0 case (see below) where input comes from MMV-output capable node
-        #assert (
+        # assert (
         #    inp.shape == exp_ishape
-        #), """Input shape doesn't
-        #match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
+        # ), """Input shape doesn't
+        # match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
         if self.get_input_datatype() == DataType["BIPOLAR"]:
             # store bipolar activations as binary
             inp = (inp + 1) / 2
@@ -349,11 +458,17 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         # pad test input stream to work when IFMdim % M != 0
         # during normal operation, the AXI Stream should not care, in the last cycle garbage elements are read but not used
         # TODO: only works for 1D case
-        mmv_stream_padding_px = int((np.prod(folded_ishape) - np.prod(inp.shape)) / exp_ishape[-1])
-        if exp_ishape [2] == 1:
-            inp = np.pad(inp, ((0,0),(0,mmv_stream_padding_px),(0,0),(0,0)), 'constant')
+        mmv_stream_padding_px = int(
+            (np.prod(folded_ishape) - np.prod(inp.shape)) / exp_ishape[-1]
+        )
+        if exp_ishape[2] == 1:
+            inp = np.pad(
+                inp, ((0, 0), (0, mmv_stream_padding_px), (0, 0), (0, 0)), "constant"
+            )
         else:
-            inp = np.pad(inp, ((0,0),(0,0),(0,mmv_stream_padding_px),(0,0)), 'constant')
+            inp = np.pad(
+                inp, ((0, 0), (0, 0), (0, mmv_stream_padding_px), (0, 0)), "constant"
+            )
         # reshape input into folded form
         inp = inp.reshape(folded_ishape)
         # make copy before saving array
@@ -391,633 +506,660 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         ), """Output
         shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
 
-    def global_includes(self):
-        pass
+    def prepare_codegen_default(self):
+        # Default implementation style for MMV_out = 1: addressable cyclic buffer
+        # Compute the incremental addressing scheme directly.
+        template_path = (
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_default.sv"
+        )
+        code_gen_dict = {}
 
-    def defines(self, var):
-        pass
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        depthwise = self.get_nodeattr("depthwise")
+        simd = self.get_nodeattr("SIMD")
+        M = self.get_nodeattr("M")
 
-    def read_npy_data(self):
-        pass
+        k_h, k_w = k
+        h, w = ifm_dim
+        pad = [0, 0, 0, 0]  # padding happens in separate padding node for now
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h)
+        out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w)
 
-    def strm_decl(self):
-        pass
+        if self.get_nodeattr("parallel_window"):
+            mmv_in = M * 1
+            mmv_out = M * k_h * k_w
+        else:
+            mmv_in = 1
+            mmv_out = 1
 
-    def docompute(self):
-        pass
+        # compute index/address increments for each nested loop
+        channel_factor = int(ifm_ch / simd)
+
+        # compute minimal buffer length (assuming it holds 1 complete window)
+        buffer_min_size = (
+            (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
+        ) * channel_factor
+
+        # add additional buffer space in case of stride > 1
+        # this minimizes cycle count, as it allows an earlier pre-load of skipped input elements
+        buffer_actual_size = (
+            buffer_min_size
+            + max(
+                0,
+                ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor,
+            )
+            + max(
+                0,
+                ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in)))
+                * channel_factor,
+            )
+        )
+        self.buffer_depth = buffer_actual_size  # for resource estimation
+        code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)]
+
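For intuition, the minimal buffer covers (k_h - 1) dilated input lines plus one dilated kernel row, counted in SIMD-wide words; a small worked example (hypothetical 3x3 kernel, dilation 1, stride 1, IFM width 32, IFMChannels 16, SIMD 4):

    k_h, k_w = 3, 3
    dilation_h = dilation_w = 1
    w = 32
    channel_factor = 16 // 4                       # = 4
    buffer_min_size = ((k_h - 1) * dilation_h * w
                       + (k_w - 1) * dilation_w + 1) * channel_factor
    print(buffer_min_size)                         # (64 + 2 + 1) * 4 = 268 elements
    # with stride 1 no extra space is added, so buffer_actual_size == buffer_min_size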
+        # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation
+        # or cols/rows that are skipped due to imperfect stride<->dim combination
+        kernel_width = (k_w - 1) * dilation_w + 1
+        kernel_height = (k_h - 1) * dilation_h + 1
+        skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w)
+        skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h)
+
+        # compute address increment values for 5-loop nest
+        addr_incr_end_simd = 1
+        addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1
+        addr_incr_end_window_row = (
+            ((w - kernel_width) * channel_factor)  # remaining line
+            + ((dilation_h - 1) * w * channel_factor)  # skip lines
+            + 1  # wrap-around of minimally sized buffer
+        )
+        addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1
+        addr_incr_end_row = (
+            -buffer_min_size
+            + ((skip_columns + kernel_width) * channel_factor)  # remaining line
+            + ((stride_h - 1) * w * channel_factor)  # skip lines
+            + 1
+        )
 
-    def dataoutstrm(self):
-        pass
+        # re-use same controller structure -> re-assign address increments for the depthwise case
+        if depthwise:
+            addr_incr_end_window_elem = dilation_w * channel_factor
+            addr_incr_end_window_row = (
+                channel_factor
+                + (w - kernel_width) * channel_factor
+                + (dilation_h - 1) * w * channel_factor
+            )
+            addr_incr_end_simd = -buffer_min_size + (channel_factor + 1)
+
+        # sanity check
+        assert not (
+            abs(addr_incr_end_window) > buffer_actual_size
+        ), "ERROR: W increment > buffer size, wrap logic doesn't account for this"
+        assert not (
+            abs(addr_incr_end_row) > buffer_actual_size
+        ), "ERROR: H increment > buffer size, wrap logic doesn't account for this"
+
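Continuing the hypothetical 3x3 / stride 1 / dilation 1 / W=32 / channel_factor=4 example from above (non-depthwise, out_dim_w = 30), the five loop increments evaluate to:

    buffer_min_size = 268
    kernel_width = (3 - 1) * 1 + 1                        # 3
    skip_columns = 32 % (kernel_width + (30 - 1) * 1)     # 0
    addr_incr_end_simd = 1
    addr_incr_end_window_elem = (1 - 1) * 4 + 1           # 1
    addr_incr_end_window_row = (32 - 3) * 4 + 0 + 1       # 117 (skip to next kernel row)
    addr_incr_end_window = -268 + 1 * 4 + 1               # -263 (wrap back, advance by stride)
    addr_incr_end_row = -268 + (0 + 3) * 4 + 0 + 1        # -255 (wrap back to next output row)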
+        # set certain threshold indices to detect when reading/writing finishes
+        code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)]
+        code_gen_dict["$LAST_WRITE_ELEM$"] = [
+            str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1)
+        ]
+
+        # default controller loop structure: # iterations (counters) map directly
+        loop_h_iterations = out_dim_h
+        loop_w_iterations = out_dim_w
+        loop_kh_iterations = k_h
+        loop_kw_iterations = k_w
+        loop_simd_iterations = channel_factor
+
+        if depthwise and channel_factor > 1:
+            # re-arrange existing controller loop structure for depthwise convolutions
+            loop_kh_iterations = channel_factor
+            loop_kw_iterations = k_h
+            loop_simd_iterations = k_w
+            addr_incr_end_simd_ = addr_incr_end_simd
+            addr_incr_end_simd = addr_incr_end_window_elem
+            addr_incr_end_window_elem = addr_incr_end_window_row
+            addr_incr_end_window_row = addr_incr_end_simd_
+            elem_per_window = k_h * k_w
+
+            tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor
+            tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor
+            tail_incr_last_window = buffer_min_size - 1
+            code_gen_dict["$TAIL_INCR_GENERATION$"] = [
+                """
+            always @ (counter_loop_kh, counter_loop_w, counter_loop_h) begin
+                        if (counter_loop_kh >= 0)
+                            tail_incr_reg = 1;
+                        else if (counter_loop_w >= 0)
+                            tail_incr_reg = {};
+                        else if (counter_loop_h >= 0)
+                            tail_incr_reg = {};
+                        else
+                            tail_incr_reg = {};
+            end
+            """.format(
+                    tail_incr_w, tail_incr_h, tail_incr_last_window
+                )
+            ]
+        else:
+            # depthwise output format is equivalent to non-depthwise if SIMD=C
+            elem_per_window = k_h * k_w * channel_factor
+
+            tail_incr_w = addr_incr_end_window + buffer_min_size - 1
+            tail_incr_h = addr_incr_end_row + buffer_min_size - 1
+            tail_incr_last_window = buffer_min_size - 1
+            code_gen_dict["$TAIL_INCR_GENERATION$"] = [
+                """
+            always @ (counter_loop_w, counter_loop_h) begin
+                    if (counter_loop_w >= 0)
+                        tail_incr_reg = {};
+                    else if (counter_loop_h >= 0)
+                        tail_incr_reg = {};
+                    else
+                        tail_incr_reg = {};
+            end
+            """.format(
+                    tail_incr_w, tail_incr_h, tail_incr_last_window
+                )
+            ]
+
+        # support SIMD = C and k_w = 1 cases
+        # for k = [k_h, k_w] = [1, k_w], no adjustment is needed
+        # for k = [k_h, k_w] = [1, 1], do not use this impl. style (mmv_out=K=1)
+        # innermost loop is executed at least once -> adjust if needed
+        if loop_simd_iterations == 1:
+            # skip innermost SIMD loop completely
+            if loop_kw_iterations == 1:
+                # skip innermost KW loop completely
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"]
+                loop_kh_iterations -= 1  # -1 because state is initial state
+            else:
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"]
+                loop_kw_iterations -= 1  # -1 because state is initial state
+        else:
+            code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"]
+            loop_simd_iterations -= 1  # -1 because state is initial state
+
+        code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 1)]
+        code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 1)]
+        code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 1)]
+        code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 1)]
+        code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 1)]
+
+        incr_bitwidth = 1 + math.ceil(
+            math.log2(
+                max(
+                    abs(addr_incr_end_simd) + 1,
+                    abs(addr_incr_end_window_elem) + 1,
+                    abs(addr_incr_end_window_row) + 1,
+                    abs(addr_incr_end_window) + 1,
+                    abs(addr_incr_end_row) + 1,
+                    abs(tail_incr_w) + 1,
+                    abs(tail_incr_h) + 1,
+                    abs(tail_incr_last_window) + 1,
+                )
+            )
+        )
+        code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)]
+        code_gen_dict["$ADDR_INCREMENT_MAP$"] = [
+            "'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(
+                incr_bitwidth,
+                int(copysign(incr_bitwidth, addr_incr_end_simd)),
+                abs(addr_incr_end_simd),
+                int(copysign(incr_bitwidth, addr_incr_end_window_elem)),
+                abs(addr_incr_end_window_elem),
+                int(copysign(incr_bitwidth, addr_incr_end_window_row)),
+                abs(addr_incr_end_window_row),
+                int(copysign(incr_bitwidth, addr_incr_end_window)),
+                abs(addr_incr_end_window),
+                int(copysign(incr_bitwidth, addr_incr_end_row)),
+                abs(addr_incr_end_row),
+            )
+        ]
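The increment bit-width is one sign bit plus enough magnitude bits for the largest absolute increment, and copysign() only transfers the sign, so e.g. -263 is rendered as "-10'd263" in the generated map. With the increments of the running example and its tail increments (tail_incr_w = 4, tail_incr_h = 12, tail_incr_last_window = 267):

    import math
    from math import copysign

    increments = [1, 1, 117, -263, -255, 4, 12, 267]
    incr_bitwidth = 1 + math.ceil(math.log2(max(abs(i) + 1 for i in increments)))
    print(incr_bitwidth)                                                   # 1 + 9 = 10
    print("{}'d{}".format(int(copysign(incr_bitwidth, -263)), abs(-263)))  # -10'd263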
 
-    def save_as_npy(self):
-        pass
+        code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)]
+        code_gen_dict["$SIMD$"] = [str(simd)]
+        code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
+        code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]
 
-    def blackboxfunction(self):
-        pass
+        return template_path, code_gen_dict
 
-    def pragmas(self):
-        pass
-        
-    def generate_hdl(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        #f_debug = open(os.path.join(code_gen_dir, "swg_hdl_debuginfo.log"), "w")
+    def prepare_codegen_parallel(self):
+        # Parallel implementation style for MMV_out = K:
+        # mix of shift-registers (for parallel read) and line buffers (BRAM or LUTRAM)
+        # compute a static schedule by analyzing access pattern (from im2col function)
+        template_path = (
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_parallel.sv"
+        )
         code_gen_dict = {}
 
-        ##### BEGIN INITIALIZE/CHECK CONFIGURATION #####
         ifm_ch = self.get_nodeattr("IFMChannels")
         k = self.get_nodeattr("ConvKernelDim")
         ifm_dim = self.get_nodeattr("IFMDim")
-        ofm_dim = self.get_nodeattr("OFMDim")
         stride = self.get_nodeattr("Stride")
         dilation = self.get_nodeattr("Dilation")
-        depthwise = self.get_nodeattr("depthwise")
+        simd = self.get_nodeattr("SIMD")
+        M = self.get_nodeattr("M")
 
-        n = 1
-        h, w = ifm_dim
-        c = 1 # assume SIMD=C (parallelize across all channels)
         k_h, k_w = k
-        pad = [0,0,0,0] # padding happens in separate padding node for now
-        pad_val = 0
+        h, w = ifm_dim
+        n = c = 1  # no need to consider fully-parallel C dimension
+        in_shape = (n, c, h, w)  # NCHW
+        pad = [0, 0, 0, 0]
         stride_h, stride_w = stride
         dilation_h, dilation_w = dilation
-
-        in_shape = (n,c,h,w) #NCHW
-
         in_image = np.empty(in_shape, dtype=int)
         in_image_padded = np.pad(
             in_image,
             ((0, 0), (0, 0), (pad[0], pad[2]), (pad[1], pad[3])),
             mode="constant",
-            constant_values=pad_val,
+            constant_values=0,
         )
         in_shape_padded = in_image_padded.shape
         h_padded = in_shape_padded[2]
         w_padded = in_shape_padded[3]
-
         pad_h = pad[0] + pad[2]
         pad_w = pad[1] + pad[3]
         out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h)
         out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w)
 
-        # init folding config
-        simd = self.get_nodeattr("SIMD")
-        M = self.get_nodeattr("M")
-        if (self.get_nodeattr("parallel_window")):
-            mmv_in = M*1
-            mmv_out = M*k_h*k_w
-            assert ifm_ch==simd, "Constraint violated: SIMD must be equal to C"
+        if self.get_nodeattr("parallel_window"):
+            mmv_in = M * 1
+            mmv_out = M * k_h * k_w
+            assert ifm_ch == simd, "Constraint violated: SIMD must be equal to C"
         else:
             mmv_in = 1
             mmv_out = 1
-            assert ifm_ch%simd==0, "Constraint violated: SIMD must divide C"
+            assert ifm_ch % simd == 0, "Constraint violated: SIMD must divide C"
 
-        # TODO: check allowed hyperparams
-        # for 1D case: it does not matter if dummy dim is x or y
-        # TODO: move/duplicate these checks in corresponding convert_to_hls transformation (?)
-
-        # choose implementation style
-        if (mmv_out > 1 or (k_h==1 and k_w==1)):
-            impl_style = "parallel"
-        else:
-            impl_style = "default"
-
-        ##### END INITIALIZE/CHECK CONFIGURATION #####
-
-        ##### BEGIN CODE GEN FOR DEFAULT STYLE #####
-        if (impl_style == "default"):
-            # Default implementation style for MMV_out = 1: addressable cyclic buffer
-            # Computing incremental addressing scheme directly..
-
-            # compute index/address increments for each nested loop
-            channel_factor = int(ifm_ch/simd)
-
-            # compute minimal buffer length (assuming it holds 1 complete window)
-            buffer_min_size = ((k_h-1) * dilation_h * w + (k_w-1) * dilation_w + 1) * channel_factor
-
-            kernel_width = (k_w-1)*dilation_w+1 # incl. dilation
-            addr_incr_end_simd = 1
-            addr_incr_end_window_elem = (dilation_w-1) * channel_factor + 1
-            
-            remaining_line = (w - kernel_width) * channel_factor
-            skip_lines = (dilation_h-1) * w * channel_factor
-            addr_incr_end_window_row = remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer
-            
-            addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 # 1 = wrap around of minimally sized buffer
-
-            # rows that are skipped due to imperfect stride<->W combination
-            skip_columns = w%(kernel_width + (out_dim_w-1)*stride_w)
-            remaining_line = (skip_columns + kernel_width) * channel_factor # increment from oldest buffer position (top left) to end of line
-            skip_lines = (stride_h-1) * w * channel_factor
-            addr_incr_end_row = -buffer_min_size + remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer
-
-            if (depthwise):
-                addr_incr_end_window_elem = dilation_w * channel_factor
-                addr_incr_end_window_row = (channel_factor 
-                                            + (w - kernel_width) * channel_factor
-                                            + (dilation_h-1) * w * channel_factor
-                                           )
-                addr_incr_end_simd = -buffer_min_size + (channel_factor + 1)
-
-            # add additional buffer space in case of stride > 1
-            # this minimizes cycle count, as it allows an earlier pre-load of skipped input elements
-            buffer_actual_size = (buffer_min_size + max(0,((stride_w-1)   - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor)
-                                                  + max(0,((stride_h-1)*w - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor))
-            code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)]
-
-            assert not(abs(addr_incr_end_window) > buffer_actual_size), "ERROR: W increment > buffer size, wrap logic doesn't account for this"
-            assert not(abs(addr_incr_end_row) > buffer_actual_size), "ERROR: H increment > buffer size, wrap logic doesn't account for this"
-
-            kernel_width = (k_w-1)*dilation_w+1 # incl. dilation
-            kernel_height = (k_h-1)*dilation_h+1 # incl. dilation
-            skip_columns = w%(kernel_width + (out_dim_w-1)*stride_w)
-            skip_rows = h%(kernel_height + (out_dim_h-1)*stride_h)
-            code_gen_dict["$LAST_READ_ELEM$"] = [str(h*w*channel_factor-1)]
-            code_gen_dict["$LAST_WRITE_ELEM$"] = [str(((h - skip_rows - 1) * w + (w - skip_columns))*channel_factor -1)]
-
-            loop_h_iterations = out_dim_h
-            loop_w_iterations = out_dim_w
-            loop_kh_iterations = k_h
-            loop_kw_iterations = k_w
-            loop_simd_iterations = channel_factor
-
-            if (depthwise and channel_factor > 1):
-                # re-arrange existing controller loop structure for depthwise convolutions
-                loop_kh_iterations = channel_factor
-                loop_kw_iterations = k_h
-                loop_simd_iterations = k_w
-                addr_incr_end_simd_ = addr_incr_end_simd
-                addr_incr_end_simd = addr_incr_end_window_elem
-                addr_incr_end_window_elem = addr_incr_end_window_row
-                addr_incr_end_window_row = addr_incr_end_simd_
-                elem_per_window = k_h*k_w         
-
-                tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor     
-                tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor
-                tail_incr_last_window = buffer_min_size-1                                                
-                code_gen_dict["$TAIL_INCR_GENERATION$"] = ["""
-                always @ (counter_loop_kh, counter_loop_w, counter_loop_h) begin
-                         if (counter_loop_kh >= 0)
-                             tail_incr_reg = 1;
-                         else if (counter_loop_w >= 0)
-                             tail_incr_reg = {};
-                         else if (counter_loop_h >= 0)
-                             tail_incr_reg = {};
-                         else
-                             tail_incr_reg = {};
-                end
-                """.format(tail_incr_w, tail_incr_h, tail_incr_last_window)]
-            else:
-                # depthwise output format is equivalent to non-depthwise if SIMD=C
-                elem_per_window = k_h*k_w*channel_factor
-
-                tail_incr_w = addr_incr_end_window + buffer_min_size - 1     
-                tail_incr_h = addr_incr_end_row + buffer_min_size - 1
-                tail_incr_last_window = buffer_min_size-1
-                code_gen_dict["$TAIL_INCR_GENERATION$"] = ["""
-                always @ (counter_loop_w, counter_loop_h) begin
-                        if (counter_loop_w >= 0)
-                            tail_incr_reg = {};
-                        else if (counter_loop_h >= 0)
-                            tail_incr_reg = {};
-                        else
-                            tail_incr_reg = {};
-                end
-                """.format(tail_incr_w, tail_incr_h, tail_incr_last_window)]
-
-            # support SIMD = C and k_w = 1 cases
-            # for k = [k_h, k_w] = [1, k_w], no adjustment is needed
-            # for k = [k_h, k_w] = [1, 1], do not use this impl. style (mmv_out=K=1)
-            # innermost loop is executed at least once -> adjust if needed
-            if (loop_simd_iterations == 1):
-                # skip innermost SIMD loop completely
-                if (loop_kw_iterations == 1):
-                    # skip innermost KW loop completely
-                    code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_KH"]
-                    loop_kh_iterations -= 1  # -1 because state is initial state
-                else:
-                    code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_KW"]
-                    loop_kw_iterations -= 1 # -1 because state is initial state
-            else:
-                code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_SIMD"]
-                loop_simd_iterations -= 1 # -1 because state is initial state
-            
-            code_gen_dict["$LOOP_H_ITERATIONS$"]=[str(loop_h_iterations-1)]
-            code_gen_dict["$LOOP_W_ITERATIONS$"]=[str(loop_w_iterations-1)]
-            code_gen_dict["$LOOP_KH_ITERATIONS$"]=[str(loop_kh_iterations-1)]
-            code_gen_dict["$LOOP_KW_ITERATIONS$"]=[str(loop_kw_iterations-1)]
-            code_gen_dict["$LOOP_SIMD_ITERATIONS$"]=[str(loop_simd_iterations-1)]
-
-            incr_bitwidth = 1 + math.ceil(math.log2(max(abs(addr_incr_end_simd)+1, 
-                                                        abs(addr_incr_end_window_elem)+1, 
-                                                        abs(addr_incr_end_window_row)+1, 
-                                                        abs(addr_incr_end_window)+1, 
-                                                        abs(addr_incr_end_row)+1, 
-                                                        abs(tail_incr_w)+1, 
-                                                        abs(tail_incr_h)+1,
-                                                        abs(tail_incr_last_window)+1)))
-            code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)]
-            code_gen_dict["$ADDR_INCREMENT_MAP$"]=["'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(incr_bitwidth, 
-                                                int(copysign(incr_bitwidth,addr_incr_end_simd)),abs(addr_incr_end_simd),
-                                                int(copysign(incr_bitwidth,addr_incr_end_window_elem)),abs(addr_incr_end_window_elem),
-                                                int(copysign(incr_bitwidth,addr_incr_end_window_row)),abs(addr_incr_end_window_row),
-                                                int(copysign(incr_bitwidth,addr_incr_end_window)),abs(addr_incr_end_window),
-                                                int(copysign(incr_bitwidth,addr_incr_end_row)),abs(addr_incr_end_row))]
-
-            code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)]
-
-            with open(os.environ['FINN_ROOT']+"/finn-rtllib/swg/swg_template_default.sv", "r") as f:
-                template = f.read()
-       
-        ##### END CODE GEN FOR DEFAULT STYLE #####
-    
-        ##### BEGIN CODE GEN FOR PARALLEL STYLE #####
-        elif (impl_style == "parallel"):
-            # Out width > In width: Parallel implementation style using registers + line buffers
-            idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw(
-            in_shape,
-            k_h,
-            k_w,
-            pad,
-            stride_h,
-            stride_w,
-            dilation_h,
-            dilation_w
-            )
+        # Out width > In width: Parallel implementation style using registers + line buffers
+        idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw(
+            in_shape, k_h, k_w, pad, stride_h, stride_w, dilation_h, dilation_w
+        )
 
-            cols = in_image_padded[:, idx_c, idx_h, idx_w]
-            cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1)
-
-            # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW
-            out_image = cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w)
-            # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1)
-            out_image = out_image.transpose(0, 4, 5, 2, 3, 1)
-            out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c)
-
-            idx_px = idx_h*w+idx_w # sequential pixel indices
-    
-            k, cycles = idx_px.shape
-
-            output_elements = mmv_out
-            output_cycles = int(cycles/(mmv_out/k))
-
-            # TODO: what happens when output_cycles=OFMdim % M != 0
-            # ...try to support IFMdim % M != 0 first, so we can work with the usual k=3 where OFMdim = IFMdim - -2
-            # the additional garbage input elements that are read in the last cycle are not read by any window anyway
-            idx_px = idx_px.transpose()
-            idx_px = idx_px.reshape(output_cycles, output_elements)
-            idx_px = idx_px.transpose()
-            # result: first dim is number of parallel output elements, 
-            # second dim is the input element (pixel in case of SIMD=C) index that each output element outputs per cycle
-
-            buffer = []
-            buffer_max_size = 0
-            schedule = []
-            next_in_px = 0
-            oldest_px = 0
-
-            def schedule_append(schedule, op):
-                if len(schedule) > 0 and schedule[-1][1] == op:
-                    count, op_ = schedule[-1]
-                    schedule[-1] = (count+1, op_)
-                else:
-                    schedule.append((1, op))
-                return schedule
-            
-            # compute schedule and buffer read pattern (output driven)
-            idx_px_relative = idx_px.copy()
-            output_elem, output_cycles = idx_px_relative.shape
-            
-            for x in range(output_cycles):
-                # load missing inputs into buffer
-                for y in range(output_elem):
-                    while int(idx_px_relative[y,x]) not in buffer:
-                        # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later)
-                        for m in range(M):
-                            buffer.append(next_in_px)
-                            next_in_px += 1
-                        schedule = schedule_append(schedule,'w')
-                
-                # discard unused buffer elements
-                oldest_px = np.min(idx_px_relative[:,x:])
-                #check whether M elements can be shifted out, not just the single oldest one
-                #while all([buffer[i] < oldest_px for i in range(M)]):
-                if all([buffer[i] < oldest_px for i in range(M)]):
-                    # M buffer elements are shifted out at once
-                    for m in range(M):
-                        buffer.pop(0)
-        
-                # adjust relative buffer index of current x (according to last discarded buffer elements)
-                for y in range(output_elem):
-                    idx_px_relative[y,x] -= oldest_px
-                
-                # read from buffer    
-                # + simultaneously load next pixel(s) into buffer if there are any left
-                if (next_in_px > (h_padded*w_padded-1)):
-                    # read only (append above)
-                    schedule = schedule_append(schedule,'r')
-                else:
-                    # load M inputs at once
+        cols = in_image_padded[:, idx_c, idx_h, idx_w]
+        cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1)
+        # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW
+        out_image = cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w)
+        # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1)
+        out_image = out_image.transpose(0, 4, 5, 2, 3, 1)
+        out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c)
+        idx_px = idx_h * w + idx_w  # sequential pixel indices
+        k, cycles = idx_px.shape
+        output_elements = mmv_out
+        output_cycles = int(cycles / (mmv_out / k))
+
+        idx_px = idx_px.transpose()
+        idx_px = idx_px.reshape(output_cycles, output_elements)
+        idx_px = idx_px.transpose()
+        # result: first dim is the number of parallel output elements,
+        # second dim is the input element (pixel in case of SIMD=C) index that each output element reads per cycle
+
+        buffer = []
+        buffer_max_size = 0
+        schedule = []
+        next_in_px = 0
+        oldest_px = 0
+
+        # compute schedule and buffer read pattern (output driven)
+        idx_px_relative = idx_px.copy()
+        output_elem, output_cycles = idx_px_relative.shape
+
+        for x in range(output_cycles):
+            # load missing inputs into buffer
+            for y in range(output_elem):
+                while int(idx_px_relative[y, x]) >= next_in_px:
+                    # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later)
                     for m in range(M):
                         buffer.append(next_in_px)
                         next_in_px += 1
-                    schedule = schedule_append(schedule,'wr')
-
-                # record max needed buffer depth
-                if len(buffer) > buffer_max_size:
-                    buffer_max_size = len(buffer)
-
-            # insert dummy write operations for data at the input FM tail-end that is never read (e.g. in case of stride > 1)
-            while next_in_px <= (h_padded*w_padded-1):
-                next_in_px += 1
-                schedule = schedule_append(schedule,'w')
-
-            # find buffer access patterns
-            buffer_access_patterns = []
-            for x in range(output_cycles):
-                if idx_px_relative[:,x].tolist() not in buffer_access_patterns:
-                    buffer_access_patterns.append(idx_px_relative[:,x].tolist())
-
-            # Experimental implementation to map fixed controller loop structure to R/W schedule by analyzing
-            # the access pattern given by Im2Col, rather than direct computation.
-            # TODO: Probably replace this with a directly-computed schedule, similar to the default implementation style.
-            def compact_schedule(schedule):
-                # leave first sequence (pre-load) as is
-                start_sequence = schedule[0]
-                loop_sequence_1_counter = 1
-                loop_sequence_1 = schedule[1]
-                loop_counter = 0
-                loop_sequence_2 = None
-                end_sequence = None
-
-                i = 2
-                if i < len(schedule):
-                    loop_sequence_1 += schedule[i]
-                    i += 1
-                while i+1 < len(schedule):
-                    candidate = schedule[i] + schedule[i+1]
-                    if candidate == loop_sequence_1:
-                        loop_sequence_1_counter += 1
-                        i += 2
-                    else:
-                        break
-
-                if i < len(schedule):
-                    loop_sequence_2 = schedule[i]
-                    i += 1
-                if i+1 < len(schedule):
-                    candidate = schedule[i] + schedule[i+1]
-                    if candidate != loop_sequence_1:
-                        loop_sequence_2 += schedule[i]
-                    i -= 1
-                    loop_sequence_total_len = (int(len(loop_sequence_2)/2)) + loop_sequence_1_counter*(int(len(loop_sequence_1)/2))
-                    loop_sequence_total = loop_sequence_2 + loop_sequence_1_counter*loop_sequence_1
-                    while i+loop_sequence_total_len < len(schedule):
-                        candidate = schedule[i] 
-                        for x in range (i+1, i+loop_sequence_total_len):
-                            candidate += schedule[x]
-
-                        if candidate == loop_sequence_total:
-                            loop_counter += 1
-                            i += loop_sequence_total_len
-                        else:
-                            break
-                else:
-                    if i < len(schedule):
-                        end_sequence = loop_sequence_2 + schedule[i]
-                        i += 1
-                        loop_sequence_2 = None
-                    else:
-                        end_sequence = loop_sequence_2
-                        loop_sequence_2 = None
-
-                if i < len(schedule):
-                    end_sequence = schedule[i]
-                    i += 1
-                if i < len(schedule):
-                    end_sequence = end_sequence + schedule[i]
-                    i += 1
-
-                assert len(start_sequence) == 1*2, "ERROR: invalid start sequence"
-                assert len(loop_sequence_1) == 2*2, "ERROR: invalid loop 1 sequence"
-                if loop_sequence_2:
-                    assert len(loop_sequence_2) <= 2*2, "ERROR: invalid loop 2 sequence"
-                if end_sequence:
-                    assert len(end_sequence) <= 2*2, "ERROR: invalid end sequence"
-                assert i == len(schedule), "ERROR: schedule could not be compacted %d / %d" %(i, len(schedule))
-
-                return (start_sequence, loop_counter, loop_sequence_1_counter,
-                        loop_sequence_1, loop_sequence_2, end_sequence)
-
-            ### determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers)
-            # TODO: this part doesn't fully account for M for 2D buffers yet
-
-            # how many "unused" registers are allowed between buffer positions that will be accessed in parallel
-            # example:
-            # 0: only consecutive access patterns will be implemented in regs, rest in (LUTRAM/BRAM) line buffers
-            # 2: [0, 3, 6] access pattern is still allowed and will be implemented with one 7-position shift reg
-            REG_BRAM_THRESHOLD = 8
-
-            code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)]
-
-            assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static"
-            buf_static_access_pattern = buffer_access_patterns[0]
-            reg_fifos = []
-            reg_fifos_depth = []
-            bram_fifos = []
-            bram_fifos_depth = []
-            current = []
-            for i in range(len(buf_static_access_pattern)):
-                access_idx = buf_static_access_pattern[i]
-                if len(current) == 0:
+                    schedule = schedule_append(schedule, "w")
+
+            # discard unused buffer elements
+            # FIXME: this is very slow for large feature maps (e.g., 4096x4096)
+            oldest_px = np.min(idx_px_relative[:, x:])
+            # check whether M elements can be shifted out, not just the single oldest one
+            # while all([buffer[i] < oldest_px for i in range(M)]):
+            if all([buffer[i] < oldest_px for i in range(M)]):
+                # M buffer elements are shifted out at once
+                for m in range(M):
+                    buffer.pop(0)
+
+            # adjust relative buffer index of current x (according to last discarded buffer elements)
+            for y in range(output_elem):
+                idx_px_relative[y, x] -= oldest_px
+
+            # read from buffer
+            # + simultaneously load next pixel(s) into buffer if there are any left
+            if next_in_px > (h_padded * w_padded - 1):
+                # no input pixels left to load: read only
+                schedule = schedule_append(schedule, "r")
+            else:
+                # load M inputs at once
+                for m in range(M):
+                    buffer.append(next_in_px)
+                    next_in_px += 1
+                schedule = schedule_append(schedule, "wr")
+
+            # record max needed buffer depth
+            if len(buffer) > buffer_max_size:
+                buffer_max_size = len(buffer)
+
+        # insert dummy write operations for data at the input FM tail-end that is never read (e.g. in case of stride > 1)
+        while next_in_px <= (h_padded * w_padded - 1):
+            next_in_px += 1
+            schedule = schedule_append(schedule, "w")
+
+        # add 1 extra cycle after final READ+WRITE cycle for the transition between feature maps
+        if schedule[-1][1] == "wr":
+            schedule = schedule_append(schedule, "n")
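+        # Illustrative note: the schedule is kept in run-length-encoded form as
+        # (repetitions, command) tuples, where the command is one of "w" (write
+        # only), "r" (read only), "wr" (read+write) or "n" (idle). A hypothetical
+        # schedule could look like [(2, "w"), (30, "wr"), (1, "r"), (1, "n")]:
+        # 2 pre-load writes, 30 combined read/write cycles, one final read and
+        # one idle transition cycle.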
+
+        # find buffer access patterns
+        buffer_access_patterns = []
+        for x in range(output_cycles):
+            if idx_px_relative[:, x].tolist() not in buffer_access_patterns:
+                buffer_access_patterns.append(idx_px_relative[:, x].tolist())
+
+        ### determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers)
+        # TODO: this part doesn't fully account for M>1 for 2D buffers yet
+        # how many "unused" registers are allowed between buffer positions that will be accessed in parallel
+        # example:
+        # 0: only consecutive access patterns will be implemented in regs, rest in (LUTRAM/BRAM) line buffers
+        # 2: [0, 3, 6] access pattern is still allowed and will be implemented with one 7-position shift reg
+        REG_BRAM_THRESHOLD = 8
+
+        code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)]
+        self.buffer_depth = buffer_max_size  # for resource estimation
+
+        assert (
+            len(buffer_access_patterns) == 1
+        ), "ERROR: Buffer access pattern is not static"
+        buf_static_access_pattern = buffer_access_patterns[0]
+        reg_fifos = []
+        reg_fifos_depth = []
+        bram_fifos = []
+        bram_fifos_depth = []
+        current = []
+        for i in range(len(buf_static_access_pattern)):
+            access_idx = buf_static_access_pattern[i]
+            if len(current) == 0:
+                current.append(access_idx)
+            else:
+                # assume non-decreasing index order in access pattern
+                # TODO: this assumption does not hold for M>1 for the 2D case
+                distance = access_idx - max(current)
+                if not (distance - 1 > REG_BRAM_THRESHOLD):
+                    for i in range(distance - 1):
+                        # insert dummy into REG FIFO (not read as part of window)
+                        current.append(-1)
+                    # assign this access to same REG FIFO as previous one
                     current.append(access_idx)
                 else:
-                    # assume non-decreasing index order in access pattern
-                    # TODO: this assumption does not hold for M>1 case (2D buffer)
-                    distance = access_idx - max(current)
-                    if not (distance-1 > REG_BRAM_THRESHOLD):
-                        for i in range(distance-1):
-                            # insert dummy into REG FIFO (not read as part of window)
-                            current.append(-1)
-                        # assign this access to same REG FIFO as previous one
-                        current.append(access_idx)
-                    else:
-                        # assign skipped accesses to new BRAM FIFO
-                        bram_fifos.append([-1]*(distance-1))
-                        bram_fifos_depth.append(math.ceil((distance-1)/M)) # really ceil?
-                        # start with new REG FIFO
-                        reg_fifos.append(current)
-                        #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # fix for M again
-                        reg_fifos_depth.append(len(current))
-                        current = []
-                        current.append(access_idx)
-            reg_fifos.append(current)
-            #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # fix for M again
-            reg_fifos_depth.append(len(current))
-
-            code_gen_dict["$GENERATE_REG_FIFOS$"] = []
-            for i in range(len(reg_fifos)):
-                code_gen_dict["$GENERATE_REG_FIFOS$"].append(
-                    """
-                    wire [IN_WIDTH-1:0] reg_fifo_{id}_in;
-                    wire [IN_WIDTH-1:0] reg_fifo_{id}_out;
-                    wire [IN_WIDTH*{len}-1:0] reg_fifo_{id};
-                    {name}_reg_buffer
-                    #(
-                    .WIDTH(IN_WIDTH),
-                    .DEPTH({len})
+                    # assign skipped accesses to new BRAM FIFO
+                    bram_fifos.append([-1] * (distance - 1))
+                    bram_fifos_depth.append(
+                        math.ceil((distance - 1) / M)
+                    )  # really ceil?
+                    # start with new REG FIFO
+                    reg_fifos.append(current)
+                    # reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # allows for MMV in the 1D case
+                    reg_fifos_depth.append(len(current))
+                    current = []
+                    current.append(access_idx)
+        reg_fifos.append(current)
+        # reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # allows for MMV in the 1D case
+        reg_fifos_depth.append(len(current))
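+        # Worked example (hypothetical access pattern, REG_BRAM_THRESHOLD = 8):
+        # buf_static_access_pattern = [0, 1, 2, 12, 13, 14, 24, 25, 26] has two
+        # gaps of 9 unused positions (> threshold), so the loop above yields
+        #   reg_fifos        = [[0, 1, 2], [12, 13, 14], [24, 25, 26]]
+        #   reg_fifos_depth  = [3, 3, 3]
+        #   bram_fifos_depth = [ceil(9/M), ceil(9/M)]  (i.e. [9, 9] for M = 1)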
+
+        code_gen_dict["$GENERATE_REG_FIFOS$"] = []
+        for i in range(len(reg_fifos)):
+            code_gen_dict["$GENERATE_REG_FIFOS$"].append(
+                """
+                wire [IN_WIDTH-1:0] reg_fifo_{id}_in;
+                wire [IN_WIDTH-1:0] reg_fifo_{id}_out;
+                wire [IN_WIDTH*{len}-1:0] reg_fifo_{id};
+                {name}_reg_buffer
+                #(
+                .WIDTH(IN_WIDTH),
+                .DEPTH({len})
+                )
+                reg_buffer_inst_{id}
+                (
+                    .CLK(CLK),
+                    .shift_enable(shift_enable),
+                    .shift_in(reg_fifo_{id}_in),
+                    .shift_out(reg_fifo_{id}_out),
+                    .data_out(reg_fifo_{id})
+                );""".format(
+                    name=self.get_verilog_top_module_name(),
+                    id=i,
+                    len=reg_fifos_depth[i],
+                )
+            )
+
+        code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
+        for i in range(len(bram_fifos)):
+            code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
+                """
+                wire [IN_WIDTH-1:0] bram_fifo_{id}_in;
+                wire [IN_WIDTH-1:0] bram_fifo_{id}_out;
+                {name}_ram_buffer
+                #(
+                .WIDTH(IN_WIDTH),
+                .DEPTH({len})
+                )
+                ram_buffer_inst_{id}
+                (
+                    .CLK(CLK),
+                    .RST(RST),
+                    .shift_enable(shift_enable),
+                    .shift_in(bram_fifo_{id}_in),
+                    .shift_out(bram_fifo_{id}_out)
+                );""".format(
+                    name=self.get_verilog_top_module_name(),
+                    id=i,
+                    len=bram_fifos_depth[i],
+                )
+            )
+
+        code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
+        out_idx = mmv_out - 1
+        for fifo_id, reg_fifo in enumerate(reg_fifos):
+            for fifo_idx, access_idx in enumerate(reg_fifo):
+                if access_idx != -1:
+                    code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
+                        "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format(
+                            out_idx=out_idx,
+                            fifo_id=fifo_id,
+                            access_idx=reg_fifos_depth[fifo_id]
+                            - 1
+                            - int((max(reg_fifo) - access_idx) / M),
+                            mmv_idx=(max(reg_fifo) - access_idx) % M,
+                            mmv=M,
+                        )
                     )
-                    reg_buffer_inst_{id}
-                    (
-                        .CLK(CLK),
-                        .shift_enable(shift_enable),
-                        .shift_in(reg_fifo_{id}_in),
-                        .shift_out(reg_fifo_{id}_out),
-                        .data_out(reg_fifo_{id})
-                    );""".format(name=self.get_verilog_top_module_name(), id=i, len=reg_fifos_depth[i]))
-
-            code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
-            for i in range(len(bram_fifos)):
-                code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
-                    """
-                    wire [IN_WIDTH-1:0] bram_fifo_{id}_in;
-                    wire [IN_WIDTH-1:0] bram_fifo_{id}_out;
-                    {name}_ram_buffer
-                    #(
-                    .WIDTH(IN_WIDTH),
-                    .DEPTH({len})
+                    # reversal: out_idx=0 -> oldest buffer element -> highest access_idx
+                    out_idx = out_idx - 1
+        assert out_idx == -1, "ERROR: Not all output vector elements connected"
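+        # Worked example of the mapping arithmetic above (hypothetical, M = 1):
+        # for reg_fifo = [5, -1, -1, 8] (depth 4, max(reg_fifo) = 8), access_idx 8
+        # maps to register position 4-1-(8-8) = 3 and access_idx 5 maps to
+        # position 4-1-(8-5) = 0, each with mmv_idx = 0; the dummy (-1) positions
+        # are never connected to data_out.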
+
+        code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = []
+        for i in range(len(reg_fifos)):
+            if i == 0:
+                # first FIFO containing newest elements -> input comes from input reg
+                code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
+                    """assign reg_fifo_{fifo_id}_in = reg_input;""".format(
+                        fifo_id=i,
                     )
-                    ram_buffer_inst_{id}
-                    (
-                        .CLK(CLK),
-                        .RST(RST),
-                        .shift_enable(shift_enable),
-                        .shift_in(bram_fifo_{id}_in),
-                        .shift_out(bram_fifo_{id}_out)
-                    );""".format(name=self.get_verilog_top_module_name(), id=i, len=bram_fifos_depth[i]))
-
-            code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
-            out_idx = mmv_out-1
-            for fifo_id, reg_fifo in enumerate(reg_fifos):
-                for fifo_idx, access_idx in enumerate(reg_fifo):
-                    if(access_idx != -1):
-                        #code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
-                        #    "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format(
-                        #        out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx
-                        #    )
-                        #)
-                        code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
-                            "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format(
-                                out_idx=out_idx, fifo_id=fifo_id, 
-                                access_idx=reg_fifos_depth[fifo_id]-1-int((max(reg_fifo)-access_idx)/M), 
-                                mmv_idx=(max(reg_fifo)-access_idx)%M,
-                                mmv = M
-                            )
-                        )
-                        # reversal: out_idx=0 -> oldest buffer element -> highest access_idx
-                        out_idx = out_idx-1
-            assert out_idx==-1, "ERROR: Not all output vector elements connected"
-
-            code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = []
-            for i in range(len(reg_fifos)):
-                if i == 0:
-                    # first FIFO containing newest elements -> input comes from input reg
-                    code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
-                        """assign reg_fifo_{fifo_id}_in = reg_input;""".format(fifo_id=i,))
-                else:
-                    # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer)
-                    input_fifo_id = i-1
-                    code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
-                        """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
-            for i in range(len(bram_fifos)):
-                input_fifo_id = i
+                )
+            else:
+                # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer)
+                input_fifo_id = i - 1
                 code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
-                    """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
-
-            def convert_tuple(seq):
-                mapping = {'w': ("1'b1", "1'b0"),
-                            'r': ("1'b0", "1'b1"),
-                            'wr':("1'b1", "1'b1"),
-                            'n': ("1'b0", "1'b0")}
-                if seq:
-                    if len(seq) == 2:
-                        return (seq[0], mapping[seq[1]], 0, mapping['n'])
-                    if len(seq) == 4:
-                        return (seq[0], mapping[seq[1]], seq[2], mapping[seq[3]])
-                else:
-                    return (0, mapping['n'], 0, mapping['n'])
+                    """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;""".format(
+                        fifo_id=i, input_fifo_id=input_fifo_id
+                    )
+                )
+        for i in range(len(bram_fifos)):
+            input_fifo_id = i
+            code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
+                """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;""".format(
+                    fifo_id=i, input_fifo_id=input_fifo_id
+                )
+            )
 
-            start_sequence,loop_counter,loop_sequence_1_counter,loop_sequence_1,loop_sequence_2,end_sequence = compact_schedule(schedule)
+        (
+            start_sequence,
+            loop_counter,
+            loop_sequence_1_counter,
+            loop_sequence_1,
+            loop_sequence_2,
+            end_sequence,
+        ) = schedule_map_controller(schedule)
+
+        start_sequence = schedule_map_cmds(start_sequence)
+        loop_sequence_1 = schedule_map_cmds(loop_sequence_1)
+        loop_sequence_2 = schedule_map_cmds(loop_sequence_2)
+        end_sequence = schedule_map_cmds(end_sequence)
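+        # Note: schedule_map_cmds is assumed to play the same role as the former
+        # convert_tuple helper, i.e. translate the "w"/"r"/"wr"/"n" commands of a
+        # (compacted) sequence into pairs of 1-bit read/write command literals and
+        # pad a missing second half of the sequence with idle ("n") commands.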
+
+        cycles_total = 0
+        for t in schedule:
+            cycles_total += t[0]
+        # add extra cycle if schedule ends on READ
+        if schedule[-1][1] == "r":
+            cycles_total += 1
+        code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)]
+
+        code_gen_dict["$START_COUNTER$"] = [str(start_sequence[0])]
+        code_gen_dict["$LOOP_MAIN_COUNTER$"] = [str(loop_sequence_1_counter)]
+        code_gen_dict["$LOOP_INTER_COUNTER$"] = [str(loop_counter)]
+
+        code_gen_dict["$LOOP_MAIN_1_COUNTER$"] = [str(loop_sequence_1[0])]
+        code_gen_dict["$LOOP_MAIN_2_COUNTER$"] = [str(loop_sequence_1[2])]
+
+        code_gen_dict["$LOOP_INTER_1_COUNTER$"] = [str(loop_sequence_2[0])]
+        code_gen_dict["$LOOP_INTER_2_COUNTER$"] = [str(loop_sequence_2[2])]
+
+        code_gen_dict["$LOOP_END_1_COUNTER$"] = [str(end_sequence[0])]
+        code_gen_dict["$LOOP_END_2_COUNTER$"] = [str(end_sequence[2])]
+
+        code_gen_dict["$READ_CMD_MAP$"] = [
+            "{{ {}, {}, {}, {}, {}, {}, {} }}".format(
+                start_sequence[1][0],
+                loop_sequence_1[1][0],
+                loop_sequence_1[3][0],
+                loop_sequence_2[1][0],
+                loop_sequence_2[3][0],
+                end_sequence[1][0],
+                end_sequence[3][0],
+            )
+        ]
+        code_gen_dict["$WRITE_CMD_MAP$"] = [
+            "{{ {}, {}, {}, {}, {}, {}, {} }}".format(
+                start_sequence[1][1],
+                loop_sequence_1[1][1],
+                loop_sequence_1[3][1],
+                loop_sequence_2[1][1],
+                loop_sequence_2[3][1],
+                end_sequence[1][1],
+                end_sequence[3][1],
+            )
+        ]
 
-            start_sequence = convert_tuple(start_sequence)
-            loop_sequence_1 = convert_tuple(loop_sequence_1)
-            loop_sequence_2 = convert_tuple(loop_sequence_2)
-            end_sequence = convert_tuple(end_sequence)
+        code_gen_dict["$SIMD$"] = [str(simd)]
+        code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
+        code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]
 
-            cycles_total = 0
-            for t in schedule:
-                cycles_total += t[0]
-            code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)]
+        return template_path, code_gen_dict
 
-            code_gen_dict["$START_COUNTER$"]=[str(start_sequence[0])]
-            code_gen_dict["$LOOP_MAIN_COUNTER$"]=[str(loop_sequence_1_counter)]
-            code_gen_dict["$LOOP_INTER_COUNTER$"]=[str(loop_counter)]
+    def select_impl_style(self):
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        simd = self.get_nodeattr("SIMD")
+        M = self.get_nodeattr("M")
 
-            code_gen_dict["$LOOP_MAIN_1_COUNTER$"]=[str(loop_sequence_1[0])]
-            code_gen_dict["$LOOP_MAIN_2_COUNTER$"]=[str(loop_sequence_1[2])]
+        k_h, k_w = k
+        # init folding config
+        if self.get_nodeattr("parallel_window"):
+            mmv_in = M * 1
+            mmv_out = M * k_h * k_w
+            assert ifm_ch == simd, "Constraint violated: SIMD must be equal to C"
+        else:
+            mmv_in = 1
+            mmv_out = 1
+            assert ifm_ch % simd == 0, "Constraint violated: SIMD must divide C"
 
-            code_gen_dict["$LOOP_INTER_1_COUNTER$"]=[str(loop_sequence_2[0])]
-            code_gen_dict["$LOOP_INTER_2_COUNTER$"]=[str(loop_sequence_2[2])]
+        # TODO: check allowed hyperparams
+        # for 1D case: it does not matter if dummy dim is x or y
+        # TODO: move/duplicate these checks to the corresponding convert_to_hls transformation (?)
 
-            code_gen_dict["$LOOP_END_1_COUNTER$"]=[str(end_sequence[0])]
-            code_gen_dict["$LOOP_END_2_COUNTER$"]=[str(end_sequence[2])]
+        # choose implementation style
+        if mmv_out > 1 or (k_h == 1 and k_w == 1):
+            impl_style = "parallel"
+            assert ifm_ch == simd, "Constraint violated: SIMD must be equal to C"
+        else:
+            impl_style = "default"
 
-            code_gen_dict["$READ_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format(
-                start_sequence[1][0],loop_sequence_1[1][0],loop_sequence_1[3][0],loop_sequence_2[1][0],loop_sequence_2[3][0],end_sequence[1][0],end_sequence[3][0])]
-            code_gen_dict["$WRITE_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format(
-                start_sequence[1][1],loop_sequence_1[1][1],loop_sequence_1[3][1],loop_sequence_2[1][1],loop_sequence_2[3][1],end_sequence[1][1],end_sequence[3][1])]
+        return impl_style
 
-            with open(os.environ['FINN_ROOT']+"/finn-rtllib/swg/swg_template_parallel.sv", "r") as f:
-                template = f.read()
+    def generate_hdl(self):
+        impl_style = self.select_impl_style()
 
-        ##### END CODE GEN FOR PARALLEL STYLE #####
+        # prepare code generation by filling out dictionaries
+        if impl_style == "default":
+            template_path, code_gen_dict = self.prepare_codegen_default()
+        elif impl_style == "parallel":
+            template_path, code_gen_dict = self.prepare_codegen_parallel()
 
-        ##### BEGIN GENERAL CODE GEN #####
+        # add general parameters to dictionary
         code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
-        # save top module name so we can refer to it even after this node has been renamed 
+        # save top module name so we can refer to it even after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
         code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())]
-        code_gen_dict["$SIMD$"] = [str(simd)]
-        code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
-        code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]
-        
         ram_style = self.get_nodeattr("ram_style")
         if ram_style == "auto":
-            code_gen_dict["$RAM_STYLE$"]=[""]
+            code_gen_dict["$RAM_STYLE$"] = [""]
         else:
-            code_gen_dict["$RAM_STYLE$"]=["(* ram_style = \"{}\" *)".format(ram_style)]
+            code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)]
 
-        with open(os.environ['FINN_ROOT']+"/finn-rtllib/swg/swg_template_wrapper.v", "r") as f:
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        with open(
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_wrapper.v", "r"
+        ) as f:
             template_wrapper = f.read()
-
         for key in code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(code_gen_dict[key])
             template = template.replace(key, code_gen_line)
             template_wrapper = template_wrapper.replace(key, code_gen_line)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+            ),
+            "w",
+        ) as f:
+            f.write(template)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+            ),
+            "w",
+        ) as f:
+            f.write(template_wrapper)
 
-        f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"), "w")
-        f.write(template)
-        f.close()
-        f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w")
-        f.write(template_wrapper)
-        f.close()
-        #f_debug.close()
-
-        #set ipgen_path and ip_path so that HLS-Synth transformation and stich_ip transformation do not complain
+        # set ipgen_path and ip_path so that HLS-Synth transformation and stitch_ip transformation do not complain
         self.set_nodeattr("ipgen_path", code_gen_dir)
         self.set_nodeattr("ip_path", code_gen_dir)
-        ##### END GENERAL CODE GEN #####
 
     def prepare_rtlsim(self):
         """Creates a Verilator emulation library for the RTL code generated
@@ -1029,9 +1171,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             raise ImportError("Installation of PyVerilator is required.")
 
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        verilog_paths = [code_gen_dir]    
-        verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper.v",
-                         self.get_nodeattr("gen_top_module") + "_impl.sv"]
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            self.get_nodeattr("gen_top_module") + "_wrapper.v",
+            self.get_nodeattr("gen_top_module") + "_impl.sv",
+        ]
 
         # build the Verilator emu library
         sim = PyVerilator.build(
@@ -1045,31 +1189,69 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         self.set_nodeattr("rtlsim_so", sim.lib._name)
         return sim
 
-
     def code_generation_ipi(self):
         """Constructs and returns the TCL for node instantiation in Vivado IPI."""
         vlnv = self.get_nodeattr("ip_vlnv")
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
 
-        cmd = ["add_files -norecurse %s" % (os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")),
-            "add_files -norecurse %s" % (os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv")),
-            "create_bd_cell -type module -reference %s %s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)]
+        cmd = [
+            "add_files -norecurse %s"
+            % (
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                )
+            ),
+            "add_files -norecurse %s"
+            % (
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+                )
+            ),
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name),
+        ]
 
         return cmd
 
     def code_generation_ipgen(self, model, fpgapart, clk):
-        """Normally: Generates c++ code and tcl script for ip generation.
-           Here: Generates (System-)Verilog code for ip generation."""
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: Generates (System-)Verilog code for IP generation."""
         self.generate_hdl()
 
     def ipgen_singlenode_code(self):
-        """Normally: Builds the bash script for ip generation using the CallHLS from
-        finn.util.hls."""
+        """Normally: Builds the bash script for IP generation."""
         pass
 
     def code_generation_cppsim(self, model):
-        """Normally: Generates c++ code for simulation (cppsim)."""
+        """Normally: Generates C++ code for simulation (cppsim)."""
         pass
 
     def compile_singlenode_code(self):
         pass
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
index c94aa1eab..d3ea9d117 100755
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
@@ -30,22 +30,21 @@ import pytest
 
 import numpy as np
 from onnx import TensorProto, helper
-
-import finn.core.onnx_exec as oxe
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
-def make_single_im2col_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt
-):
+
+def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt):
     k_h, k_w = k
     ifm_dim_h, ifm_dim_w = ifm_dim
     stride_h, stride_w = stride
@@ -134,10 +133,10 @@ def make_single_slidingwindow_modelwrapper(
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", odt)
 
-    #DEBUG
-    swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
-    swg_inst = getCustomOp(swg_node)
-    swg_inst.set_nodeattr("rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd")
+    # DEBUG
+    # swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
+    # swg_inst = getCustomOp(swg_node)
+    # swg_inst.set_nodeattr("rtlsim_trace", "/home/felixj/WD/finn/finn-rtllib/swg/swg_test_trace.vcd")
 
     return model
 
@@ -159,39 +158,46 @@ def prepare_inputs(input_tensor):
 #     ],
 # )
 # kernel size
-@pytest.mark.parametrize("k", [[1,1],[2,2],[3,3],[4,5],[1,3]])
+@pytest.mark.parametrize("k", [[1, 1], [2, 2], [3, 3], [1, 2], [1, 3]])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [[8,8],[13,13],[1,12]])
+@pytest.mark.parametrize(
+    "ifm_dim", [[8, 8], [13, 13], [1, 11], [1, 12], [1, 13], [1, 14]]
+)
 # input channels
 @pytest.mark.parametrize("ifm_ch", [6])
 # Stride
-@pytest.mark.parametrize("stride", [[1,1],[2,2],[3,4]])
+@pytest.mark.parametrize("stride", [[1, 1], [2, 2], [1, 2]])
 # Dilation
-@pytest.mark.parametrize("dilation", [[1,1],[2,2],[4,3]])
+@pytest.mark.parametrize("dilation", [[1, 1], [2, 2], [1, 3]])
 # depthwise
-@pytest.mark.parametrize("dw", [0,1])
+@pytest.mark.parametrize("dw", [0, 1])
 
 # input channel parallelism ("SIMD")
-@pytest.mark.parametrize("simd", [1,2,3,6])
+@pytest.mark.parametrize("simd", [1, 2, 3, 6])
 # parallel_window enable (MMV_out = M*K)
-@pytest.mark.parametrize("parallel_window", [0,1])
+@pytest.mark.parametrize("parallel_window", [0, 1])
 # in/out MMV ("M")
 @pytest.mark.parametrize("m", [1])
 
 # Flip dimensions
-@pytest.mark.parametrize("flip", [False,True])
+@pytest.mark.parametrize("flip", [False])
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_slidingwindow_rtl(
     idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip
 ):
-    #ifm_dim = conv_config[0]
-    #k = conv_config[1]
-    #stride = conv_config[2]
-    #dilation= conv_config[3]
+    # ifm_dim = conv_config[0]
+    # k = conv_config[1]
+    # stride = conv_config[2]
+    # dilation= conv_config[3]
 
     if flip:
-        if (ifm_dim[0]==ifm_dim[1] and k[0]==k[1] and stride[0]==stride[1] and dilation[0] == dilation[1]):
+        if (
+            ifm_dim[0] == ifm_dim[1]
+            and k[0] == k[1]
+            and stride[0] == stride[1]
+            and dilation[0] == dilation[1]
+        ):
             pytest.skip("Dimension flip would have no effect")
         k = k[::-1]
         ifm_dim = ifm_dim[::-1]
@@ -203,21 +209,31 @@ def test_fpgadataflow_slidingwindow_rtl(
     stride_h, stride_w = stride
     dilation_h, dilation_w = dilation
 
-    kernel_width = (k_w-1)*dilation_w+1 # incl. dilation
-    kernel_height = (k_h-1)*dilation_h+1 # incl. dilation
+    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
 
     if simd > ifm_ch:
         pytest.skip("SIMD cannot be larger than number of input channels")
     if ifm_ch % simd != 0:
         pytest.skip("SIMD must divide number of input channels")
     if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
-        pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension")
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
     if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
-        pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension")
-    if (k_h==1 and (stride_h!=1 or dilation_h!=1)) or (k_w==1 and (stride_w!=1 or dilation_w!=1)):
-        pytest.skip("Illegal convolution configuration: stride or dilation defined for unitary kernel dim")
-    if k_h==1 and k_w==1 and simd != ifm_ch:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
+        k_w == 1 and (stride_w != 1 or dilation_w != 1)
+    ):
+        pytest.skip(
+            "Illegal convolution configuration: stride or dilation defined for unitary kernel dim"
+        )
+    if k_h == 1 and k_w == 1 and simd != ifm_ch:
         pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)")
+    if parallel_window and simd != ifm_ch:
+        pytest.skip("Parallel window requires SIMD=C")
 
     ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
     ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
@@ -258,7 +274,7 @@ def test_fpgadataflow_slidingwindow_rtl(
     )
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
 
-    #DEBUG
+    # DEBUG
     print("-------expected:")
     print(y_expected)
     print("--------produced:")
@@ -267,7 +283,7 @@ def test_fpgadataflow_slidingwindow_rtl(
     node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
     inst = getCustomOp(node)
     cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
-    print("RTLSIM cycles: %d"%cycles_rtlsim)
+    print("RTLSIM cycles: %d" % cycles_rtlsim)
 
     if dw == 0:
         assert (y_produced == y_expected).all()
@@ -279,6 +295,7 @@ def test_fpgadataflow_slidingwindow_rtl(
         y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
         assert (y_produced == y_expected).all()
 
+
 #     exp_cycles_dict = model.analysis(exp_cycles_per_layer)
 #     exp_cycles = exp_cycles_dict[node.name]
 #     assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
-- 
GitLab