diff --git a/finn-rtllib/swg/swg_hdl_template.v b/finn-rtllib/swg/swg_hdl_template.v
new file mode 100755
index 0000000000000000000000000000000000000000..b0e00ea4d23395a446cc3522e3d2d61b158dc5e3
--- /dev/null
+++ b/finn-rtllib/swg/swg_hdl_template.v
@@ -0,0 +1,184 @@
+// ==============================================================
+// RTL generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and OpenCL
+// Version: 2020.1
+// Copyright (C) 1986-2020 Xilinx, Inc. All Rights Reserved.
+// 
+// ===========================================================
+
+`timescale 1 ns / 1 ps 
+module window_buffer //shift-enabled window buffer; its storage and output taps are generated code
+#(
+    parameter IN_WIDTH = 1, //input word width in bits: c * bit-width
+    parameter OUT_WIDTH = 1, //output word width in bits: c * bit-width * MMV_out
+    parameter BUFFER_ELEM_TOTAL = 1 //not referenced in the fixed text below; presumably used by generated code
+)
+(
+    CLK,
+    data_in,
+    shift_enable,
+    data_out
+);
+
+input CLK;
+input [IN_WIDTH-1:0] data_in;
+input shift_enable; //the buffer only changes on clock edges where this is high
+output [OUT_WIDTH-1:0] data_out;
+
+//Input REG to enable simultaneous R/W (loaded from data_in in the main process)
+reg [IN_WIDTH-1:0] reg_input;
+
+//REG FIFOs (declarations are inserted here by the generator)
+$GENERATE_REG_FIFOS$
+
+//BRAM FIFOs (declarations are inserted here by the generator)
+//todo: generate real BRAM shift buffers if these get too large
+$GENERATE_BRAM_FIFOS$
+
+//Fixed REG FIFO <-> output mapping (inserted here by the generator)
+$GENERATE_OUTPUT_MAPPING$
+
+//main process: when shift_enable is high, run the generated shift logic and latch data_in
+integer i; //not used in the fixed text; presumably a loop index for the generated shift logic
+always @ (posedge CLK) begin
+    if (shift_enable) begin
+        //shift logic
+        $GENERATE_SHIFT_LOGIC$
+
+        //shift in new data
+        reg_input <= data_in;
+    end
+end
+
+endmodule //window_buffer
+
+module $TOP_MODULE_NAME$ (
+        ap_clk,
+        ap_rst_n,
+        in0_V_V_TDATA,
+        in0_V_V_TVALID,
+        in0_V_V_TREADY,
+        out_V_V_TDATA,
+        out_V_V_TVALID,
+        out_V_V_TREADY
+);
+
+//parameters (dollar-delimited tokens are template placeholders)
+parameter BIT_WIDTH = $BIT_WIDTH$;
+parameter SIMD = $SIMD$; //assuming SIMD=C for now
+parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1 for now
+parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K for now
+parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //c*bit-width
+parameter BUF_OUT_WIDTH = BUF_IN_WIDTH * MMV_OUT; //c*bit-width*MMV_out
+
+parameter CYCLES_TOTAL = $CYCLES_TOTAL$;
+parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$;
+
+//IO ports: one clock, one active-low reset, one input and one output stream
+input   ap_clk;
+input   ap_rst_n;
+input  [BUF_IN_WIDTH-1:0] in0_V_V_TDATA;
+input   in0_V_V_TVALID;
+output   in0_V_V_TREADY;
+output  [BUF_OUT_WIDTH-1:0] out_V_V_TDATA;
+output   out_V_V_TVALID;
+input   out_V_V_TREADY;
+
+//main buffer instantiation
+wire [BUF_IN_WIDTH-1:0] window_buffer_in;
+wire [BUF_OUT_WIDTH-1:0] window_buffer_out;
+wire window_buffer_shift_enable;
+window_buffer
+#(
+    .IN_WIDTH(BUF_IN_WIDTH),
+    .OUT_WIDTH(BUF_OUT_WIDTH),
+    .BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL)
+)
+window_buffer_inst
+(
+    .CLK(ap_clk),
+    .data_in(window_buffer_in),
+    .shift_enable(window_buffer_shift_enable),
+    .data_out(window_buffer_out)
+);
+
+//FSM state (S2 is declared but not used by either process below)
+reg [1:0] state;
+parameter STATE_RESET = 0, STATE_OPERATE = 1, S2 = 2;
+
+//main cycle counter (where either read/write/both happen, resets for each image)
+integer cycle;
+
+//read/write loop state (driven by the generated read/write schedules below)
+wire read_state;
+wire write_state;
+
+//output registers
+reg   out_V_V_TVALID_reg;
+
+//assign buffer control: shift on every accepted input word and on every write cycle
+//todo: if mmv_out < k: might not shift and/or write for multiple read_state cycles
+assign window_buffer_shift_enable = (read_state && in0_V_V_TVALID) || write_state;
+
+//assign I/O ports
+assign window_buffer_in = in0_V_V_TDATA;
+assign in0_V_V_TREADY = read_state; //accept data whenever read loop wants to read
+assign out_V_V_TDATA = window_buffer_out; //driven directly by the buffer, not registered here
+assign out_V_V_TVALID = out_V_V_TVALID_reg;
+
+//read schedule
+//todo: generate differently
+$GENERATE_READ_SCHEDULE$
+
+//write schedule
+//todo: generate differently
+$GENERATE_WRITE_SCHEDULE$
+
+//read process (writing to buffer); also owns the FSM state and the cycle counter
+always @ (posedge ap_clk) begin
+    if (ap_rst_n == 1'b0) begin
+        state <= STATE_RESET;
+    end else begin
+        case (state)
+            STATE_RESET: begin
+                state <= STATE_OPERATE;
+                cycle <= 0;
+            end
+            STATE_OPERATE: begin
+                if (read_state && in0_V_V_TVALID) begin
+                    //read into buffer (done in concurrent assignment)
+                    //count cycle (R)
+                    cycle <= cycle+1;
+                    if (cycle == CYCLES_TOTAL-1)
+                        state <= STATE_RESET;
+                end else if (write_state && out_V_V_TREADY) begin
+                    cycle <= cycle+1; //count cycle (or W)
+                    if (cycle == CYCLES_TOTAL-1)
+                        state <= STATE_RESET;
+                end
+            end
+        endcase
+    end
+end
+
+//write process (reading from buffer); drives the registered output VALID flag
+always @ (posedge ap_clk) begin
+    if (ap_rst_n == 1'b0) begin
+        //keep VALID low while in reset so no stale/undefined output is signalled
+        out_V_V_TVALID_reg <= 1'b0;
+    end else begin
+        case (state)
+            STATE_RESET: begin
+            end
+            STATE_OPERATE: begin
+                if (write_state && out_V_V_TREADY) begin
+                    //write from buffer; todo: VALID seems to be deasserted 1 cycle too late?!
+                    out_V_V_TVALID_reg <= 1'b1;
+                end else begin
+                    out_V_V_TVALID_reg <= 1'b0;
+                end
+            end
+        endcase
+    end
+end
+
+endmodule //top-level sliding window generator module
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 417a505898fb1aba751e4b44db336b8cf313cb6a..50746d4834cb1e7b29979f1876da007425352e76 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -34,6 +34,9 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
 from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import (
     ConvolutionInputGenerator1D,
 )
+from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import (
+    ConvolutionInputGenerator_rtl,
+)
 from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
 from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
@@ -67,6 +70,7 @@ custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
 custom_op["StreamingFCLayer_Batch"] = StreamingFCLayer_Batch
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
 custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
+custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
 custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
new file mode 100755
index 0000000000000000000000000000000000000000..9908bbb30d2dd6669ecdc44d3568dcadc3ac17ad
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -0,0 +1,1016 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.general.im2col import compute_conv_output_dim
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.custom_op.general import im2col
+
+from finn.util.basic import (
+    get_rtlsim_trace_depth,
+    make_build_dir,
+)
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+# This operation should only be used for 1D convolutions. Either the
+# IFMDim_H or IFMDim_W should be '1', which represents the so-called
+# dummy-dimension
+
+# ONNX i/o tensor shape assumptions for ConvolutionInputGenerator_rtl:
+# input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels)
+# output 0 is the output tensor, shape NHWC:
+#     = (1, OFMDim_H, OFMDim_W, (ConvKernelDim_H*ConvKernelDim_W)*IFMChannels)
+
+# note: the actual data layout produced by the hlslib kernels is different
+# for depthwise and non-depthwise ops.
+# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD)
+# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD)
+# see test_fpgadataflow_slidingwindow.py for an example of how to transform
+# between the two layouts
+
+
+class ConvolutionInputGenerator_rtl(HLSCustomOp):
+    """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants. Depending on the combination of
+    attributes (e.g. depthwise or not, whether dilation is 0) a different
+    variant will be picked for the actual HLS implementation."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)  # no op-specific state; the base class does the setup
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ConvKernelDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "IFMChannels": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "SIMD": ("i", True, 0),
+            "Stride": ("ints", True, []),  # [H, W] = [Y, X]
+            "Dilation": ("ints", True, []),  # [H, W] = [Y, X]
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0, {0, 1}),
+            # FPGA resource type for ConvolutionInputGenerator input buffer
+            # auto -- let Vivado HLS decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM
+            "ram_style": (
+                "s",
+                False,
+                "distributed",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        wf = int(ifm_ch / simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
+        return folded_ishape
+
+    def get_normal_output_shape(self):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
+        return oshape
+
+    def get_folded_output_shape(self):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        simd = self.get_nodeattr("SIMD")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        if self.use_parallel_window_output():
+            wf = int((ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        return folded_oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Return the FINN DataType named by the "inputDataType" attribute."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Return the FINN DataType named by the "outputDataType" attribute."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        in_width = simd * ibits
+        return in_width
+
+    def get_outstream_width(self):
+        if self.use_parallel_window_output():
+            # feed all window pixels in parallel
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            return self.get_instream_width() * k_h * k_w
+        else:
+            # if parallel variant not in use: same width for output and input stream
+            return self.get_instream_width()
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        num_output_elems = np.prod(folded_oshape[:-1])
+        return num_output_elems
+
+    def get_1d_conv_attrs_normalized(self):
+        # support both (1, D) and (D, 1) cases transparently:
+        # For the kernel, presenting the input data of size D as
+        # [H, W] = [Y, X] = [1, D] or [D, 1]
+        # effectively gives the same result. Because the
+        # ConvolutionInputGenerator_NonSquare_Dilated(_dws) kernel currently only
+        # supports dilation>1 along the X-axis and the
+        # ConvolutionInputGenerator_NonSquare only works for stride>1 along the
+        # X-axis, we are working with the following assumption:
+        # the dummy ('1') dimension is the Y-dimension, i.e.
+        # images and kernels (and their attributes) of dimension
+        # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D]
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+
+        # see defines() for an explanation
+        if ifm_dim[1] == 1:
+            ifm_dim = ifm_dim[::-1]
+            ofm_dim = ofm_dim[::-1]
+            k = k[::-1]
+            stride = stride[::-1]
+            dilation = dilation[::-1]
+
+        return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
+
+    def use_parallel_window_output(self):
+        # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
+        # feed window in parallel to the following layer, enabling full SIMD unfolding.
+        dilation = self.get_nodeattr("Dilation")
+        dilation_h, dilation_w = dilation
+
+        #todo: make this configurable via mmv_out instead of an automatic selection
+
+        if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
+            if self.get_nodeattr("depthwise") == 0:
+                    return True
+
+        return False
+
+    def get_exp_cycles(self):
+        simd = self.get_nodeattr("SIMD")
+        (
+            ifm_ch,
+            ifm_dim,
+            ofm_dim,
+            k,
+            stride,
+            dilation,
+        ) = self.get_1d_conv_attrs_normalized()
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h, ofm_dim_w = ofm_dim
+        k_h, k_w = k
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
+        if self.use_parallel_window_output():
+            exp_cycles = ifm_dim_w + 1
+        else:
+            cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+            cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+            max_cycles = max(cycles_write_block, cycles_read_block)
+            exp_cycles = (
+                ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+            )
+
+        return int(exp_cycles)
+
+    def bram_estimation(self):
+        # NOTE: not tested for correctness
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
+        k = np.prod(self.get_nodeattr("ConvKernelDim"))
+        stride = np.prod(self.get_nodeattr("Stride"))
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "block" or ram_style == "auto":
+            ram_depth = ifm_dim * ifm_ch / simd
+            if ram_depth <= 512:
+                ram_width = 36
+            elif ram_depth <= 1024:
+                ram_width = 18
+            elif ram_depth <= 2048:
+                ram_width = 9
+            elif ram_depth <= 4096:
+                ram_width = 4
+            elif ram_depth <= 8192:
+                ram_width = 2
+            else:
+                ram_width = 1
+            return int(
+                (k + stride)
+                * (
+                    math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width)
+                    * math.ceil(ifm_dim * ifm_ch / simd / ram_depth)
+                )
+            )
+        else:
+            return 0
+
+    def lut_estimation(self):
+        # NOTE: not tested for correctness
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
+        k = np.prod(self.get_nodeattr("ConvKernelDim"))
+        stride = np.prod(self.get_nodeattr("Stride"))
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "distributed":
+            ram_luts = int(
+                (k + stride)
+                * (
+                    simd
+                    * self.get_input_datatype().bitwidth()
+                    * math.ceil(ifm_dim * ifm_ch / simd / 64)
+                )
+            )
+        else:
+            ram_luts = 0
+        return 300 + ram_luts
+
+    def uram_estimation(self):
+        # NOTE: not tested for correctness
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
+        k = np.prod(self.get_nodeattr("ConvKernelDim"))
+        stride = np.prod(self.get_nodeattr("Stride"))
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "ultra":
+            return int(
+                (k + stride)
+                * (
+                    math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
+                    * math.ceil(ifm_dim * ifm_ch / simd / 4096)
+                )
+            )
+        else:
+            return 0
+
+    def execute_node(self, context, graph):
+        """Execute this node in "cppsim" or "rtlsim" mode: the input is taken from
+        context[node.input[0]] and the result stored in context[node.output[0]]."""
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        # TODO ensure codegen dir exists (the input .npy file is saved into it below)
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
+            # store bipolar activations as binary: maps -1 -> 0 and +1 -> 1
+            inp = (inp + 1) / 2
+            export_idt = DataType["BINARY"]
+        else:
+            export_idt = self.get_input_datatype()
+        inp = inp.reshape(folded_ishape)  # reshape input into folded form
+        reshaped_input = inp.copy()  # make copy before saving array
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim \
+            did not produce expected ofolded utput shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt  # output is unpacked with the same datatype as the input
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:  # not reachable: mode was already validated when code_gen_dir was chosen
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        # binary -> bipolar if needed: maps 0 -> -1 and 1 -> +1
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
+            out = context[node.output[0]]
+            out = 2 * out - 1
+            context[node.output[0]] = out
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output
+        shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
+
+    def global_includes(self):  # sets the $GLOBALS$ codegen slot
+        self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
+
+    def defines(self, var):
+        numReps = 1
+        (
+            ifm_ch,
+            ifm_dim,
+            ofm_dim,
+            k,
+            stride,
+            dilation,
+        ) = self.get_1d_conv_attrs_normalized()
+        simd = self.get_nodeattr("SIMD")
+        ifm_precision = self.get_input_datatype().bitwidth()
+        ifm_dim_y, ifm_dim_x = ifm_dim
+        ofm_dim_y, ofm_dim_x = ofm_dim
+        k_y, k_x = k
+        dilation_y, dilation_x = dilation
+        # For a 1d convolution with stride=[S,1] or [1,S], the finn-hlslib function
+        # of ConvInpGen must be created with [stride_y, stride_x] = [S, S].
+        # TODO: changes in finn-hlslib (slidingwindow.h)
+        stride_y = np.prod(stride)
+        stride_x = np.prod(stride)
+
+        if dilation_x > 1:
+            assert (
+                dilation_y == 1
+            ), "Dilation value greater than 1 along y-axis is not yet supported"
+            self.code_gen_dict["$DEFINES$"] = [
+                """
+            #define ConvKernelDim1_x {}\n
+            #define ConvKernelDim1_y {}\n
+            #define IFMChannels1 {}\n
+            #define Input_precision1 {}\n
+            #define IFMDim1_x {}\n
+            #define IFMDim1_y {}\n
+            #define OFMDim1_x {}\n
+            #define OFMDim1_y {}\n
+            #define SIMD1 {}\n
+            #define Stride1_x {}\n
+            #define Stride1_y {}\n
+            #define Dilation1_x {}\n
+            #define Dilation1_y {}\n
+            #define numReps {}
+            """.format(
+                    k_x,
+                    k_y,
+                    ifm_ch,
+                    ifm_precision,
+                    ifm_dim_x,
+                    ifm_dim_y,
+                    ofm_dim_x,
+                    ofm_dim_y,
+                    simd,
+                    stride_x,
+                    stride_y,
+                    dilation_x,
+                    dilation_y,
+                    numReps,
+                )
+            ]
+        else:
+            ofm_dim = self.get_nodeattr("OFMDim")
+            self.code_gen_dict["$DEFINES$"] = [
+                """
+            #define ConvKernelDim1_x {}\n
+            #define ConvKernelDim1_y {}\n
+            #define IFMChannels1 {}\n
+            #define Input_precision1 {}\n
+            #define IFMDim1_x {}\n
+            #define IFMDim1_y {}\n
+            #define OFMDim1_x {}\n
+            #define OFMDim1_y {}\n
+            #define SIMD1 {}\n
+            #define Stride1_x {}\n
+            #define Stride1_y {}\n
+            #define numReps {}
+            """.format(
+                    k_x,
+                    k_y,
+                    ifm_ch,
+                    ifm_precision,
+                    ifm_dim_x,
+                    ifm_dim_y,
+                    ofm_dim_x,
+                    ofm_dim_y,
+                    simd,
+                    stride_x,
+                    stride_y,
+                    numReps,
+                )
+            ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):  # sets the $DOCOMPUTE$ codegen slot to one HLS kernel call
+        ram_style = self.get_nodeattr("ram_style")
+        map_to_hls_ram_style = {
+            "auto": "ap_resource_dflt()",
+            "block": "ap_resource_bram()",
+            "distributed": "ap_resource_lutram()",
+            "ultra": "ap_resource_uram()",
+        }
+        hls_ram_style = map_to_hls_ram_style[ram_style]
+
+        # pick the kernel name and template argument list for this configuration
+        if self.use_parallel_window_output():
+            hls_call = "ConvolutionInputGenerator_1D_parallel"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
+                (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        else:
+            hls_call = "ConvolutionInputGenerator_NonSquare"
+            dilation_h, dilation_w = self.get_nodeattr("Dilation")
+            if dilation_h > 1 or dilation_w > 1:
+                hls_call += "_Dilated"  # dilated kernel takes extra Dilation1_* args
+                if self.get_nodeattr("depthwise") == 1:
+                    hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
+                    (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            elif self.get_nodeattr("depthwise") == 1:
+                hls_call += "_dws"  # same template arguments as the final branch
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            else:  # non-dilated, non-depthwise: plain NonSquare kernel
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+
+    def dataoutstrm(self):
+        """Fill the ``$DATAOUTSTREAM$`` slot of the code generation dict.
+
+        The slot receives a single ``apintstream2npy`` call that reads the
+        ``out`` stream and targets ``output.npy`` inside the directory given
+        by the ``code_gen_dir_cppsim`` node attribute.
+        """
+        sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        out_dtype = self.get_output_datatype()
+        # use binary for bipolar storage
+        out_dtype = (
+            DataType["BINARY"] if out_dtype == DataType["BIPOLAR"] else out_dtype
+        )
+
+        call_args = {
+            "elem_bits": out_dtype.bitwidth(),
+            "packed_type": "ap_uint<%d>" % self.get_outstream_width(),
+            "elem_type": out_dtype.get_hls_datatype_str(),
+            "npy_type": "float",
+            "npy_path": "%s/output.npy" % sim_dir,
+            # python tuple notation -> C++ brace initializer
+            "shape": str(self.get_folded_output_shape()).translate(
+                str.maketrans("()", "{}")
+            ),
+        }
+
+        # pass the number of pixels in the folded output to apintstream2npy, needed
+        # to unpack the output correctly and reverse only the inner SIMD dimension
+        call_args["pixels"] = 1
+        if self.use_parallel_window_output():
+            kernel_h, kernel_w = self.get_nodeattr("ConvKernelDim")
+            call_args["pixels"] = kernel_h * kernel_w
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            "apintstream2npy<%(packed_type)s, %(elem_type)s, %(elem_bits)d, "
+            '%(npy_type)s>(out, %(shape)s, "%(npy_path)s", true, 1, %(pixels)d);'
+            % call_args
+        ]
+
+    def save_as_npy(self):
+        """Set the ``$SAVEASCNPY$`` code generation slot to an empty list."""
+        self.code_gen_dict["$SAVEASCNPY$"] = list()
+
+    def blackboxfunction(self):
+        """Fill ``$BLACKBOXFUNCTION$`` with the C++ signature of the top function.
+
+        The function is named after the ONNX node. With parallel window output
+        the ``out`` stream width carries an additional ``ConvKernelDim1_x``
+        factor; otherwise input and output streams have the same width.
+        """
+        if self.use_parallel_window_output():
+            signature = """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                    hls::stream<ap_uint<ConvKernelDim1_x*SIMD1*Input_precision1>>
+                    &out)"""
+        else:
+            signature = """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                    hls::stream<ap_uint<SIMD1*Input_precision1>> &out)"""
+
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            signature.format(self.onnx_node.name)
+        ]
+
+    def pragmas(self):
+        """Fill ``$PRAGMAS$`` with the HLS interface pragmas: AXI stream
+        interfaces for ``in0`` and ``out`` and ``ap_ctrl_none`` for the
+        return port."""
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0",
+            "#pragma HLS INTERFACE axis port=out",
+            "#pragma HLS INTERFACE ap_ctrl_none port=return",
+        ]
+        
+    def generate_hdl(self):
+        """Generate the Verilog source for this sliding window generator.
+
+        Derives the buffer schedule and the template substitutions from the
+        node attributes, fills them into ``swg_hdl_template.v`` and writes the
+        result to ``swg_hdl_generated.v``. Intermediate results are logged to
+        ``swg_hdl_debuginfo.log``. All three files live in the hard-coded
+        ``/workspace/finn/finn-rtllib/swg/`` directory.
+
+        Raises:
+            AssertionError: if a folding constraint is violated or the
+                computed schedule / access pattern is inconsistent.
+        """
+        # todo: generate into some code gen dict
+        rtllib_dir = "/workspace/finn/finn-rtllib/swg/"
+
+        # The context manager closes (and flushes) the debug log even when one
+        # of the assertions inside the helper fails.
+        with open(os.path.join(rtllib_dir, "swg_hdl_debuginfo.log"), "w") as f_debug:
+            code_gen_dict = self._compute_hdl_code_gen_dict(f_debug)
+
+        with open(os.path.join(rtllib_dir, "swg_hdl_template.v"), "r") as f:
+            template = f.read()
+
+        for key, lines in code_gen_dict.items():
+            # transform list into long string separated by '\n'
+            template = template.replace(key, "\n".join(lines))
+
+        with open(os.path.join(rtllib_dir, "swg_hdl_generated.v"), "w") as f:
+            f.write(template)
+
+    def _compute_hdl_code_gen_dict(self, f_debug):
+        """Compute the placeholder substitutions for the HDL template.
+
+        Args:
+            f_debug: writable text stream that receives debug information.
+
+        Returns:
+            dict mapping each template placeholder (e.g. ``$SIMD$``) to the
+            list of lines that replace it.
+
+        Raises:
+            AssertionError: if a folding constraint is violated or the
+                computed schedule / access pattern is inconsistent.
+        """
+        code_gen_dict = {}
+
+        #--------------------
+        # init hyperparameters
+        # for 1D case: it does not matter if dummy dim is x or y
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+
+        n = 1
+        h, w = ifm_dim
+        c = 1  # ifm_ch not considered atm (always parallelize across c)
+        k_h, k_w = k
+        pad = [0,0,0,0]
+        pad_val = 0
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        conv_c = 99  # placeholder, only shows up in the debug log
+
+        # init folding config
+        simd = self.get_nodeattr("SIMD")
+        mmv_in = 1
+        mmv_out = k_h*k_w
+
+        assert simd==ifm_ch, "Constraint violated: SIMD = C"
+        assert mmv_in==1, "Constraint violated: MMV_IN = 1"
+        assert mmv_out==k_h*k_w, "Constraint violated: mmv_out = K"
+
+        # how many "unused" registers are allowed between buffer positions that will be accessed in parallel
+        # example:
+        # 0: only consecutive access patterns will be implemented in regs, rest in BRAM line buffers
+        # 2: [0, 3, 6] access pattern is still allowed and will be implemented with 1 7-position shift reg
+        REG_BRAM_THRESHOLD = 1
+        #--------------------
+
+        in_shape = (n,c,h,w) #NCHW
+
+        in_image = np.empty(in_shape, dtype=int)
+
+        for index, _ in np.ndenumerate(in_image):
+            # "HWC" dummy values
+            val = int((index[2]+1)*100+(index[3]+1)*10+(index[1]+1)*1)
+            in_image[index] = val
+
+        in_image_padded = np.pad(
+            in_image,
+            ((0, 0), (0, 0), (pad[0], pad[2]), (pad[1], pad[3])),
+            mode="constant",
+            constant_values=pad_val,
+        )
+        in_shape_padded = in_image_padded.shape
+        h_padded = in_shape_padded[2]
+        w_padded = in_shape_padded[3]
+
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h)
+        out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w)
+
+        f_debug.write("\n"+"in shape         " + str(in_shape))
+        f_debug.write("\n"+"in shape padded  " + str(in_shape_padded))
+        f_debug.write("\n"+"conv out shape   " + str((n,conv_c,out_dim_h,out_dim_w)))
+        f_debug.write("\n"+"im2col out shape " + str((n,out_dim_h,out_dim_w,k_h*k_w*c)))
+
+        idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw(
+            in_shape,
+            k_h,
+            k_w,
+            pad,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w
+        )
+
+        f_debug.write("\n"+"c indices")
+        f_debug.write("\n"+str(idx_c))
+        f_debug.write("\n"+"h indices")
+        f_debug.write("\n"+str(idx_h))
+        f_debug.write("\n"+"w indices")
+        f_debug.write("\n"+str(idx_w))
+
+        cols = in_image_padded[:, idx_c, idx_h, idx_w]
+        cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1)
+
+        f_debug.write("\n"+"cols (shape %s)" % str(cols.shape))
+        f_debug.write("\n"+str(cols))
+
+        # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW
+        out_image = cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w)
+        # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1)
+        out_image = out_image.transpose(0, 4, 5, 2, 3, 1)
+        out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c)
+
+        f_debug.write("\n"+"output (shape %s)" % str(out_image.shape))
+        f_debug.write("\n"+str(out_image))
+
+        f_debug.write("\n"+"h indices")
+        f_debug.write("\n"+str(idx_h))
+        f_debug.write("\n"+"w indices")
+        f_debug.write("\n"+str(idx_w))
+
+        # flatten (row, column) into a sequential pixel index
+        idx_px = idx_h*w+idx_w
+        f_debug.write("\n"+"sequential pixel indices")
+        f_debug.write("\n"+str(idx_px))
+
+        buffer = []
+        buffer_max_size = 0
+        # buffer schedule (write from input, read to output)
+        schedule_write = []
+        schedule_read = []
+        next_in_px = 0
+
+        idx_px_relative = idx_px.copy()
+
+        # compute schedule and buffer read pattern
+        Y, X = idx_px_relative.shape
+        for x in range(X):
+            # load missing inputs into buffer
+            for y in range(Y):
+                while int(idx_px_relative[y,x]) not in buffer:
+                    buffer.append(next_in_px)
+                    next_in_px += 1
+                    schedule_write.append(1)
+                    schedule_read.append(0)
+
+            # discard unused buffer elements (assumes in-order access)
+            oldest_px = min(idx_px_relative[:,x])
+            while buffer[0] < oldest_px:
+                buffer.pop(0)
+
+            # adjust relative buffer index
+            for y in range(Y):
+                idx_px_relative[y,x] -= oldest_px
+
+            # record max needed buffer depth
+            if len(buffer) > buffer_max_size:
+                buffer_max_size = len(buffer)
+
+            # read from buffer
+            schedule_read.append(1)
+
+            # simultaneously load next pixel(s) into buffer if there are any left
+            if next_in_px > (h_padded*w_padded-1):
+                schedule_write.append(0)
+            else:
+                buffer.append(next_in_px)
+                next_in_px += 1
+                schedule_write.append(1)
+
+        # find buffer access patterns
+        buffer_access_patterns = []
+        for x in range(X):
+            pattern = idx_px_relative[:,x].tolist()
+            if pattern not in buffer_access_patterns:
+                buffer_access_patterns.append(pattern)
+
+        f_debug.write("\n"+"max buffer size observed: %d" %(buffer_max_size))
+        f_debug.write("\n"+"output vector elements: relative buffer indices")
+        f_debug.write("\n"+str(idx_px_relative))
+        f_debug.write("\n"+"found %d buffer access patterns:" % len(buffer_access_patterns))
+        f_debug.write("\n"+str(buffer_access_patterns))
+        f_debug.write("\n"+"required parallel-access registers for mmv_out=k: %d" % len(sum(buffer_access_patterns,[])))
+        f_debug.write("\n"+"buffer write schedule (%d cycles)" % len(schedule_write))
+        f_debug.write("\n"+str(schedule_write))
+        f_debug.write("\n"+"writing buffer in %d cycles" % schedule_write.count(1))
+        f_debug.write("\n"+"buffer read schedule (%d cycles)" % len(schedule_read))
+        f_debug.write("\n"+str(schedule_read))
+        f_debug.write("\n"+"reading buffer in %d cycles" % schedule_read.count(1))
+
+        assert len(schedule_write) == len(schedule_read), "ERROR: Schedules have different lengths"
+        cycles_total = len(schedule_write)
+
+        assert schedule_read.count(1) == self.get_number_output_values(), "ERROR: Reading buffer in fewer cycles than expected"
+
+        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())]
+        code_gen_dict["$SIMD$"] = [str(simd)]
+        code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
+        code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]
+        code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)]
+        code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)]
+
+        # determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers)
+        assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static"
+        buf_static_access_pattern = buffer_access_patterns[0]
+        reg_fifos = []
+        bram_fifos = []
+        current = []
+        for access_idx in buf_static_access_pattern:
+            if len(current) == 0:
+                current.append(access_idx)
+                continue
+            # assume non-decreasing index order in access pattern
+            distance = access_idx - max(current)
+            if distance-1 <= REG_BRAM_THRESHOLD:
+                # insert dummies into REG FIFO (not read as part of window)
+                current.extend([-1]*(distance-1))
+                # assign this access to same REG FIFO as previous one
+                current.append(access_idx)
+            else:
+                # assign skipped accesses to new BRAM FIFO
+                bram_fifos.append([-1]*(distance-1))
+                # start with new REG FIFO
+                reg_fifos.append(current)
+                current = [access_idx]
+        reg_fifos.append(current)
+
+        f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD)
+        f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos))
+        f_debug.write("\n"+str(reg_fifos))
+        f_debug.write("\n"+"%d BRAM FIFOs (line buffers):" % len(bram_fifos))
+        f_debug.write("\n"+str(bram_fifos))
+
+        code_gen_dict["$GENERATE_REG_FIFOS$"] = []
+        for i, reg_fifo in enumerate(reg_fifos):
+            code_gen_dict["$GENERATE_REG_FIFOS$"].append(
+                """parameter reg_fifo_{id}_len = {len};
+                reg [IN_WIDTH-1:0] reg_fifo_{id} [reg_fifo_{id}_len-1:0];
+                """.format(id=i, len=len(reg_fifo)))
+
+        #todo: generate actual bram shift buffers instead of regs
+        code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
+        for i, bram_fifo in enumerate(bram_fifos):
+            code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
+                """parameter bram_fifo_{id}_len = {len};
+                reg [IN_WIDTH-1:0] bram_fifo_{id} [bram_fifo_{id}_len-1:0];
+                """.format(id=i, len=len(bram_fifo)))
+
+        code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
+        out_idx = mmv_out-1
+        for fifo_id, reg_fifo in enumerate(reg_fifos):
+            for fifo_idx, access_idx in enumerate(reg_fifo):
+                if(access_idx != -1):
+                    code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
+                        "assign data_out[IN_WIDTH*{out_idx}+:IN_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format(
+                            out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx
+                        )
+                    )
+                    # reversal: out_idx=0 -> oldest buffer element -> highest access_idx
+                    out_idx = out_idx-1
+        assert out_idx==-1, "ERROR: Not all output vector elements connected"
+
+        code_gen_dict["$GENERATE_SHIFT_LOGIC$"] = []
+        for i in range(len(reg_fifos)):
+            if i == 0:
+                # first FIFO containing newest elements -> input comes from input reg
+                code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append(
+                    """for (i=reg_fifo_{fifo_id}_len-1; i>0; i=i-1)
+                        reg_fifo_{fifo_id}[i] <= reg_fifo_{fifo_id}[i-1];
+                    reg_fifo_{fifo_id}[0] <= reg_input;""".format(
+                        fifo_id=i,
+                    )
+                )
+            else:
+                # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer)
+                input_fifo_id = i-1
+                code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append(
+                    """for (i=reg_fifo_{fifo_id}_len-1; i>0; i=i-1)
+                        reg_fifo_{fifo_id}[i] <= reg_fifo_{fifo_id}[i-1];
+                    reg_fifo_{fifo_id}[0] <= bram_fifo_{input_fifo_id} [bram_fifo_{input_fifo_id}_len-1];""".format(
+                        fifo_id=i, input_fifo_id=input_fifo_id
+                    )
+                )
+        for i in range(len(bram_fifos)):
+            # BRAM FIFO i is fed from the last element of REG FIFO i
+            input_fifo_id = i
+            code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append(
+                """for (i=bram_fifo_{fifo_id}_len-1; i>0; i=i-1)
+                    bram_fifo_{fifo_id}[i] <= bram_fifo_{fifo_id}[i-1];
+                bram_fifo_{fifo_id}[0] <= reg_fifo_{input_fifo_id} [reg_fifo_{input_fifo_id}_len-1];""".format(
+                    fifo_id=i, input_fifo_id=input_fifo_id
+                )
+            )
+
+        # Generate read schedule (when data is read from input, written to buffer)
+        #todo: change naming to swap write/read
+        schedule_as_string = ",".join(
+            "1'b1" if flag == 1 else "1'b0" for flag in schedule_write
+        )
+        code_gen_dict["$GENERATE_READ_SCHEDULE$"] = [
+            "localparam [0:{len}-1] READ_SCHEDULE = {{{str}}};".format(len=cycles_total, str=schedule_as_string),
+            "assign read_state = READ_SCHEDULE[cycle];",
+        ]
+
+        # Generate write schedule (when data is written to output, read from buffer)
+        #todo: change naming to swap write/read
+        schedule_as_string = ",".join(
+            "1'b1" if flag == 1 else "1'b0" for flag in schedule_read
+        )
+        code_gen_dict["$GENERATE_WRITE_SCHEDULE$"] = [
+            "localparam [0:{len}-1] WRITE_SCHEDULE = {{{str}}};".format(len=cycles_total, str=schedule_as_string),
+            "assign write_state = WRITE_SCHEDULE[cycle];",
+        ]
+
+        return code_gen_dict
+
+    def prepare_rtlsim(self):
+        """Build the Verilator emulation library for this node.
+
+        Generates the node's Verilog via ``generate_hdl`` (instead of using
+        HLS output products), compiles it with PyVerilator, stores the path
+        of the resulting library in the ``rtlsim_so`` attribute and returns
+        the PyVerilator wrapper around it.
+
+        Raises:
+            ImportError: if PyVerilator is not available.
+        """
+        self.generate_hdl()
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        # build the Verilator emu library from the generated verilog
+        build_options = {
+            "build_dir": make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            "verilog_path": ["/workspace/finn/finn-rtllib/swg/"],
+            "trace_depth": get_rtlsim_trace_depth(),
+            "top_module_name": self.get_verilog_top_module_name(),
+        }
+        emu = PyVerilator.build(["swg_hdl_generated.v"], **build_options)
+
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", emu.lib._name)
+        return emu
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
new file mode 100755
index 0000000000000000000000000000000000000000..f7a724133333156811d5e3f7721c9585dba94eca
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
@@ -0,0 +1,265 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.im2col import compute_conv_output_dim
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import gen_finn_dt_tensor
+
+
+def make_single_im2col_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
+):
+    """Return a ModelWrapper holding a single ``Im2Col`` node.
+
+    The graph maps ``inp`` of shape (1, ifm_h, ifm_w, ifm_ch) to ``outp`` of
+    shape (1, ofm_h, ofm_w, k_h * k_w * ifm_ch) without padding. Both tensors
+    are annotated with datatype ``idt``. ``simd`` is accepted but not used.
+    """
+    kernel_h, kernel_w = k
+    in_h, in_w = ifm_dim
+    s_h, s_w = stride
+    d_h, d_w = dilation
+    out_h, out_w = ofm_dim
+
+    in_shape = (1, in_h, in_w, ifm_ch)
+    out_shape = (1, out_h, out_w, kernel_h * kernel_w * ifm_ch)
+
+    in_info = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, list(in_shape)
+    )
+    out_info = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, list(out_shape)
+    )
+
+    node = helper.make_node(
+        "Im2Col",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.general",
+        stride=[s_h, s_w],
+        kernel_size=[kernel_h, kernel_w],
+        input_shape=str(in_shape),
+        dilations=[d_h, d_w],
+        pad_amount=[0, 0, 0, 0],
+        pad_value=0,
+    )
+    im2col_graph = helper.make_graph(
+        nodes=[node], name="im2col_graph", inputs=[in_info], outputs=[out_info]
+    )
+
+    wrapped = ModelWrapper(
+        helper.make_model(im2col_graph, producer_name="im2col-model")
+    )
+
+    # output datatype equals the input datatype
+    for tensor_name in ("inp", "outp"):
+        wrapped.set_tensor_datatype(tensor_name, idt)
+
+    return wrapped
+
+
+def make_single_slidingwindow_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
+):
+    """Return a ModelWrapper holding a single ``ConvolutionInputGenerator_rtl``
+    node.
+
+    The graph maps ``inp`` of shape (1, ifm_h, ifm_w, ifm_ch) to ``outp`` of
+    shape (1, ofm_h, ofm_w, k_h * k_w * ifm_ch). Both tensors are annotated
+    with datatype ``idt``. The node's ``rtlsim_trace`` attribute is set to a
+    hard-coded VCD path for debugging.
+    """
+    kernel_h, kernel_w = k
+    in_h, in_w = ifm_dim
+    s_h, s_w = stride
+    d_h, d_w = dilation
+    out_h, out_w = ofm_dim
+
+    in_info = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, in_h, in_w, ifm_ch]
+    )
+    out_info = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, out_h, out_w, kernel_h * kernel_w * ifm_ch]
+    )
+
+    # output datatype equals the input datatype
+    swg_node = helper.make_node(
+        "ConvolutionInputGenerator_rtl",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ConvKernelDim=[kernel_h, kernel_w],
+        IFMChannels=ifm_ch,
+        IFMDim=[in_h, in_w],
+        OFMDim=[out_h, out_w],
+        SIMD=simd,
+        Stride=[s_h, s_w],
+        Dilation=[d_h, d_w],
+        inputDataType=idt.name,
+        outputDataType=idt.name,
+        depthwise=dw,
+    )
+    swg_graph = helper.make_graph(
+        nodes=[swg_node],
+        name="slidingwindow_graph",
+        inputs=[in_info],
+        outputs=[out_info],
+    )
+
+    wrapped = ModelWrapper(
+        helper.make_model(swg_graph, producer_name="slidingwindow-model")
+    )
+
+    for tensor_name in ("inp", "outp"):
+        wrapped.set_tensor_datatype(tensor_name, idt)
+
+    #DEBUG: dump an rtlsim trace for the generated node
+    getCustomOp(
+        wrapped.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
+    ).set_nodeattr("rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd")
+
+    return wrapped
+
+
+def prepare_inputs(input_tensor):
+    """Wrap ``input_tensor`` into the input dict expected for execution,
+    keyed by the graph input name ``inp``."""
+    return dict(inp=input_tensor)
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# kernel size
+@pytest.mark.parametrize("k", [[3, 3]])
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[6, 11]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [2])
+# Stride
+@pytest.mark.parametrize("stride", [[1, 2]])
+# Dilation
+@pytest.mark.parametrize("dilation", [[1, 2]])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["rtlsim"])
+# input channel parallelism ("SIMD")
+@pytest.mark.parametrize("simd", [2])
+# depthwise
+@pytest.mark.parametrize("dw", [0])
+# Flip dimensions
+@pytest.mark.parametrize("flip", [False])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_slidingwindow_rtl(
+    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip
+):
+    """Compare the ConvolutionInputGenerator_rtl node against an Im2Col
+    golden model on a generated input tensor."""
+    if flip:
+        # swap the height/width entry of every 2D parameter
+        k, ifm_dim, stride, dilation = (
+            param[::-1] for param in (k, ifm_dim, stride, dilation)
+        )
+
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+
+    #if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1):
+    #    pytest.skip(
+    #        """Dilation value greater than 1 and stride greater than 1
+    #        currently not supported for 1D convolutions"""
+    #    )
+    if simd > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
+
+    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+
+    # arguments shared by the model under test and the golden model
+    shared_args = dict(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=[ofm_dim_h, ofm_dim_w],
+        simd=simd,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+    )
+
+    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
+    model = make_single_slidingwindow_modelwrapper(dw=dw, **shared_args)
+
+    if exec_mode == "cppsim":
+        transformations = [
+            SetExecMode("cppsim"),
+            PrepareCppSim(),
+            CompileCppSim(),
+        ]
+    elif exec_mode == "rtlsim":
+        transformations = [
+            SetExecMode("rtlsim"),
+            GiveUniqueNodeNames(),
+            PrepareIP("xc7z020clg400-1", 5),
+            HLSSynthIP(),
+            PrepareRTLSim(),
+        ]
+    else:
+        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")
+    for transformation in transformations:
+        model = model.transform(transformation)
+
+    # prepare input data
+    input_dict = prepare_inputs(x)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    golden = make_single_im2col_modelwrapper(**shared_args)
+    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
+    #DEBUG
+    print("-------expected:")
+    print(y_expected)
+    print("--------produced:")
+    print(y_produced)
+
+    if dw != 0:
+        # depthwise: swap the kernel-position and channel-fold axes of the
+        # golden output before comparing
+        y_expected = y_expected.reshape(
+            1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
+        )
+        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+        y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
+    assert (y_produced == y_expected).all()
+
+
+    # if exec_mode == "rtlsim":
+    #     node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
+    #     inst = getCustomOp(node)
+    #     cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+    #     exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+    #     exp_cycles = exp_cycles_dict[node.name]
+    #     assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+    #     assert exp_cycles != 0