diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index bfb09f86b8a493a44182450d09029de6486b8fbd..8117a251c4c92a0d19ce8f5fbeba6849a93f8e8f 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,11 +12,11 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=8908c6a3f6674c4fa790954bd41c23ee5bf053df
+FINN_BASE_COMMIT=2c08044c5e9011c19911e731a18ac20d775bbf46
 FINN_EXP_COMMIT=e9f97dcdb4db2f889b0f36af079a6a1792b7d4de
 BREVITAS_COMMIT=14abbe1e7ef82485d79415871fcf5766b0a40a00
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=2e49322d1bbc4969ca293843bda1f3f9c05456fc
+HLSLIB_COMMIT=4d74baefa79df48b5a0348d63f39a26df075de51
 PYVERILATOR_COMMIT=e2ff74030de3992dcac54bf1b6aad2915946e8cb
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
 
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 068950b89ae543f5a37c28d83d87ecfa605eaab4..b20b652254028c4ee5dc2edd1f1302ea3359019b 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -29,6 +29,9 @@
 from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
     ConvolutionInputGenerator,
 )
+from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import (
+    ConvolutionInputGenerator1D,
+)
 from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
@@ -49,6 +52,9 @@ from finn.custom_op.fpgadataflow.vector_vector_activate_batch import (
 )
 from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
 from finn.custom_op.fpgadataflow.iodma import IODMA
+from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
+    StreamingDataflowPartition,
+)
 
 custom_op = dict()
 
@@ -58,6 +64,7 @@ custom_op["DownSampler"] = DownSampler
 custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
 custom_op["StreamingFCLayer_Batch"] = StreamingFCLayer_Batch
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
+custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
 custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
@@ -71,3 +78,4 @@ custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
 custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch
 custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
 custom_op["IODMA"] = IODMA
+custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 3f400053df8de6ec1e53e39fb5a3edee15f3ab30..6e77cd3da7328fd81dccc2ff171a9ae84723d165 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -61,12 +61,14 @@ class ConvolutionInputGenerator(HLSCustomOp):
 
     def get_nodeattr_types(self):
         my_attrs = {
-            "ConvKernelDim": ("i", True, 0),
+            "ConvKernelDim": ("ints", True, []),  # [H, W] = [Y, X]
             "IFMChannels": ("i", True, 0),
-            "IFMDim": ("i", True, 0),
-            "OFMDim": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
             "SIMD": ("i", True, 0),
-            "Stride": ("i", True, 0),
+            "Stride": ("ints", True, [1, 1]),  # [H, W] = [Y, X]
+            # note: only dilation=1 supported for now
+            "Dilation": ("ints", True, [1, 1]),  # [H, W] = [Y, X]
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
@@ -86,44 +88,59 @@ class ConvolutionInputGenerator(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_nodeattr(self, name):
+        # overriding get_nodeattr to enforce the square kernel/image requirement,
+        # since this can't be expressed with the attribute restrictions in
+        # nodeattr_types (e.g. ConvKernelDim=[3, 3] is accepted, while
+        # [3, 2] raises an AssertionError)
+        # TODO non-square can be enabled in theory but needs testing
+        ret = super().get_nodeattr(name)
+        props_to_check = ["ConvKernelDim", "IFMDim", "OFMDim", "Stride", "Dilation"]
+        if name in props_to_check:
+            is_square = ret[0] == ret[1]
+            assert is_square, "Only square %s supported" % name
+        if name == "Dilation":
+            assert ret[0] == ret[1] == 1, "Only dilation=1 supported"
+        return ret
 
-        ifm_dim = self.get_nodeattr("IFMDim")
+    def get_normal_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
-
-        ishape = (1, ifm_dim, ifm_dim, ifm_ch)
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
     def get_folded_input_shape(self):
-        ifm_dim = self.get_nodeattr("IFMDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
         wf = int(ifm_ch / simd)
-        folded_ishape = (1, ifm_dim, ifm_dim, wf, simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
     def get_normal_output_shape(self):
-        k = self.get_nodeattr("ConvKernelDim")
-        ifm_dim = self.get_nodeattr("IFMDim")
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        stride = self.get_nodeattr("Stride")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
         pad = 0
-        ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
-        oshape = (1, ofm_dim, ofm_dim, k * k * ifm_ch)
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
     def get_folded_output_shape(self):
-        k = self.get_nodeattr("ConvKernelDim")
-        ifm_dim = self.get_nodeattr("IFMDim")
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        stride = self.get_nodeattr("Stride")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
         simd = self.get_nodeattr("SIMD")
         pad = 0
-        ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        wf = int((k * k * ifm_ch) // simd)
-        folded_oshape = (1, ofm_dim, ofm_dim, wf, simd)
+        wf = int((k_h * k_w * ifm_ch) // simd)
+        folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
         return folded_oshape
 
     def make_shape_compatible_op(self, model):
@@ -186,26 +203,31 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        k = self.get_nodeattr("ConvKernelDim")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        ofm_dim = self.get_nodeattr("OFMDim")
-        stride = self.get_nodeattr("Stride")
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        cycles_write_block = (ofm_dim * k * k * (ifm_ch / simd)) / mmv
-        cycles_read_block = stride * ifm_dim * (ifm_ch / simd)
+        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
         max_cycles = max(cycles_write_block, cycles_read_block)
-        exp_cycles = ifm_dim * k * (ifm_ch / simd) + ofm_dim * max_cycles
+        exp_cycles = (
+            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+        )
 
         return int(exp_cycles)
 
     def bram_estimation(self):
+        # NOTE: only tested with a square convolution
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
         ram_style = self.get_nodeattr("ram_style")
         if ram_style == "block" or ram_style == "auto":
             ram_depth = ifm_dim * ifm_ch / simd
@@ -232,11 +254,12 @@ class ConvolutionInputGenerator(HLSCustomOp):
             return 0
 
     def lut_estimation(self):
+        # NOTE: only tested with a square convolution
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
         ram_style = self.get_nodeattr("ram_style")
         if ram_style == "distributed":
             ram_luts = int(
@@ -252,11 +275,12 @@ class ConvolutionInputGenerator(HLSCustomOp):
         return 300 + ram_luts
 
     def uram_estimation(self):
+        # NOTE: only tested with a square convolution
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
         ram_style = self.get_nodeattr("ram_style")
         if ram_style == "ultra":
             return int(
@@ -295,7 +319,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         assert (
             inp.shape == exp_ishape
         ), """Input shape doesn't
-        match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
+        match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch)."""
         if self.get_input_datatype() == DataType.BIPOLAR:
             # store bipolar activations as binary
             inp = (inp + 1) / 2
@@ -354,26 +378,27 @@ class ConvolutionInputGenerator(HLSCustomOp):
         assert (
             context[node.output[0]].shape == exp_oshape
         ), """Output
-        shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch)."""
+        shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
 
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
 
     def defines(self, var):
         numReps = 1
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ofm_dim = self.get_nodeattr("OFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
+        simd = self.get_nodeattr("SIMD")
+        ifm_precision = self.get_input_datatype().bitwidth()
+
         self.code_gen_dict["$DEFINES$"] = [
             """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n
             #define Input_precision1 {}\n #define IFMDim1 {}\n
             #define OFMDim1 {}\n #define SIMD1 {}\n
             #define Stride1 {}\n #define numReps {}""".format(
-                self.get_nodeattr("ConvKernelDim"),
-                self.get_nodeattr("IFMChannels"),
-                self.get_input_datatype().bitwidth(),
-                self.get_nodeattr("IFMDim"),
-                self.get_nodeattr("OFMDim"),
-                self.get_nodeattr("SIMD"),
-                self.get_nodeattr("Stride"),
-                numReps,
+                k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps
             )
         ]
 
@@ -415,9 +440,11 @@ class ConvolutionInputGenerator(HLSCustomOp):
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
         hls_call = node.op_type
-        # check if non optimized ConvolutionInputGenerator is needed
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+
+        # check which ConvolutionInputGenerator is needed
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
+
         if k % stride != 0:
             hls_call += "_kernel_stride"
 
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..782655b31b7f4add4c886a46845506af875190bc
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -0,0 +1,616 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+
+import math
+import numpy as np
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.general.im2col import compute_conv_output_dim
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+# This operation should only be used for 1D convolutions: exactly one of
+# IFMDim_H or IFMDim_W must be 1, representing the so-called dummy dimension.
+
+# ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D:
+# input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels)
+# output 0 is the output tensor, shape NHWC:
+#     = (1, OFMDim_H, OFMDim_W, (ConvKernelDim_H*ConvKernelDim_W)*IFMChannels)
+
+# note: the actual data layout produced by the hlslib kernels is different
+# for depthwise and non-depthwise ops.
+# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD)
+# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD)
+# see test_fpgadataflow_slidingwindow.py for an example of how to transform
+# between the two layouts
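+#
+# As a rough numpy sketch (illustrative only; kh, kw, ifm_ch, simd, ofm_h and
+# ofm_w stand for the ConvKernelDim, IFMChannels, SIMD and OFMDim attributes),
+# the non-depthwise layout can be permuted into the depthwise one via:
+#   out = out.reshape(1, ofm_h, ofm_w, kh, kw, ifm_ch // simd, simd)
+#   out = out.transpose(0, 1, 2, 5, 3, 4, 6)
+#   out = out.reshape(1, ofm_h, ofm_w, kh * kw * ifm_ch)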
+
+
+class ConvolutionInputGenerator1D(HLSCustomOp):
+    """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants. Depending on the combination of
+    attributes (e.g. depthwise or not, whether dilation is greater than 1) a different
+    variant will be picked for the actual HLS implementation."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ConvKernelDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "IFMChannels": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "SIMD": ("i", True, 0),
+            "Stride": ("ints", True, []),  # [H, W] = [Y, X]
+            "Dilation": ("ints", True, []),  # [H, W] = [Y, X]
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0, {0, 1}),
+            # FPGA resource type for ConvolutionInputGenerator input buffer
+            # auto -- let Vivado HLS decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM
+            "ram_style": (
+                "s",
+                False,
+                "distributed",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        wf = int(ifm_ch / simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
+        return folded_ishape
+
+    def get_normal_output_shape(self):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
+        return oshape
+
+    def get_folded_output_shape(self):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        simd = self.get_nodeattr("SIMD")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        wf = int((k_h * k_w * ifm_ch) // simd)
+        folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        return folded_oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for ConvInpGen."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
+        """Returns stream width, input and output stream width are equal for
+        the sliding window function"""
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        in_width = simd * ibits
+        return in_width
+
+    def get_outstream_width(self):
+        """Returns stream width, input and output stream width are equal for
+        the sliding window function, so the function to determine the input
+        stream width can be reused."""
+        return self.get_instream_width()
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        num_output_elems = np.prod(folded_oshape[:-1])
+        return num_output_elems
+
+    def get_1d_conv_attrs_normalized(self):
+        # support both (1, D) and (D, 1) cases transparently:
+        # For the kernel, presenting the input data of size D as
+        # [H, W] = [Y, X] = [1, D] or [D, 1]
+        # effectively gives the same result. Because the
+        # ConvolutionInputGenerator_NonSquare_Dilated(_dws) kernel currently only
+        # supports dilation>1 along the X-axis and the
+        # ConvolutionInputGenerator_NonSquare only works for stride>1 along the
+        # X-axis, we are working with the following assumption:
+        # the dummy ('1') dimension is the Y-dimension, i.e.
+        # images and kernels (and their attributes) of dimension
+        # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D]
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+
+        # see defines() for an explanation
+        if ifm_dim[1] == 1:
+            ifm_dim = ifm_dim[::-1]
+            ofm_dim = ofm_dim[::-1]
+            k = k[::-1]
+            stride = stride[::-1]
+            dilation = dilation[::-1]
+
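+        # e.g. (hypothetical values) IFMDim=[128, 1], ConvKernelDim=[4, 1],
+        # Stride=[2, 1], Dilation=[1, 1] are returned flipped as
+        # ifm_dim=[1, 128], k=[1, 4], stride=[1, 2], dilation=[1, 1]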
+        return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
+
+    def get_exp_cycles(self):
+        simd = self.get_nodeattr("SIMD")
+        (
+            ifm_ch,
+            ifm_dim,
+            ofm_dim,
+            k,
+            stride,
+            dilation,
+        ) = self.get_1d_conv_attrs_normalized()
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h, ofm_dim_w = ofm_dim
+        k_h, k_w = k
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
+        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+        max_cycles = max(cycles_write_block, cycles_read_block)
+        exp_cycles = (
+            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+        )
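+        # as a worked example (assumed values): IFMDim=[1, 32], k=[1, 4],
+        # Stride=[1, 1], Dilation=[1, 1], IFMChannels=8, SIMD=2 give
+        # OFMDim=[1, 29], cycles_write_block = 29 * 4 * 1 * 4 = 464,
+        # cycles_read_block = 1 * 32 * 4 = 128 and
+        # exp_cycles = 32 * 1 * 1 * 4 + 1 * 464 = 592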
+
+        return int(exp_cycles)
+
+    def bram_estimation(self):
+        # NOTE: not tested for correctness
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
+        k = np.prod(self.get_nodeattr("ConvKernelDim"))
+        stride = np.prod(self.get_nodeattr("Stride"))
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "block" or ram_style == "auto":
+            ram_depth = ifm_dim * ifm_ch / simd
+            if ram_depth <= 512:
+                ram_width = 36
+            elif ram_depth <= 1024:
+                ram_width = 18
+            elif ram_depth <= 2048:
+                ram_width = 9
+            elif ram_depth <= 4096:
+                ram_width = 4
+            elif ram_depth <= 8192:
+                ram_width = 2
+            else:
+                ram_width = 1
+            return int(
+                (k + stride)
+                * (
+                    math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width)
+                    * math.ceil(ifm_dim * ifm_ch / simd / ram_depth)
+                )
+            )
+        else:
+            return 0
+
+    def lut_estimation(self):
+        # NOTE: not tested for correctness
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
+        k = np.prod(self.get_nodeattr("ConvKernelDim"))
+        stride = np.prod(self.get_nodeattr("Stride"))
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "distributed":
+            ram_luts = int(
+                (k + stride)
+                * (
+                    simd
+                    * self.get_input_datatype().bitwidth()
+                    * math.ceil(ifm_dim * ifm_ch / simd / 64)
+                )
+            )
+        else:
+            ram_luts = 0
+        return 300 + ram_luts
+
+    def uram_estimation(self):
+        # NOTE: not tested for correctness
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
+        k = np.prod(self.get_nodeattr("ConvKernelDim"))
+        stride = np.prod(self.get_nodeattr("Stride"))
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "ultra":
+            return int(
+                (k + stride)
+                * (
+                    math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
+                    * math.ceil(ifm_dim * ifm_ch / simd / 4096)
+                )
+            )
+        else:
+            return 0
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch)."""
+        if self.get_input_datatype() == DataType.BIPOLAR:
+            # store bipolar activations as binary
+            inp = (inp + 1) / 2
+            export_idt = DataType.BINARY
+        else:
+            export_idt = self.get_input_datatype()
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim \
+            did not produce expected ofolded utput shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        # binary -> bipolar if needed
+        if self.get_output_datatype() == DataType.BIPOLAR:
+            out = context[node.output[0]]
+            out = 2 * out - 1
+            context[node.output[0]] = out
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output
+        shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
+
+    def defines(self, var):
+        numReps = 1
+        (
+            ifm_ch,
+            ifm_dim,
+            ofm_dim,
+            k,
+            stride,
+            dilation,
+        ) = self.get_1d_conv_attrs_normalized()
+        simd = self.get_nodeattr("SIMD")
+        ifm_precision = self.get_input_datatype().bitwidth()
+        ifm_dim_y, ifm_dim_x = ifm_dim
+        ofm_dim_y, ofm_dim_x = ofm_dim
+        k_y, k_x = k
+        dilation_y, dilation_x = dilation
+        # For a 1d convolution with stride=[S, 1] or [1, S], the finn-hlslib
+        # ConvInpGen function must be created with [stride_y, stride_x] = [S, S].
+        # Since the dummy dimension always has stride 1, np.prod(stride) yields S
+        # (e.g. Stride=[1, 2] gives stride_y = stride_x = 2).
+        # TODO: changes in finn-hlslib (slidingwindow.h)
+        stride_y = np.prod(stride)
+        stride_x = np.prod(stride)
+
+        if dilation_x > 1:
+            assert (
+                dilation_y == 1
+            ), "Dilation value greater than 1 along y-axis is not yet supported"
+            self.code_gen_dict["$DEFINES$"] = [
+                """
+            #define ConvKernelDim1_x {}\n
+            #define ConvKernelDim1_y {}\n
+            #define IFMChannels1 {}\n
+            #define Input_precision1 {}\n
+            #define IFMDim1_x {}\n
+            #define IFMDim1_y {}\n
+            #define OFMDim1_x {}\n
+            #define OFMDim1_y {}\n
+            #define SIMD1 {}\n
+            #define Stride1_x {}\n
+            #define Stride1_y {}\n
+            #define Dilation1_x {}\n
+            #define Dilation1_y {}\n
+            #define numReps {}
+            """.format(
+                    k_x,
+                    k_y,
+                    ifm_ch,
+                    ifm_precision,
+                    ifm_dim_x,
+                    ifm_dim_y,
+                    ofm_dim_x,
+                    ofm_dim_y,
+                    simd,
+                    stride_x,
+                    stride_y,
+                    dilation_x,
+                    dilation_y,
+                    numReps,
+                )
+            ]
+        else:
+            self.code_gen_dict["$DEFINES$"] = [
+                """
+            #define ConvKernelDim1_x {}\n
+            #define ConvKernelDim1_y {}\n
+            #define IFMChannels1 {}\n
+            #define Input_precision1 {}\n
+            #define IFMDim1_x {}\n
+            #define IFMDim1_y {}\n
+            #define OFMDim1_x {}\n
+            #define OFMDim1_y {}\n
+            #define SIMD1 {}\n
+            #define Stride1_x {}\n
+            #define Stride1_y {}\n
+            #define numReps {}
+            """.format(
+                    k_x,
+                    k_y,
+                    ifm_ch,
+                    ifm_precision,
+                    ifm_dim_x,
+                    ifm_dim_y,
+                    ofm_dim_x,
+                    ofm_dim_y,
+                    simd,
+                    stride_x,
+                    stride_y,
+                    numReps,
+                )
+            ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        ram_style = self.get_nodeattr("ram_style")
+        map_to_hls_ram_style = {
+            "auto": "ap_resource_dflt()",
+            "block": "ap_resource_bram()",
+            "distributed": "ap_resource_lutram()",
+            "ultra": "ap_resource_uram()",
+        }
+        hls_ram_style = map_to_hls_ram_style[ram_style]
+        hls_call = "ConvolutionInputGenerator"
+        # check which ConvolutionInputGenerator is needed
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+
+        hls_call += "_NonSquare"
+        if dilation_h > 1 or dilation_w > 1:
+            hls_call += "_Dilated"
+            if self.get_nodeattr("depthwise") == 1:
+                hls_call += "_dws"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
+                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y,
+                Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        elif self.get_nodeattr("depthwise") == 1:
+            hls_call += "_dws"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
+                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
+                (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
+                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
+                (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
+                self.onnx_node.name
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index 27dfab54ec6d483d948dd383e54a44117d7c1a65..99f959bf59f06d6ab5b71dd7245d657f4964cca4 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -17,9 +17,17 @@ class FMPadding_Batch(HLSCustomOp):
     def get_nodeattr_types(self):
         my_attrs = {
             # spatial size of input images
-            "ImgDim": ("i", True, 0),
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
             # total padding (per dimension) to apply
-            "Padding": ("i", True, 2),
+            # NOTE: the current padding scheme tries to pad the same number of
+            # zeros before and after the image along each dimension.
+            # For example, a padding of [1, x, 3, x] is treated the same
+            # as [2, x, 2, x]
+            "Padding": (
+                "ints",
+                True,
+                [1, 1, 1, 1],
+            ),  # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
             # number of channels in input image
             "NumChannels": ("i", True, 0),
             # SIMD Input parallelism
@@ -38,31 +46,33 @@ class FMPadding_Batch(HLSCustomOp):
 
     def get_padded_odim(self):
         "Return the padded spatial size of the output."
-
-        idim = self.get_nodeattr("ImgDim")
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
         pad = self.get_nodeattr("Padding")
-        return idim + pad
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        odim_h = idim_h + pad_h
+        odim_w = idim_w + pad_w
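+        # e.g. (hypothetical) ImgDim=[28, 28] with Padding=[1, 1, 1, 1]
+        # yields odim_h = odim_w = 30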
+        return [odim_h, odim_w]
 
     def get_exp_cycles(self):
-        odim = self.get_padded_odim()
+        odim_h, odim_w = self.get_padded_odim()
         channels = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
         batch_size = self.get_nodeattr("numInputVectors")
-        exp_cycles = (channels / simd) * batch_size * odim * odim
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
         return int(exp_cycles)
 
     def get_normal_input_shape(self):
-        idim = self.get_nodeattr("ImgDim")
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
-
-        ishape = (1, idim, idim, num_ch)
+        ishape = (1, idim_h, idim_w, num_ch)
         return ishape
 
     def get_normal_output_shape(self):
-        odim = self.get_padded_odim()
+        odim_h, odim_w = self.get_padded_odim()
         num_ch = self.get_nodeattr("NumChannels")
 
-        oshape = (1, odim, odim, num_ch)
+        oshape = (1, odim_h, odim_w, num_ch)
         return oshape
 
     def get_folded_input_shape(self):
@@ -148,20 +158,53 @@ class FMPadding_Batch(HLSCustomOp):
         self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
 
     def defines(self, var):
-        self.code_gen_dict["$DEFINES$"] = [
-            """#define ImgDim1 {}\n#define OutputDim1 {}\n
-            #define Padding1 {}\n#define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n#define numReps {}
-            #define SIMD1 {}\n""".format(
-                self.get_nodeattr("ImgDim"),
-                self.get_padded_odim(),
-                self.get_nodeattr("Padding"),
-                self.get_nodeattr("NumChannels"),
-                self.get_nodeattr("PaddingStyle"),
-                self.get_nodeattr("numInputVectors"),
-                self.get_nodeattr("SIMD"),
-            )
-        ]
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        odim_h, odim_w = self.get_padded_odim()
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        is_square = idim_h == idim_w
+
+        if is_square:
+            assert (
+                pad_h == pad_w
+            ), "Only equal padding along the dimensions for square images is supported"
+            self.code_gen_dict["$DEFINES$"] = [
+                """#define ImgDim1 {}\n#define OutputDim1 {}\n
+                #define Padding1 {}\n#define NumChannels1 {}\n
+                #define SIMD1 {}\n#define PaddingStyle1 {}\n
+                #define numReps {}\n""".format(
+                    idim_h,
+                    odim_h,
+                    pad_h,
+                    self.get_nodeattr("NumChannels"),
+                    self.get_nodeattr("SIMD"),
+                    self.get_nodeattr("PaddingStyle"),
+                    self.get_nodeattr("numInputVectors"),
+                )
+            ]
+        else:
+            self.code_gen_dict["$DEFINES$"] = [
+                """
+                #define OutputDim1_x {}\n
+                #define OutputDim1_y {}\n
+                #define Padding1_x {}\n
+                #define Padding1_y {}\n
+                #define NumChannels1 {}\n
+                #define SIMD1 {}\n
+                #define PaddingStyle1 {}\n
+                #define numReps {}\n
+                """.format(
+                    odim_w,
+                    odim_h,
+                    pad_w,
+                    pad_h,
+                    self.get_nodeattr("NumChannels"),
+                    self.get_nodeattr("SIMD"),
+                    self.get_nodeattr("PaddingStyle"),
+                    self.get_nodeattr("numInputVectors"),
+                )
+            ]
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -193,12 +236,26 @@ class FMPadding_Batch(HLSCustomOp):
     def docompute(self):
         in_t = self.get_input_datatype().get_hls_datatype_str()
         node = self.onnx_node
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
-            {}, PaddingStyle1> (in0, out, numReps);""".format(
-                node.op_type, in_t
-            )
-        ]
+
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        is_square = idim_h == idim_w
+
+        if is_square:
+            hls_call = node.op_type
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
+                {}, PaddingStyle1> (in0, out, numReps);""".format(
+                    hls_call, in_t
+                )
+            ]
+        else:
+            hls_call = "FMPadding_nonsquare_Batch"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<OutputDim1_x, OutputDim1_y, Padding1_x, Padding1_y, NumChannels1,
+                SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format(
+                    hls_call, in_t
+                )
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -270,7 +327,7 @@ class FMPadding_Batch(HLSCustomOp):
         assert (
             inp.shape == exp_ishape
         ), """Input shape doesn't
-        match expected shape (1, ImgDim, ImgDim, NumChannels)."""
+        match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
         export_idt = self.get_input_datatype()
 
         reshaped_input = inp.reshape(folded_ishape)
@@ -316,4 +373,4 @@ class FMPadding_Batch(HLSCustomOp):
         assert (
             context[node.output[0]].shape == exp_oshape
         ), """Output shape doesn't match expected shape
-            (1, OutputDim, OutputDim, NumChannels)."""
+            (1, OutputDim_H, OutputDim_W, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 2ab070b2fdc059a554930345a81abc368c29bfa7..c07188430244b635ab6b1ec192337da74550d57d 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -38,11 +38,11 @@ from finn.util.basic import (
     roundup_to_integer_multiple,
     get_rtlsim_trace_depth,
 )
-from finn.util.fpgadataflow import (
-    IPGenBuilder,
+from finn.util.pyverilator import (
     pyverilate_get_liveness_threshold_cycles,
     rtlsim_multi_io,
 )
+from finn.util.hls import CallHLS
 from . import templates
 
 try:
@@ -310,11 +310,11 @@ class HLSCustomOp(CustomOp):
         return []
 
     def ipgen_singlenode_code(self):
-        """Builds the bash script for ip generation using the IPGenBuilder from
-        finn.util.fpgadataflow."""
+        """Builds the bash script for ip generation using the CallHLS from
+        finn.util.hls."""
         node = self.onnx_node
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        builder = IPGenBuilder()
+        builder = CallHLS()
         builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name))
         builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name))
         builder.build(code_gen_dir)
diff --git a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
new file mode 100644
index 0000000000000000000000000000000000000000..53446ff1f2aba30e69bf188c1673c738440567fb
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.custom_op.base import CustomOp
+
+# TODO move StreamingDataflowPartition to HLSCustomOp base class
+
+
+class StreamingDataflowPartition(CustomOp):
+    """Class that corresponds to the meta/container node StreamingDataflowPartition
+    which is a placeholder for a group of fpgadataflow nodes that have been separated
+    out into a FINN-ONNX model of its own. Note that it does not produce any HLS or
+    bitfile by itself."""
+
+    def get_nodeattr_types(self):
+        return {
+            "model": ("s", True, ""),
+            "res_estimate": ("s", False, ""),
+            "res_hls": ("s", False, ""),
+            "res_synth": ("s", False, ""),
+            "slr": ("i", False, -1),
+            "partition_id": ("i", False, 0),
+            "device_id": ("i", False, 0),
+            "mem_port": ("s", False, ""),
+        }
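+
+    # The "model" attribute points to a separate FINN-ONNX file that holds the
+    # partitioned subgraph. A typical (illustrative) way to retrieve it with the
+    # existing getCustomOp/ModelWrapper utilities:
+    #   sdp_inst = getCustomOp(sdp_node)
+    #   dataflow_model = ModelWrapper(sdp_inst.get_nodeattr("model"))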
+
+    def make_shape_compatible_op(self, model):
+        pass
+
+    def infer_node_datatype(self, model):
+        pass
+
+    def execute_node(self, context, graph):
+        # TODO add RPC execution with synthesized bitfile?
+        # whole-design rtlsim with PyVerilator may also be an alternative
+        pass
+
+    def verify_node(self):
+        info_messages = []
+
+        # verify number of attributes
+        num_of_attr = 1
+        if len(self.onnx_node.attribute) == num_of_attr:
+            info_messages.append("The number of attributes is correct")
+        else:
+            info_messages.append(
+                """The number of attributes is incorrect,
+            {} should have {} attributes""".format(
+                    self.onnx_node.op_type, num_of_attr
+                )
+            )
+        # verify that all necessary attributes exist
+        try:
+            self.get_nodeattr("model")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The necessary attributes do not exist.
+                StreamingDataflowPartition needs the following attribute(s):
+                model"""
+            )
+
+        # verify the number of inputs
+        if len(self.onnx_node.input) >= 1:
+            info_messages.append("The number of inputs is correct")
+        else:
+            info_messages.append("StreamingDataflowPartition needs 1 data input")
+
+        return info_messages
diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
index 9a897d9fa16064017dfc02f500d2360ae8431b4a..fead30650c60d38f9cd70de8f1515f847e15276f 100644
--- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -26,9 +26,9 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def get_nodeattr_types(self):
         my_attrs = {
             "PE": ("i", True, 0),
-            "Dim": ("i", True, 0),
+            "Dim": ("ints", True, []),  # [H, W]
             "Channels": ("i", True, 0),
-            "Kernel": ("i", True, 0),
+            "Kernel": ("ints", True, []),  # [H, W]
             "resType": ("s", False, "auto", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
@@ -45,10 +45,10 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
 
     def minimize_accumulator_width(self, model):
         weights = model.get_initializer(self.onnx_node.input[1])
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         fm = self.get_nodeattr("Channels")
         # put weights into the shape expected by calculate_matvec_accumulator_range
-        weights = weights.reshape(fm, k * k).transpose()
+        weights = weights.reshape(fm, k_h * k_w).transpose()
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
         else:
@@ -85,9 +85,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                     tdt = DataType.get_smallest_possible(0 - tdt_max)
             else:
                 tdt = DataType.get_smallest_possible(tdt_max)
-            assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
-                "Thresholds in %s can't be expressed with type %s"
-                % (self.onnx_node.name, str(tdt))
+            assert np.vectorize(tdt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(tdt),
             )
             self.set_nodeattr("accDataType", tdt.name)
         else:
@@ -110,9 +112,9 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def calc_wmem(self):
         """Calculates and returns WMEM."""
         ch = self.get_nodeattr("Channels")
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         pe = self.get_nodeattr("PE")
-        wmem = k * k * ch // pe
+        wmem = k_h * k_w * ch // pe
         return wmem
 
     def calc_tmem(self):
@@ -181,34 +183,34 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         return out_width
 
     def get_folded_input_shape(self):
-        k = self.get_nodeattr("Kernel")
-        sf = k * k
-        dim = self.get_nodeattr("Dim")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        sf = k_h * k_w
+        dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
-        folded_input_shape = tuple([1, dim, dim, sf * nf, pe])
+        folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe])
         return folded_input_shape
 
     def get_folded_output_shape(self):
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
-        dim = self.get_nodeattr("Dim")
-        folded_output_shape = tuple([1, dim, dim, nf, pe])
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        folded_output_shape = tuple([1, dim_h, dim_w, nf, pe])
         return folded_output_shape
 
     def get_normal_input_shape(self):
-        dim = self.get_nodeattr("Dim")
+        dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
-        k = self.get_nodeattr("Kernel")
-        normal_input_shape = tuple([1, dim, dim, k * k * ch])
+        k_h, k_w = self.get_nodeattr("Kernel")
+        normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch])
         return normal_input_shape
 
     def get_normal_output_shape(self):
         ch = self.get_nodeattr("Channels")
-        dim = self.get_nodeattr("Dim")
-        normal_output_shape = tuple([1, dim, dim, ch])
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        normal_output_shape = tuple([1, dim_h, dim_w, ch])
         return normal_output_shape
 
     def get_number_output_values(self):
@@ -218,13 +220,13 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         ch = self.get_nodeattr("Channels")
-        dim = self.get_nodeattr("Dim")
-        k = self.get_nodeattr("Kernel")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        k_h, k_w = self.get_nodeattr("Kernel")
         # currently FINN only supports a batch size of 1 for the VVAU
         batch_size = 1
         # mmv != 1 is not supported yet, so mmv is fixed to 1 for now
         mmv = 1
-        exp_cycles = ((ch * k * k) / pe) * batch_size * (dim * dim) / mmv
+        exp_cycles = ((ch * k_h * k_w) / pe) * batch_size * (dim_h * dim_w) / mmv
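+        # e.g. Channels=4, PE=2, Kernel=[3, 3], Dim=[10, 10]:
+        # exp_cycles = ((4 * 3 * 3) / 2) * 1 * (10 * 10) / 1 = 1800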
         return int(exp_cycles)
 
     def get_template_param_values(self):
@@ -251,17 +253,17 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
         pe = self.get_nodeattr("PE")
         ch = self.get_nodeattr("Channels")
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         wmem = self.calc_wmem()
         assert orig_weight_matrix.shape == (
             ch,
             1,
-            k,
-            k,
+            k_h,
+            k_w,
         ), """Weights matrix doesn't
         have expected shape (channels, 1, kernel_size, kernel_size)"""
         ret = orig_weight_matrix
-        ret = ret.reshape(ch, k * k)
+        ret = ret.reshape(ch, k_h * k_w)
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         ret = ret.reshape(1, pe, wmem, 1)
@@ -338,9 +340,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                 threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
                 # get computed threshold datatype from attribute
                 tdt = DataType[self.get_nodeattr("accDataType")]
-                assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
-                    "Thresholds in %s can't be expressed with type %s"
-                    % (self.onnx_node.name, str(tdt))
+                assert np.vectorize(tdt.allowed)(
+                    threshold_tensor
+                ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                    self.onnx_node.name,
+                    str(tdt),
                 )
                 thresholds_hls_code = numpy_to_hls_code(
                     threshold_tensor, tdt, "thresholds", False, True
@@ -455,10 +459,10 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
     def defines(self, var):
-        dim = self.get_nodeattr("Dim")
-        numReps = 1 * dim * dim
-        kernel = self.get_nodeattr("Kernel")
-        innerProdDim = kernel * kernel
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        numReps = 1 * dim_h * dim_w
+        k_h, k_w = self.get_nodeattr("Kernel")
+        innerProdDim = k_h * k_w
         self.code_gen_dict["$DEFINES$"] = [
             """#define Channels1 {}\n #define InnerProdDim {}\n
             #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
@@ -664,8 +668,8 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         else:
             mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A)
         # accumulator
-        k = self.get_nodeattr("Kernel")
-        acc_bits = W + A + math.ceil(math.log(k * k, 2))
+        k_h, k_w = self.get_nodeattr("Kernel")
+        acc_bits = W + A + math.ceil(math.log(k_h * k_w, 2))
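+        # e.g. W=4, A=4, Kernel=[3, 3]: acc_bits = 4 + 4 + ceil(log2(9)) = 12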
         acc_luts = acc_bits
         # thresholds and threshold comparators
         thr_luts = 0
@@ -694,20 +698,20 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         return int(mult_dsp)
 
     def get_op_and_param_counts(self):
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         fm = self.get_nodeattr("Channels")
-        dim = self.get_nodeattr("Dim")
+        dim_h, dim_w = self.get_nodeattr("Dim")
         weight_bits = self.get_weight_datatype().bitwidth()
         inp_bits = self.get_input_datatype().bitwidth()
-        num_repetitions = int(dim * dim)
-        mac_count = k * k * fm * num_repetitions
+        num_repetitions = int(dim_h * dim_w)
+        mac_count = k_h * k_w * fm * num_repetitions
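+        # e.g. Kernel=[3, 3], Channels=4, Dim=[10, 10]:
+        # mac_count = 3 * 3 * 4 * (10 * 10) = 3600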
         # canonicalize op type: highest bitwidth operand first s.t.
         # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
         bw1 = min(inp_bits, weight_bits)
         bw2 = max(inp_bits, weight_bits)
         mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
         weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = k * k * fm
+        weight_count = k_h * k_w * fm
         ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
         if self.get_nodeattr("noActivation") == 0:
             tdt = DataType[self.get_nodeattr("accDataType")]
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index a221b510ab8d22f4daca1c32e717a9b482246712..1f3d40e929e29d16790a491bbfd7a4a5033f866f 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -64,30 +64,28 @@ class InferConvInpGen(Transformation):
                     warnings.warn("Input is not int. Can't infer ConvInpGen")
                     continue
                 i2c_inst = getCustomOp(n)
-                stride = i2c_inst.get_nodeattr("stride")
-                k_attr = i2c_inst.get_nodeattr("kernel_size")
-                k_h = k_attr[0]
-                k_w = k_attr[1]
+                stride_h, stride_w = i2c_inst.get_nodeattr("stride")
+                k_h, k_w = i2c_inst.get_nodeattr("kernel_size")
                 pad_attr = i2c_inst.get_nodeattr("pad_amount")
                 pad_h = pad_attr[0] + pad_attr[2]
                 pad_w = pad_attr[1] + pad_attr[3]
+                dilation_h, dilation_w = i2c_inst.get_nodeattr("dilations")
                 # temporary checks until non-square conv support is finalized
-                assert pad_h == pad_w, "Non-square images not yet supported."
-                assert k_h == k_w, "Non-square kernels not yet supported."
-                k = k_h
-                pad = pad_attr[0]
                 pad_val = i2c_inst.get_nodeattr("pad_value")
                 depthwise = i2c_inst.get_nodeattr("depthwise")
                 ifm_ch = i2c_in_shape[-1]
-                ifm_dim = i2c_in_shape[1]
-                ofm_dim = i2c_out_shape[1]
+                ifm_dim_h = i2c_in_shape[1]
+                ifm_dim_w = i2c_in_shape[2]
+                ofm_dim_h = i2c_out_shape[1]
+                ofm_dim_w = i2c_out_shape[2]
 
                 # default params for ConvolutionInputGenerator
                 ConvInpGen_node_idx = node_ind
                 ConvInpGen_input = i2c_input
-                ConvInpGen_idim = ifm_dim
+                ConvInpGen_idim_h = ifm_dim_h
+                ConvInpGen_idim_w = ifm_dim_w
 
-                if pad > 0:
+                if pad_h > 0 or pad_w > 0:
                     # if padding enabled, ensure pad_val supported by DataType
                     # assert dt.allowed(pad_val),"""FMPadding_Batch DataType
                     # must support pad_val"""
@@ -95,12 +93,13 @@ class InferConvInpGen(Transformation):
                         pad_val == 0
                     ), "FMPadding_Batch doesn't currently support pad_val!= 0"
 
-                    odim_padding = ifm_dim + 2 * pad
+                    odim_padding_h = ifm_dim_h + pad_h
+                    odim_padding_w = ifm_dim_w + pad_w
 
                     padding_out = helper.make_tensor_value_info(
                         model.make_new_valueinfo_name(),
                         TensorProto.FLOAT,
-                        (1, odim_padding, odim_padding, ifm_ch),
+                        (1, odim_padding_h, odim_padding_w, ifm_ch),
                     )
                     graph.value_info.append(padding_out)
                     padding_out = padding_out.name
@@ -108,7 +107,8 @@ class InferConvInpGen(Transformation):
 
                     ConvInpGen_node_idx += 1
                     ConvInpGen_input = padding_out
-                    ConvInpGen_idim = odim_padding
+                    ConvInpGen_idim_h = odim_padding_h
+                    ConvInpGen_idim_w = odim_padding_w
 
                     padding_node = helper.make_node(
                         "FMPadding_Batch",
@@ -116,15 +116,31 @@ class InferConvInpGen(Transformation):
                         [padding_out],
                         domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
-                        ImgDim=ifm_dim,
-                        Padding=2 * pad,
+                        ImgDim=[ifm_dim_h, ifm_dim_w],
+                        Padding=pad_attr,
                         NumChannels=ifm_ch,
                         inputDataType=dt.name,
                         SIMD=ifm_ch,
                     )
                     graph.node.insert(node_ind, padding_node)
 
-                if stride > 1 and k == 1:
+                # Ensure that only supported HLS nodes are inserted
+                is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w
+                is_square_kernel = k_h == k_w
+                is_kernel_pointwise = k_h == 1 and k_w == 1
+                is_equal_stride = stride_h == stride_w
+                is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or (
+                    k_h > 1 and k_w == 1 and ifm_dim_w == 1
+                )
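+                # e.g. a [4, 1] kernel on a [10, 1] image qualifies as 1D:
+                # kernel and image are both one pixel wide (or, mirrored, tall)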
+
+                if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
+                    assert (
+                        is_square_image
+                    ), "DownSampler currently only supports square input images."
+                    assert is_equal_stride, """DownSampler currently only supports equal stride values
+                        along both axes."""
+                    ConvInpGen_idim = ConvInpGen_idim_h
+                    stride = stride_h
                     # create DownSampler node
                     ConvInpGen_node = helper.make_node(
                         "DownSampler",
@@ -141,22 +157,58 @@ class InferConvInpGen(Transformation):
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 else:
                     # create equivalent ConvolutionInputGenerator node
-                    ConvInpGen_node = helper.make_node(
-                        "ConvolutionInputGenerator",
-                        [ConvInpGen_input],
-                        [i2c_output],
-                        domain="finn.custom_op.fpgadataflow",
-                        backend="fpgadataflow",
-                        ConvKernelDim=k,
-                        IFMChannels=ifm_ch,
-                        IFMDim=ConvInpGen_idim,
-                        OFMDim=ofm_dim,
-                        SIMD=ifm_ch,
-                        Stride=stride,
-                        inputDataType=dt.name,
-                        outputDataType=dt.name,
-                        depthwise=depthwise,
-                    )
+                    if (
+                        is_square_image and is_square_kernel
+                    ):  # square images and square kernels
+                        assert is_equal_stride, """Non-equal strides along different axes are not supported
+                            for (non-)square convolutions"""
+                        assert (
+                            dilation_h == 1 and dilation_w == 1
+                        ), """Dilation value != 1 is not supported
+                            for square convolutions"""
+                        ConvInpGen_node = helper.make_node(
+                            "ConvolutionInputGenerator",
+                            [ConvInpGen_input],
+                            [i2c_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            ConvKernelDim=[k_h, k_w],
+                            IFMChannels=ifm_ch,
+                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                            OFMDim=[ofm_dim_h, ofm_dim_w],
+                            SIMD=ifm_ch,
+                            Stride=[stride_h, stride_w],
+                            Dilation=[dilation_h, dilation_w],
+                            inputDataType=dt.name,
+                            outputDataType=dt.name,
+                            depthwise=depthwise,
+                        )
+                    else:  # non-square images and/or kernels
+                        assert (
+                            is_1d_convolution
+                        ), "ConvultionInputGenerator1D works only for 1D convolutions"
+                        if dilation_h > 1 or dilation_w > 1:
+                            assert (
+                                stride_h == 1 and stride_w == 1
+                            ), """Stride value of greater than 1 is not supported for convolutions
+                                with dilation value greater than 1"""
+                        ConvInpGen_node = helper.make_node(
+                            "ConvolutionInputGenerator1D",
+                            [ConvInpGen_input],
+                            [i2c_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            ConvKernelDim=[k_h, k_w],
+                            IFMChannels=ifm_ch,
+                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                            OFMDim=[ofm_dim_h, ofm_dim_w],
+                            SIMD=ifm_ch,
+                            Stride=[stride_h, stride_w],
+                            Dilation=[dilation_h, dilation_w],
+                            inputDataType=dt.name,
+                            outputDataType=dt.name,
+                            depthwise=depthwise,
+                        )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
@@ -338,7 +390,7 @@ class InferPool_Batch(Transformation):
                     [im2col_in],
                     [im2col_out],
                     domain="finn.custom_op.general",
-                    stride=stride,
+                    stride=[stride, stride],
                     kernel_size=[k, k],
                     pad_amount=[pad, pad, pad, pad],
                     pad_value=pad_value,
@@ -684,7 +736,7 @@ class InferVVAU(Transformation):
             ):
                 sparsity = model.get_tensor_sparsity(n.input[1])
                 try:
-                    k = sparsity["dw"]["kernel_shape"]
+                    k_h, k_w = sparsity["dw"]["kernel_shape"]
                 except KeyError:
                     raise Exception(
                         """Sparsity doesn't indicate that MatMul
@@ -702,25 +754,25 @@ class InferVVAU(Transformation):
                     mm_output = n.output[0]
                     W = model.get_initializer(mm_weight)
                     # infer dense weight tensor from sparse weight matrix
-                    # kernel size k which was extracted above and the value of
+                    # kernel size (k_h, k_w) which was extracted above and the value of
                     # the channels is used.
-                    # the weight matrix has a shape of (k * k * Channels, Channels)
+                    # the weight matrix has a shape of (k_h * k_w * Channels, Channels)
                     # we need to reverse the creation of the sparse weight matrix
-                    # to achieve a weight tensor of shape (Channels, 1, k, k)
+                    # to achieve a weight tensor of shape (Channels, 1, k_h, k_w)
                     channels = int(W.shape[1])
-                    # transpose to achieve a shape of (k * k * Channels, Channels)
+                    # transpose to achieve a shape of (k_h * k_w * Channels, Channels)
                     W = W.T
-                    # reshape to (Channels, k, k, Channels) to transpose afterwards
-                    # to (Channels, Channels, k, k)
-                    W = W.reshape(channels, k, k, channels)
+                    # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards
+                    # to (Channels, Channels, k_h, k_w)
+                    W = W.reshape(channels, k_h, k_w, channels)
                     W = W.transpose(0, 3, 1, 2)
                     # now we can extract the values using a for loop over the channels
                     # and fill a zero numpy array in the correct shape
-                    w_tensor = np.zeros((channels, 1, k, k))
+                    w_tensor = np.zeros((channels, 1, k_h, k_w))
                     for ch in range(channels):
                         w_tensor[ch][0] = W[ch][ch]
                     model.set_initializer(mm_weight, w_tensor)
-                    model.set_tensor_shape(mm_weight, (channels, 1, k, k))
+                    model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w))
                     # create node with pe=channels as default
                     pe = channels
                     assert (
@@ -762,9 +814,9 @@ class InferVVAU(Transformation):
                             backend="fpgadataflow",
                             resType="lut",
                             PE=pe,
-                            Dim=mm_in_shape[1],
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
                             Channels=channels,
-                            Kernel=k,
+                            Kernel=[k_h, k_w],
                             inputDataType=idt.name,
                             weightDataType=wdt.name,
                             outputDataType=odt.name,
@@ -790,9 +842,9 @@ class InferVVAU(Transformation):
                             backend="fpgadataflow",
                             resType="lut",
                             PE=pe,
-                            Dim=mm_in_shape[1],
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
                             Channels=channels,
-                            Kernel=k,
+                            Kernel=[k_h, k_w],
                             inputDataType=idt.name,
                             weightDataType=wdt.name,
                             outputDataType=odt.name,
@@ -1345,7 +1397,11 @@ class InferGlobalAccPoolLayer(Transformation):
                 )
                 model.graph.value_info.append(mul_value)
                 model.set_initializer(mul_value.name, np.array(1 / (vecs[1] * vecs[2])))
-                new_mul = helper.make_node("Mul", [pool_out, mul_value.name], [result],)
+                new_mul = helper.make_node(
+                    "Mul",
+                    [pool_out, mul_value.name],
+                    [result],
+                )
                 graph.node.insert(insert_point, new_pool)
                 graph.node.insert(insert_point + 1, new_mul)
                 node_ind += 1
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index 56bfb4306e555c716a9156d6f0949c339193eb38..419a6d8c494651862f55e63e6829a61fe8040599 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -148,7 +148,7 @@ class CreateDataflowPartition(Transformation):
                     [df_out],
                     # use the model attribute to mark the df model
                     model=df_model_filename,
-                    domain="finn.custom_op.general",
+                    domain="finn.custom_op.fpgadataflow",
                     partition_id=target_partition_id,
                     slr=slr,
                     mem_port=mem_port,
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index f7d59978d8f8866aefb3028d570bb6b434df33b4..ea27eee04db6f90b50a58296ceaf6f6ed58602ac 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -39,8 +39,8 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
-from finn.util.fpgadataflow import pyverilate_stitched_ip, is_fpgadataflow_node
-from finn.util.pyverilator import reset_rtlsim, toggle_clk
+from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.pyverilator import pyverilate_stitched_ip, reset_rtlsim, toggle_clk
 
 
 def reset_implementation(node):
diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfdb21fa72cbbeeb503f7ecc447b659ef7934fb9
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from onnx import TensorProto, helper
+import numpy as np
+import pytest
+
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.custom_op.general.im2col import compute_conv_output_dim
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+
+
+# conv_config:
+# [pad_h_begin, pad_w_begin, pad_h_end, pad_w_end]
+# [kernel_size_h, kernel_size_w]
+# [stride_h, stride_w]
+# [dilation_h, dilation_w]
+@pytest.mark.parametrize(
+    "conv_config",
+    [
+        [[0, 0, 0, 0], [4, 1], [1, 1], [1, 1]],
+        [[1, 0, 1, 0], [4, 1], [1, 1], [1, 1]],
+        [[1, 0, 1, 0], [4, 1], [2, 1], [1, 1]],
+        # [[1, 0, 1, 0], [4, 1], [1, 1], [2, 1]]
+    ],
+)
+@pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
+    pad, kernel_size, stride, dilation = conv_config
+    np.random.seed(0)
+    idt = DataType.UINT4
+
+    in_feature_dim_h, in_feature_dim_w = [10, 1]
+    in_chn = 16
+
+    k_h, k_w = kernel_size
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    pad_h = pad[0] + pad[2]
+    pad_w = pad[1] + pad[3]
+
+    if depthwise is True:
+        group = out_chn = in_chn
+        conv_param_shape = [out_chn, 1, k_h, k_w]
+    else:
+        group = 1
+        out_chn = 20
+        conv_param_shape = [out_chn, in_chn, k_h, k_w]
+
+    out_feature_dim_h = compute_conv_output_dim(
+        in_feature_dim_h, k_h, stride_h, pad_h, dilation_h
+    )
+    out_feature_dim_w = compute_conv_output_dim(
+        in_feature_dim_w, k_w, stride_w, pad_w, dilation_w
+    )
+
+    input_shape = [1, in_chn, in_feature_dim_h, in_feature_dim_w]
+    output_shape = [1, out_chn, out_feature_dim_h, out_feature_dim_w]
+
+    conv_weight_dt = DataType.UINT4
+
+    conv_config = {}
+    conv_config["dilations"] = [dilation_h, dilation_w]
+    conv_config["group"] = group
+    conv_config["kernel_shape"] = [k_h, k_w]
+    conv_config["pads"] = pad
+    conv_config["strides"] = [stride_h, stride_w]
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+    value_info = [
+        helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
+    ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="conv_test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)
+            ],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("top_in", idt)
+    model.set_tensor_datatype("top_out", idt)
+    model.set_tensor_datatype("p1", conv_weight_dt)
+    model.set_initializer("p1", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape))
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    new_model = model.transform(LowerConvsToMatMul())
+    new_model = new_model.transform(to_hls.InferConvInpGen())
+    if depthwise is True:
+        new_model = new_model.transform(to_hls.InferVVAU())
+    else:
+        new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
+        fc_node = new_model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+        fc_inst = getCustomOp(fc_node)
+        mw = fc_inst.get_nodeattr("MW")
+        mh = fc_inst.get_nodeattr("MH")
+        pe_cands = list(filter(lambda x: mh % x == 0, range(2, mh + 1)))
+        simd_cands = list(filter(lambda x: mw % x == 0, range(2, mw + 1)))
+        fc_inst.set_nodeattr("PE", pe_cands[0])
+        fc_inst.set_nodeattr("SIMD", simd_cands[0])
+
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    new_model = new_model.transform(InferShapes())
+    new_model = new_model.transform(InferDataTypes())
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+        new_model = new_model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    x = gen_finn_dt_tensor(idt, input_shape)
+    inp_dict = {model.graph.input[0].name: x}
+    assert oxe.compare_execution(model, new_model, inp_dict)
+
+    if pad_h > 0 or pad_w > 0:
+        padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        padding_inst = getCustomOp(padding_node)
+        assert padding_inst.get_nodeattr("SIMD") == in_chn
+
+    if depthwise is True and exec_mode == "rtlsim":
+        node = new_model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=11)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
index c406d78158c52226fea881c48bc178139653fea5..3efeacb6e6875c6defa799eb7154e02ce880e16a 100644
--- a/tests/fpgadataflow/test_depthwise_convolution.py
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -98,7 +98,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
         inputs=["inp"],
         outputs=["im2col_out"],
         kernel_size=[k, k],
-        stride=stride,
+        stride=[stride, stride],
         pad_amount=[padding, padding, padding, padding],
         input_shape="(1, {}, {}, {})".format(ifm_dim, ifm_dim, ifm_ch),
         depthwise=1,
@@ -142,7 +142,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
     W_matrix = W_matrix.reshape(ofm_ch, ifm_ch * k * k)
 
     model.set_initializer("W_sparse", W_matrix.T)
-    sparsity = {"dw": {"kernel_shape": k}}
+    sparsity = {"dw": {"kernel_shape": [k, k]}}
     model.set_tensor_sparsity("W_sparse", sparsity)
 
     if act is not None:
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 4e0e8c7c35a8fc8a30e0ba4c27a7c0d637e24d1f..1ec12263e22a199ac7da55fdf3418185cd38e555 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -47,7 +47,9 @@ from finn.custom_op.registry import getCustomOp
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
-def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt):
+def make_single_im2col_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
+):
     odt = idt
     inp = helper.make_tensor_value_info(
         "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]
@@ -61,12 +63,12 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
         ["inp"],
         ["outp"],
         domain="finn.custom_op.general",
-        backend="fpgadataflow",
-        stride=stride,
+        stride=[stride, stride],
         kernel_size=[k, k],
         input_shape=str((1, ifm_dim, ifm_dim, ifm_ch)),
         pad_amount=[0, 0, 0, 0],
         pad_value=0,
+        dilations=[dilation, dilation],
     )
     graph = helper.make_graph(
         nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
@@ -82,7 +84,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
 
 
 def make_single_slidingwindow_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
 ):
     odt = idt
     inp = helper.make_tensor_value_info(
@@ -98,12 +100,13 @@ def make_single_slidingwindow_modelwrapper(
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        ConvKernelDim=k,
+        ConvKernelDim=[k, k],
         IFMChannels=ifm_ch,
-        IFMDim=ifm_dim,
-        OFMDim=ofm_dim,
+        IFMDim=[ifm_dim, ifm_dim],
+        OFMDim=[ofm_dim, ofm_dim],
         SIMD=simd,
-        Stride=stride,
+        Stride=[stride, stride],
+        Dilation=[dilation, dilation],
         inputDataType=idt.name,
         outputDataType=odt.name,
         depthwise=dw,
@@ -138,6 +141,9 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("ifm_ch", [2, 4])
 # Stride
 @pytest.mark.parametrize("stride", [1, 2])
+# Dilation
+# Currently only dilation value of 1 is supported
+@pytest.mark.parametrize("dilation", [1])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # input channel parallelism ("SIMD")
@@ -147,13 +153,13 @@ def prepare_inputs(input_tensor):
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_slidingwindow(
-    idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw
+    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw
 ):
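+    # output dim for a "valid" (pad=0) convolution; dilation is fixed at 1 in
+    # this test, so it does not enter the formula below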
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
 
     x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
     model = make_single_slidingwindow_modelwrapper(
-        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw
+        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw
     )
 
     if exec_mode == "cppsim":
@@ -174,9 +180,10 @@ def test_fpgadataflow_slidingwindow(
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
     golden = make_single_im2col_modelwrapper(
-        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
     )
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
     if dw == 0:
         assert (y_produced == y_expected).all()
     else:
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c83aab0d683cdb3888aca3c46bb339bd6330917
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import numpy as np
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import gen_finn_dt_tensor
+
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.custom_op.general.im2col import compute_conv_output_dim
+
+
+def make_single_im2col_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
+):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    im2col_node = helper.make_node(
+        "Im2Col",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.general",
+        stride=[stride_h, stride_w],
+        kernel_size=[k_h, k_w],
+        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
+        dilations=[dilation_h, dilation_w],
+        pad_amount=[0, 0, 0, 0],
+        pad_value=0,
+    )
+    graph = helper.make_graph(
+        nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="im2col-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def make_single_slidingwindow_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
+):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    SlidingWindow_node = helper.make_node(
+        "ConvolutionInputGenerator1D",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ConvKernelDim=[k_h, k_w],
+        IFMChannels=ifm_ch,
+        IFMDim=[ifm_dim_h, ifm_dim_w],
+        OFMDim=[ofm_dim_h, ofm_dim_w],
+        SIMD=simd,
+        Stride=[stride_h, stride_w],
+        Dilation=[dilation_h, dilation_w],
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+        depthwise=dw,
+    )
+    graph = helper.make_graph(
+        nodes=[SlidingWindow_node],
+        name="slidingwindow_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="slidingwindow-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+# @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT8])
+@pytest.mark.parametrize("idt", [DataType.INT8])
+# kernel size
+@pytest.mark.parametrize("k", [[4, 1]])
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[10, 1]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [1, 4])
+# Stride
+@pytest.mark.parametrize("stride", [[1, 1], [2, 1]])
+# Dilation
+# @pytest.mark.parametrize("dilation", [[1, 1], [2, 1]])
+@pytest.mark.parametrize("dilation", [[1, 1]])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+# input channel parallelism ("SIMD")
+@pytest.mark.parametrize("simd", [1, 4])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+# Flip dimensions
+@pytest.mark.parametrize("flip", [False, True])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_slidingwindow_1d(
+    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip
+):
+    if flip:
+        k = k[::-1]
+        ifm_dim = ifm_dim[::-1]
+        stride = stride[::-1]
+        dilation = dilation[::-1]
+
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+
+    if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1):
+        pytest.skip(
+            """Dilation value greater than 1 and stride greater than 1
+            currently not supported for 1D convolutions"""
+        )
+    if simd > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
+
+    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+    ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
+    model = make_single_slidingwindow_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        simd=simd,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+        dw=dw,
+    )
+
+    if exec_mode == "cppsim":
+        model = model.transform(SetExecMode("cppsim"))
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")
+
+    # prepare input data
+    input_dict = prepare_inputs(x)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    golden = make_single_im2col_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        simd=simd,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+    )
+    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
+    if dw == 0:
+        assert (y_produced == y_expected).all()
+    else:
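+        # the im2col reference emits depthwise outputs in kernel-major order;
+        # reorder it to the channel-interleaved layout that the depthwise
+        # sliding window produces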
+        y_expected = y_expected.reshape(
+            1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
+        )
+        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+        y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
+        assert (y_produced == y_expected).all()
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index b2835d578b03ee689330d53a9a7b233c9b9f4222..ab47b300136bd95622f064b30e3bbaae76a61597 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -54,15 +54,20 @@ target_clk_ns = 10
 
 
 def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
+    pad_h = padding[0] + padding[2]
+    pad_w = padding[1] + padding[3]
+    idim_h, idim_w = idim
+
     assert pad_style == 2, "only pad_style == 2 supported in hlslib"
-    assert padding > 0, "Output dim should be greater than input dim"
-    odim = idim + padding
+    assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim"
+    odim_h = idim_h + pad_h
+    odim_w = idim_w + pad_w
 
     inp = helper.make_tensor_value_info(
-        "inp", TensorProto.FLOAT, [1, idim, idim, num_ch]
+        "inp", TensorProto.FLOAT, [1, idim_h, idim_w, num_ch]
     )
     outp = helper.make_tensor_value_info(
-        "outp", TensorProto.FLOAT, [1, odim, odim, num_ch]
+        "outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch]
     )
 
     FMPadding = helper.make_node(
@@ -94,9 +99,9 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 
 
 # input image dimension
-@pytest.mark.parametrize("idim", [8])
+@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
 # padding amounts: [pad_h_begin, pad_w_begin, pad_h_end, pad_w_end]
-@pytest.mark.parametrize("pad", [2, 3])
+@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]])
 # number of channels
 @pytest.mark.parametrize("num_ch", [2, 4])
 # Input parallelism
@@ -112,10 +117,22 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     if num_ch % simd != 0:
         pytest.skip(" num_ch % simd != 0, skipping")
+
+    idim_h, idim_w = idim
+    pad_h = pad[0] + pad[2]
+    pad_w = pad[1] + pad[3]
+
+    if idim_h == idim_w and pad_h != pad_w:
+        pytest.skip(
+            """Only equal padding along the dimensions for square images
+            is supported, skipping"""
+        )
+
     # generate input data
-    x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
+    x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch])
     input_dict = {"inp": x}
-    odim = idim + pad
+    odim_h = idim_h + pad_h
+    odim_w = idim_w + pad_w
 
     model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
     model = model.transform(InferShapes())
@@ -129,24 +146,26 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-    expected_oshape = (1, odim, odim, num_ch)
+    expected_oshape = (1, odim_h, odim_w, num_ch)
     assert y_produced.shape == expected_oshape
 
     # calculate reference
     # calculate correct pad according to parameters
     if pad_style == 2:
-        if pad % 2 == 0:
-            pad_up = pad // 2
-            pad_left = pad // 2
+        if pad_h % 2 == 0:
+            pad_up = pad_h // 2
+        else:
+            pad_up = pad_h // 2 + 1
+        if pad_w % 2 == 0:
+            pad_left = pad_w // 2
         else:
-            pad_up = pad // 2 + 1
-            pad_left = pad // 2 + 1
+            pad_left = pad_w // 2 + 1
     else:
-        pad_up = pad // 2
-        pad_left = pad // 2
+        pad_up = pad_h // 2
+        pad_left = pad_w // 2
 
-    pad_down = pad - pad_up
-    pad_right = pad - pad_left
+    pad_down = pad_h - pad_up
+    pad_right = pad_w - pad_left
 
     y_expected = np.pad(
         x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 23d7610dfdf434602f326e1117b072f312962295..4fa780548a544d92e02b28486ae1e325ff1f9a9b 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -52,7 +52,7 @@ from finn.util.basic import (
     alveo_part_map,
     alveo_default_platform,
 )
-from finn.util.fpgadataflow import pyverilate_stitched_ip
+from finn.util.pyverilator import pyverilate_stitched_ip
 from finn.util.test import load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
 from finn.transformation.infer_data_layouts import InferDataLayouts
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
new file mode 100644
index 0000000000000000000000000000000000000000..4756d4fe18ccd4934b4041c70bf2f3a1bb577ec7
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -0,0 +1,242 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.custom_op.general.multithreshold import multithreshold
+
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+
+
+def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels):
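+    # place each depthwise kernel on the channel "diagonal" of a dense
+    # (channels, channels, k_h, k_w) tensor, then flatten it into the
+    # (k_h * k_w * channels, channels) weight matrix of the lowered MatMul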
+    W_sparse = np.zeros((channels, channels, k_h, k_w))
+    for ch in range(channels):
+        W_sparse[ch][ch] = W_conv[ch][0]
+    W_conv = W_sparse.astype(np.float32)
+    W_matmul = W_conv.transpose(0, 2, 3, 1)
+    W_matmul = W_matmul.reshape(channels, channels * k_h * k_w)
+    W_matmul = W_matmul.T
+
+    return W_matmul
+
+
+def _calculate_dot_prod_range(dt_a, dt_b, length):
+    """Returns the (min, max) values a dot product between two (un)signed vectors
+    of types dt_a and dt_b with length elements can take."""
+    min_prod = 2 ** 30
+    max_prod = -(2 ** 30)
+    for a_val in [dt_a.min(), dt_a.max()]:
+        for b_val in [dt_b.min(), dt_b.max()]:
+            prod = a_val * b_val * length
+            if prod < min_prod:
+                min_prod = prod
+            if prod > max_prod:
+                max_prod = prod
+    return (min_prod, max_prod)
+
+
+def _make_single_vvau_modelwrapper(
+    W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None
+):
+    in_shape = [1, dim_h, dim_w, k_h * k_w * channels]  # [N, H, W, KH*KW*CH]
+    out_shape = [
+        1,
+        dim_h,
+        dim_w,
+        channels,
+    ]  # [N, H, W, OFM_CH] (OFM_CH=IFM_CH because depthwise convolution)
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, in_shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_shape)
+
+    if T is not None:
+        no_act = 0
+        node_inp_list = ["inp", "weights", "thresh"]
+        actval = odt.min()
+    else:
+        no_act = 1
+        node_inp_list = ["inp", "weights"]
+        actval = 0
+
+    VVAU_node = helper.make_node(
+        "Vector_Vector_Activate_Batch",
+        node_inp_list,
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        PE=pe,
+        Dim=[dim_h, dim_w],
+        Channels=channels,
+        Kernel=[k_h, k_w],
+        resType="lut",
+        ActVal=actval,
+        inputDataType=idt.name,
+        weightDataType=wdt.name,
+        outputDataType=odt.name,
+        noActivation=no_act,
+    )
+
+    graph = helper.make_graph(
+        nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="vvau-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+    model.set_tensor_datatype("weights", wdt)
+
+    model.set_initializer("weights", W)
+    model.set_tensor_shape("weights", (channels, 1, k_h, k_w))
+
+    if T is not None:
+        model.set_tensor_datatype("thresh", tdt)
+        model.set_initializer("thresh", T)
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT8])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType.INT4])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType.UINT4, None])
+# PE
+@pytest.mark.parametrize("pe", [1, "channels"])
+# Input image shape
+@pytest.mark.parametrize("dim_h", [10])
+@pytest.mark.parametrize("dim_w", [10, 1])
+# Kernel shape
+@pytest.mark.parametrize("k_h", [3])
+@pytest.mark.parametrize("k_w", [3, 1])
+# Number of input and output channels
+@pytest.mark.parametrize("channels", [3, 4])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_vvau(
+    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode
+):
+    if pe == "channels":
+        pe = channels
+
+    if dim_w == 1 and k_w != 1:
+        pytest.skip("1D image requires 1D kernel, skipping.")
+
+    if channels % pe != 0:
+        pytest.skip("Requirement Channels divisable by PE is violated.")
+
+    # Generate weights in expected shape for ONNX and HLS node
+    # weights shape: [channels, 1, k_h, k_w]
+    W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))
+    W_onnx = _infer_sparse_weight_tensor(
+        W, k_h, k_w, channels
+    )  # shape: [k_h * k_w * channels, channels]
+
+    # Generate inputs in expected format for ONNX and HLS node
+    x = gen_finn_dt_tensor(idt, (1, dim_h, dim_w, k_h * k_w * channels))
+    x_vvau = x.reshape(1, dim_h, dim_w, k_h * k_w, channels // pe, pe)
+    x_vvau = x_vvau.transpose(0, 1, 2, 4, 3, 5)
+    x_vvau = x_vvau.reshape(1, dim_h, dim_w, channels * k_h * k_w)
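+    # interleave the channel blocks per PE so the flattened input matches the
+    # folded input layout consumed by the Vector_Vector_Activate_Batch node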
+
+    if act is None:
+        T = None
+        tdt = None
+        odt = DataType.INT32
+    else:
+        odt = act
+        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
+        T = np.sort(T, axis=1)
+        tdt = DataType.INT32
+
+    model = _make_single_vvau_modelwrapper(
+        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt
+    )
+
+    if exec_mode == "cppsim":
+        model = model.transform(SetExecMode("cppsim"))
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode in test_fpgadataflow_vvau")
+
+    input_dict = prepare_inputs(x_vvau)
+
+    # Calculate output
+    y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
+    if T is not None:
+        # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format
+        y_expected = np.transpose(y_expected, (0, 3, 1, 2))
+        y_expected = multithreshold(y_expected, T)
+        y_expected = np.transpose(y_expected, (0, 2, 3, 1))
+        # signed offset
+        y_expected += act.min()
+
+    y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[
+        "outp"
+    ]
+
+    assert (y_produced == y_expected).all(), "cppsim failed"
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0