diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index 03630916ad49ec6eb538a3f19d74a63f64512446..8d401c40dc3efbfef5c387bdfc2f777a2cfb553b 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -29,6 +29,7 @@
 import math
 import numpy as np
 import os
+import warnings
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
@@ -85,6 +86,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                 "distributed",
                 {"auto", "block", "distributed", "ultra"},
             ),
+            "parallel_window": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -219,13 +221,31 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         dilation_h, dilation_w = dilation
         ram_style = self.get_nodeattr("ram_style")
 
-        if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
-            if self.get_nodeattr("depthwise") == 0:
-                if stride_h == 1 and stride_w == 1:
-                    if dilation_h == 1 and dilation_w == 1:
-                        return ram_style in ["auto", "distributed"]
-
-        return False
+        fully_unfolded = self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels")
+        non_dws = self.get_nodeattr("depthwise") == 0
+        no_stride = stride_h == 1 and stride_w == 1
+        no_dilation = dilation_h == 1 and dilation_w == 1
+        supported_ram_style = ram_style in ["auto", "distributed"]
+        if self.get_nodeattr("parallel_window") != 1:
+            return False
+        if (
+            fully_unfolded
+            and non_dws
+            and no_stride
+            and no_dilation
+            and supported_ram_style
+        ):
+            return True
+        warnings.warn(
+            "{}: Parallel window output variant is not supported for this "
+            "node, please inspect requirements in use_parallel_window_output "
+            "method of the custom_op".format(self.onnx_node.name)
+        )
+        return False
 
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
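The five named conditions above must all hold before the parallel window
variant is selected. A minimal standalone sketch of the same predicate
(hypothetical helper, not part of this patch):

def supports_parallel_window(simd, ifm_ch, depthwise, stride, dilation, ram_style):
    # mirrors use_parallel_window_output: all five conditions must hold
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    return (
        simd == ifm_ch  # input fully unfolded along channels
        and depthwise == 0  # non-depthwise convolutions only
        and stride_h == 1 and stride_w == 1  # unit stride
        and dilation_h == 1 and dilation_w == 1  # no dilation
        and ram_style in ("auto", "distributed")  # window buffer in distributed RAM
    )

assert supports_parallel_window(4, 4, 0, (1, 1), (1, 1), "auto")
assert not supports_parallel_window(2, 4, 0, (1, 1), (1, 1), "auto")  # SIMD < IFMChannels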
@@ -244,17 +264,18 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         dilation_h, dilation_w = dilation
 
-        # since mmv != 1 is not supported yet, we set mmv for now to 1
-        mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
         if self.use_parallel_window_output():
             exp_cycles = k_w + ofm_dim_w
+        elif dilation_h > 1 or dilation_w > 1:
+            # dilated variant: approximately one pass to read the input row,
+            # then k_w input pixels written out per output position
+            cycles_read_block = ifm_dim_w * ifm_ch / simd
+            cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
+            exp_cycles = cycles_read_block + cycles_write_block
         else:
-            cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
-            cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
-            max_cycles = max(cycles_write_block, cycles_read_block)
-            exp_cycles = (
-                ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
-            )
+            # low-buffer variant: short initial fill of the first k_w - 1
+            # pixels, then k_w input pixels written out per output position
+            cycles_read_block = ifm_ch / simd * (k_w - 1) - (k_w - 1)
+            cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
+            exp_cycles = cycles_read_block + cycles_write_block
 
         return int(exp_cycles)
 
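For a concrete feel of the new cycle model, here is the non-dilated
low-buffer branch evaluated for one illustrative configuration (the values
are hypothetical, chosen only for this example):

# k_w = 4 kernel, 16 channels at SIMD = 4, 10-pixel input row, stride 1 -> ofm_dim_w = 7
k_w, ifm_ch, simd, ofm_dim_w = 4, 16, 4, 7
cycles_read_block = ifm_ch / simd * (k_w - 1) - (k_w - 1)  # 4 * 3 - 3 = 9
cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd  # 7 * 4 * 4 = 112
print(int(cycles_read_block + cycles_write_block))  # 121 expected cycles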
@@ -451,7 +472,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
-        # For a 1d convolution with stride=[S,1] or [1,S], the finn-hlslib function
-        # of ConvInpGen must be created with [stride_y, stride_x] = [S, S].
-        # TODO: changes in finn-hlslib (slidingwindow.h)
-        stride_y = np.prod(stride)
+        # for a 1D convolution with stride=[S,1] or [1,S], the 1D finn-hlslib
+        # ConvInpGen variants only require stride_x = S
         stride_x = np.prod(stride)
 
         if dilation_x > 1:
@@ -461,33 +481,23 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             self.code_gen_dict["$DEFINES$"] = [
                 """
             #define ConvKernelDim1_x {}\n
-            #define ConvKernelDim1_y {}\n
             #define IFMChannels1 {}\n
             #define Input_precision1 {}\n
             #define IFMDim1_x {}\n
-            #define IFMDim1_y {}\n
             #define OFMDim1_x {}\n
-            #define OFMDim1_y {}\n
             #define SIMD1 {}\n
             #define Stride1_x {}\n
-            #define Stride1_y {}\n
             #define Dilation1_x {}\n
-            #define Dilation1_y {}\n
             #define numReps {}
             """.format(
                     k_x,
-                    k_y,
                     ifm_ch,
                     ifm_precision,
                     ifm_dim_x,
-                    ifm_dim_y,
                     ofm_dim_x,
-                    ofm_dim_y,
                     simd,
                     stride_x,
-                    stride_y,
                     dilation_x,
-                    dilation_y,
                     numReps,
                 )
             ]
@@ -496,29 +506,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             self.code_gen_dict["$DEFINES$"] = [
                 """
             #define ConvKernelDim1_x {}\n
-            #define ConvKernelDim1_y {}\n
             #define IFMChannels1 {}\n
             #define Input_precision1 {}\n
             #define IFMDim1_x {}\n
-            #define IFMDim1_y {}\n
             #define OFMDim1_x {}\n
-            #define OFMDim1_y {}\n
             #define SIMD1 {}\n
             #define Stride1_x {}\n
-            #define Stride1_y {}\n
             #define numReps {}
             """.format(
                     k_x,
-                    k_y,
                     ifm_ch,
                     ifm_precision,
                     ifm_dim_x,
-                    ifm_dim_y,
                     ofm_dim_x,
-                    ofm_dim_y,
                     simd,
                     stride_x,
-                    stride_y,
                     numReps,
                 )
             ]
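To illustrate, the trimmed non-dilated $DEFINES$ template renders to a short
block of C defines; a sketch with hypothetical values (k_x=4, ifm_ch=16,
8-bit input, ifm_dim_x=10, ofm_dim_x=7, simd=4, stride_x=1, numReps=1):

defines = (
    "#define ConvKernelDim1_x {}\n"
    "#define IFMChannels1 {}\n"
    "#define Input_precision1 {}\n"
    "#define IFMDim1_x {}\n"
    "#define OFMDim1_x {}\n"
    "#define SIMD1 {}\n"
    "#define Stride1_x {}\n"
    "#define numReps {}\n"
).format(4, 16, 8, 10, 7, 4, 1, 1)
print(defines)  # emitted into the generated .cpp ahead of the hlslib call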
@@ -571,34 +573,33 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                 )
             ]
         else:
-            hls_call = "ConvolutionInputGenerator_NonSquare"
             dilation_h, dilation_w = self.get_nodeattr("Dilation")
-            if dilation_h > 1 or dilation_w > 1:
-                hls_call += "_Dilated"
-                if self.get_nodeattr("depthwise") == 1:
-                    hls_call += "_dws"
-                self.code_gen_dict["$DOCOMPUTE$"] = [
-                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
-                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
-                    SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
-                    (in0, out, numReps, {});""".format(
-                        hls_call, hls_ram_style
-                    )
-                ]
-            elif self.get_nodeattr("depthwise") == 1:
-                hls_call += "_dws"
-                self.code_gen_dict["$DOCOMPUTE$"] = [
-                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
-                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
-                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
-                        hls_call, hls_ram_style
-                    )
-                ]
+            if self.get_nodeattr("depthwise") == 1:
+                if dilation_h > 1 or dilation_w > 1:
+                    hls_call = "ConvolutionInputGenerator_1D_dilated_dws"
+                    self.code_gen_dict["$DOCOMPUTE$"] = [
+                        """{}<ConvKernelDim1_x, IFMChannels1,
+                        Input_precision1, IFMDim1_x, OFMDim1_x,
+                        SIMD1, Stride1_x, Dilation1_x>
+                        (in0, out, numReps, {});""".format(
+                            hls_call, hls_ram_style
+                        )
+                    ]
+                else:
+                    hls_call = "ConvolutionInputGenerator_1D_dws_lowbuffer"
+                    self.code_gen_dict["$DOCOMPUTE$"] = [
+                        """{}<ConvKernelDim1_x, IFMChannels1,
+                        Input_precision1, IFMDim1_x, OFMDim1_x,
+                        SIMD1> (in0, out, numReps, {});""".format(
+                            hls_call, hls_ram_style
+                        )
+                    ]
             else:
+                hls_call = "ConvolutionInputGenerator_1D_lowbuffer"
                 self.code_gen_dict["$DOCOMPUTE$"] = [
-                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
-                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
-                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                    """{}<ConvKernelDim1_x, IFMChannels1,
+                    Input_precision1, IFMDim1_x, OFMDim1_x,
+                    SIMD1> (in0, out, numReps, {});""".format(
                         hls_call, hls_ram_style
                     )
                 ]
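The restructured $DOCOMPUTE$ branch maps each node configuration to exactly
one finn-hlslib 1D template. A sketch of that selection as a pure function
(hypothetical helper; the parallel window branch handled earlier in the
method is omitted):

def select_hls_call(depthwise, dilation):
    dilation_h, dilation_w = dilation
    dilated = dilation_h > 1 or dilation_w > 1
    if depthwise == 1:
        if dilated:
            return "ConvolutionInputGenerator_1D_dilated_dws"
        return "ConvolutionInputGenerator_1D_dws_lowbuffer"
    # non-depthwise with dilation > 1 is rejected upstream by InferConvInpGen
    return "ConvolutionInputGenerator_1D_lowbuffer"

assert select_hls_call(1, (2, 1)) == "ConvolutionInputGenerator_1D_dilated_dws"
assert select_hls_call(0, (1, 1)) == "ConvolutionInputGenerator_1D_lowbuffer"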
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index b2f50b1a23f85bf782c553057148173b6f94dde4..f7cb0a16b9d3893066eaeaae416ea0259ab6a915 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -197,7 +197,7 @@ class InferConvInpGen(Transformation):
                             depthwise=depthwise,
                             name="ConvolutionInputGenerator_" + n.name,
                         )
-                    else:  # non-square images and/or kernels
+                    else:  # 1D images and/or kernels
                         assert is_1d_convolution, (
                             "%s: ConvolutionInputGenerator1D works only for 1D convs"
                             % n.name
@@ -208,6 +208,11 @@ class InferConvInpGen(Transformation):
                                 with dilation value greater than 1"""
                                 % n.name
                             )
+                            assert depthwise == 1, (
+                                """%s: Dilation value > 1 is only supported for
+                                1D depthwise separable convolutions"""
+                                % n.name
+                            )
                         ConvInpGen_node = helper.make_node(
                             "ConvolutionInputGenerator1D",
                             [ConvInpGen_input],
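Together with the existing stride check, the new assert narrows which dilated
1D configurations InferConvInpGen accepts. A sketch of the combined rule
(hypothetical predicate, mirroring these asserts and the test skips below):

def dilated_1d_config_ok(stride, dilation, depthwise):
    strided = stride[0] > 1 or stride[1] > 1
    dilated = dilation[0] > 1 or dilation[1] > 1
    if dilated and strided:
        return False  # dilation > 1 combined with stride > 1 is unsupported
    if dilated and depthwise == 0:
        return False  # dilation > 1 requires a depthwise convolution
    return True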
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
index 8440ac1fe46a0d1ea4db3d76489dfc4ce68ff642..f2be24185203bdf13fbaa0e2c460e316df7f267e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
@@ -90,7 +90,7 @@ def make_single_im2col_modelwrapper(
 
 
 def make_single_slidingwindow_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, parallel_window, dw=0
 ):
     k_h, k_w = k
     ifm_dim_h, ifm_dim_w = ifm_dim
@@ -122,6 +122,7 @@ def make_single_slidingwindow_modelwrapper(
         inputDataType=idt.name,
         outputDataType=odt.name,
         depthwise=dw,
+        parallel_window=parallel_window,
     )
     graph = helper.make_graph(
         nodes=[SlidingWindow_node],
@@ -153,10 +154,10 @@ def prepare_inputs(input_tensor):
 # input channels
 @pytest.mark.parametrize("ifm_ch", [1, 4])
 # Stride
-@pytest.mark.parametrize("stride", [[1, 1], [2, 1]])
+# @pytest.mark.parametrize("stride", [[1, 1], [2, 1]])
+@pytest.mark.parametrize("stride", [[1, 1]])
 # Dilation
-# @pytest.mark.parametrize("dilation", [[1, 1], [2, 1]])
-@pytest.mark.parametrize("dilation", [[1, 1]])
+@pytest.mark.parametrize("dilation", [[1, 1], [2, 1]])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # input channel parallelism ("SIMD")
@@ -165,10 +166,22 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("dw", [0, 1])
 # Flip dimensions
 @pytest.mark.parametrize("flip", [False, True])
+# Use parallel window output variant
+@pytest.mark.parametrize("parallel_window", [False, True])
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_slidingwindow_1d(
-    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip
+    idt,
+    k,
+    ifm_dim,
+    ifm_ch,
+    stride,
+    dilation,
+    exec_mode,
+    simd,
+    dw,
+    flip,
+    parallel_window,
 ):
     if flip:
         k = k[::-1]
@@ -186,6 +199,11 @@ def test_fpgadataflow_slidingwindow_1d(
             """Dilation value greater than 1 and stride greater than 1
             currently not supported for 1D convolutions"""
         )
+    if (dilation_h > 1 or dilation_w > 1) and dw == 0:
+        pytest.skip(
+            """Dilation value greater than 1 currently not supported
+            for non-dws 1D convolutions"""
+        )
     if simd > ifm_ch:
         pytest.skip("SIMD cannot be larger than number of input channels")
 
@@ -203,6 +221,7 @@ def test_fpgadataflow_slidingwindow_1d(
         stride=stride,
         dilation=dilation,
         idt=idt,
+        parallel_window=parallel_window,
         dw=dw,
     )
 
@@ -213,7 +232,7 @@ def test_fpgadataflow_slidingwindow_1d(
     elif exec_mode == "rtlsim":
         model = model.transform(SetExecMode("rtlsim"))
         model = model.transform(GiveUniqueNodeNames())
-        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(PrepareIP("xcu250-figd2104-2L-e", 5))
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
     else:
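As a usage sketch, the new attribute can be toggled on an existing node
through the standard custom-op wrapper (the model file name is illustrative):

from finn.core.modelwrapper import ModelWrapper
from finn.custom_op.registry import getCustomOp

model = ModelWrapper("model_with_1d_swg.onnx")  # hypothetical model file
for node in model.graph.node:
    if node.op_type == "ConvolutionInputGenerator1D":
        inst = getCustomOp(node)
        inst.set_nodeattr("parallel_window", 1)
        # returns False (with a warning) if the requirements are not met
        print(node.name, inst.use_parallel_window_output())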