diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv index 2d255a35edf97e28053545b89512a4b1415b6f57..0aa309f890140143be75123840708b6811d47a48 100644 --- a/finn-rtllib/swg/swg_template_default.sv +++ b/finn-rtllib/swg/swg_template_default.sv @@ -36,28 +36,19 @@ module $TOP_MODULE_NAME$_controller #( logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS-1; logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS-1; - logic [INCR_BITWIDTH-1:0] tail_incr_reg = 'x; assign addr_incr = ADDR_INCREMENT_MAP[State]; - assign tail_incr = tail_incr_reg; // combinational logic for tail_incr generation - uwire tail_incr_inner_condition; - generate - if (IS_DEPTHWISE) - assign tail_incr_inner_condition = (Counter_loop_kh >= 0); - else - assign tail_incr_inner_condition = 0; - endgenerate - - always @ (tail_incr_inner_condition, Counter_loop_w, Counter_loop_h) begin + uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; + always_comb begin : blkTail if (tail_incr_inner_condition) - tail_incr_reg = 1; + tail_incr = 1; else if (Counter_loop_w >= 0) - tail_incr_reg = $TAIL_INCR_W$; + tail_incr = $TAIL_INCR_W$; else if (Counter_loop_h >= 0) - tail_incr_reg = $TAIL_INCR_H$; + tail_incr = $TAIL_INCR_H$; else - tail_incr_reg = $TAIL_INCR_LAST$; + tail_incr = $TAIL_INCR_LAST$; end // combinational next state logic @@ -132,13 +123,8 @@ module $TOP_MODULE_NAME$_cyclic_buffer_addressable #( $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH]; logic [WIDTH-1:0] Out = 'x; always_ff @(posedge clk) begin - if (!rst_n) begin - Out <= 'x; - end - else begin - if (read_enable) Out <= Ram[read_addr]; - if (write_enable) Ram[write_addr] <= data_in; - end + if (read_enable) Out <= Ram[read_addr]; + if (write_enable) Ram[write_addr] <= data_in; end assign data_out = Out; @@ -213,7 +199,7 @@ module $TOP_MODULE_NAME$_impl #( logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1; logic [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = 0; logic [$clog2(LAST_READ_ELEM+1)+1-1:0] First_elem_next_window = 0; - logic [$clog2(ELEM_PER_WINDOW) -1:0] K = 0; + logic [$clog2(ELEM_PER_WINDOW) -1:0] Position_in_window = 0; logic [$clog2(BUF_ELEM_TOTAL)+1 -1:0] Window_buffer_read_addr_reg = 0; logic [$clog2(BUF_ELEM_TOTAL)-1:0] Window_buffer_write_addr_reg = 0; @@ -255,7 +241,7 @@ module $TOP_MODULE_NAME$_impl #( Newest_buffered_elem <= -1; Current_elem <= 0; First_elem_next_window <= 0; - K <= 0; + Position_in_window <= 0; Window_buffer_read_addr_reg <= 0; Window_buffer_write_addr_reg <= 0; Fetching_done <= 0; @@ -295,10 +281,10 @@ module $TOP_MODULE_NAME$_impl #( Window_buffer_read_addr_reg <= ra + ra_correct; //keep track where we are within a window - K <= (K != ELEM_PER_WINDOW - 1)? K+1 : 0; + Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? 
Position_in_window+1 : 0; //update first element of next window to allow buffer overwrite up until that point - if (K == 0) + if (Position_in_window == 0) First_elem_next_window <= First_elem_next_window + tail_incr; //check if this is the last write cycle (Writing_done will be true afterwards) diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v index 1b470817d61af4140141d8478c4ae0d538678ad7..4411348beb7c338151955810021803f140c27b25 100644 --- a/finn-rtllib/swg/swg_template_wrapper.v +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -1,14 +1,16 @@ `timescale 1 ns / 1 ps module $TOP_MODULE_NAME$ ( - ap_clk, - ap_rst_n, - in0_V_TDATA, - in0_V_TVALID, - in0_V_TREADY, - out_V_TDATA, - out_V_TVALID, - out_V_TREADY +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_rst_n, +input [BUF_IN_WIDTH-1:0] in0_V_TDATA, +input in0_V_TVALID, +output in0_V_TREADY, +output [BUF_OUT_WIDTH-1:0] out_V_TDATA, +output out_V_TVALID, +input out_V_TREADY ); // top-level parameters (set via code-generation) @@ -21,17 +23,6 @@ parameter MMV_OUT = $MMV_OUT$; parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) -input ap_clk; -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) -input ap_rst_n; -input [BUF_IN_WIDTH-1:0] in0_V_TDATA; -input in0_V_TVALID; -output in0_V_TREADY; -output [BUF_OUT_WIDTH-1:0] out_V_TDATA; -output out_V_TVALID; -input out_V_TREADY; - $TOP_MODULE_NAME$_impl #( .BIT_WIDTH(BIT_WIDTH), diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py index 98351942b9b4abd1568c9d465710a181e9cab86c..366dd396d199e02846046b00d4b829d0871060dc 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2022, Xilinx # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -72,7 +72,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): "OFMDim": ("ints", True, []), # [H, W] = [Y, X] "SIMD": ("i", True, 0), "M": ("i", False, 1), - "parallel_window": ("i", False, 0, {0, 1}), + "parallel_window": ("i", False, 0, {0}), "Stride": ("ints", True, []), # [H, W] = [Y, X] "Dilation": ("ints", True, []), # [H, W] = [Y, X] # FINN DataTypes for inputs, weights, outputs @@ -212,6 +212,49 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + def get_buffer_depth(self): + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + impl_style = self.select_impl_style() + if impl_style == "default": + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + # add additional buffer space in case of stride > 1 + # this minimizes cycle count as it allows an earlier pre-load of inputs + buffer_depth = ( + buffer_min_size + + max( + 0, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + + max( + 0, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + ) + else: + buffer_depth = 0 + raise Exception("Requested impl. style not implemented") + return buffer_depth + def get_exp_cycles(self): simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -268,17 +311,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def bram_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") - impl_style = self.select_impl_style() - # call codegen preparation to populate self.buffer_depth - if impl_style == "default": - self.prepare_codegen_default() - else: - raise Exception("Requested impl. style not implemented") # NOTE: Actual BRAM usage might be lower in some cases. # This does not account for the exact Vivado behavior yet. buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.buffer_depth + buffer_depth = self.get_buffer_depth() if ram_style == "block" or ram_style == "auto": if buffer_depth <= 512: ram_width = 36 @@ -321,15 +358,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def lut_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") - impl_style = self.select_impl_style() - # call codegen preparation to populate self.buffer_depth - if impl_style == "default": - self.prepare_codegen_default() - else: - raise Exception("Requested impl. 
style not implemented") - buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.buffer_depth + buffer_depth = self.get_buffer_depth() if ram_style == "distributed": ram_luts = int(buffer_width * math.ceil(buffer_depth / 38)) else: @@ -339,15 +369,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def uram_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") - impl_style = self.select_impl_style() - # call codegen preparation to populate self.buffer_depth - if impl_style == "default": - self.prepare_codegen_default() - else: - raise Exception("Requested impl. style not implemented") - buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.buffer_depth + buffer_depth = self.get_buffer_depth() if ram_style == "ultra": ram_depth = 4096 @@ -460,21 +483,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 ) * channel_factor - # add additional buffer space in case of stride > 1 - # this minimizes cycle count as it allows an earlier pre-load of input elements - buffer_actual_size = ( - buffer_min_size - + max( - 0, - ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor, - ) - + max( - 0, - ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) - * channel_factor, - ) - ) - self.buffer_depth = buffer_actual_size # for resource estimation + buffer_actual_size = self.get_buffer_depth() code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation @@ -643,9 +652,6 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): and stride_w <= ifm_dim_w ), "Illegal conv configuration: kernel or stride > FM dimension" - if k_h == 1 and k_w == 1: - assert simd == ifm_ch, "1x1 Kernel only supported in parallel mode (SIMD=C)" - # init folding config if self.get_nodeattr("parallel_window"): # mmv_in = M * 1 diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 850bcf6616aebc251ebd12f9d766b89c43ee4208..540c217cbca8c47243a080ac493f19bd1c72abc8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -132,7 +132,26 @@ class InferConvInpGen(Transformation): ) graph.node.insert(node_ind, padding_node) - if self.use_rtl_variant: + is_kernel_pointwise = k_h == 1 and k_w == 1 + is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w + is_square_kernel = k_h == k_w + is_equal_stride = stride_h == stride_w + is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( + k_h > 1 and k_w == 1 and ifm_dim_w == 1 + ) + + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + if is_kernel_pointwise: + is_rtl_variant_compatible = False + if self.use_rtl_variant: + warnings.warn( + """%s : RTL ConvInpGen requested for unsupported + configuration. 
Falling back to HLS implementation.""" + % n.name + ) + + if self.use_rtl_variant and is_rtl_variant_compatible: ConvInpGen_node = helper.make_node( "ConvolutionInputGenerator_rtl", [ConvInpGen_input], @@ -151,19 +170,11 @@ class InferConvInpGen(Transformation): inputDataType=dt.name, outputDataType=dt.name, depthwise=depthwise, - name="ConvolutionInputGenerator_rtl" + n.name, + name="ConvolutionInputGenerator_rtl_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: # Ensure that only supported HLS nodes are inserted - is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w - is_square_kernel = k_h == k_w - is_kernel_pointwise = k_h == 1 and k_w == 1 - is_equal_stride = stride_h == stride_w - is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( - k_h > 1 and k_w == 1 and ifm_dim_w == 1 - ) - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: assert is_square_image, ( """%s : DownSampler currently only supports square diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 5c94272bad52fd6265a9bb0054fae87a3d77b93b..e24e24f1f8ebb2873c81617884cd333311d8aea9 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -172,11 +172,7 @@ class SetFolding(Transformation): "Expected SWU on DW op input, found " + swu_node.op_type ) elif op_type in simd_ops: - if op_type in [ - "ConvolutionInputGenerator", - "ConvolutionInputGenerator1D", - "ConvolutionInputGenerator_rtl", - ]: + if op_type.startswith("ConvolutionInputGenerator"): depthwise = node_inst.get_nodeattr("depthwise") if depthwise == 0: max_simd = node_inst.get_nodeattr("IFMChannels") diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index 56438ac6b6c5ac835ca35d9e66073042e467224f..8c9f110c315089ec03354863bf2213963197217a 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -164,14 +164,10 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) - if use_rtl_swg: - downsampler_op_type = "ConvolutionInputGenerator_rtl" - else: - downsampler_op_type = "DownSampler" if kernel_size == 1 and stride > 1 and pad == 0: - assert new_model.graph.node[1].op_type == downsampler_op_type + assert new_model.graph.node[1].op_type == "DownSampler" if exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type(downsampler_op_type)[0] + node = new_model.get_nodes_by_op_type("DownSampler")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py index eeeb09329448f546f4a668fde3d32ffaa36f5aaf..5da1fa6eb1f63251769b9e88e06087cf51e863a1 100755 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -142,7 +142,7 @@ def prepare_inputs(input_tensor): # kernel size @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [[24, 24], [13, 13], [1, 14]]) +@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]]) # input channels 
 @pytest.mark.parametrize("ifm_ch", [6])
 # Stride
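
For reference, the buffer sizing that the new get_buffer_depth() method computes for the "default" implementation style can be reproduced standalone. The sketch below mirrors the formula from the patch; the function name, default mmv values, and the example parameters are illustrative only and are not part of the patch itself.

# Standalone sketch of the "default" impl-style buffer sizing used by
# ConvolutionInputGenerator_rtl.get_buffer_depth(). Function name and the
# example call below are illustrative, not part of the FINN API.
def swg_default_buffer_depth(ifm_ch, k, ifm_dim, stride, dilation, simd,
                             mmv_in=1, mmv_out=1):
    k_h, k_w = k
    h, w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    channel_factor = ifm_ch // simd

    # minimal buffer length: one complete (dilated) window plus one element
    buffer_min_size = (
        (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
    ) * channel_factor

    # extra space for stride > 1, allowing an earlier pre-load of inputs
    extra_w = max(
        0, ((stride_w - 1) - (mmv_out * k_h * k_w // mmv_in)) * channel_factor
    )
    extra_h = max(
        0, ((stride_h - 1) * w - (mmv_out * k_h * k_w // mmv_in)) * channel_factor
    )
    return buffer_min_size + extra_w + extra_h

# Example: 3x3 kernel, 13x13 input, stride 1, dilation 1, 6 channels, SIMD=6
# -> ((3-1)*1*13 + (3-1)*1 + 1) * 1 = 29 buffer elements
print(swg_default_buffer_depth(6, (3, 3), (13, 13), (1, 1), (1, 1), 6))  # 29

Because bram_estimation(), lut_estimation(), and uram_estimation() now call get_buffer_depth() directly, this single formula is the one source of truth for both code generation and resource estimation.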