diff --git a/docs/finn/img/rtl_swg_impl_styles.png b/docs/finn/img/rtl_swg_impl_styles.png
new file mode 100644
index 0000000000000000000000000000000000000000..265ff9b915e79f8e93ca4f987bb49e57f8a2bd3e
Binary files /dev/null and b/docs/finn/img/rtl_swg_impl_styles.png differ
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index add70d649c773061c5b9e1d91dcaa852dcc4cbac..d0c4cd20650a7cb1ef63f68ff559bebbba93ae05 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -205,3 +205,84 @@ Disadvantages:
 How to set *mem_mode*
 ---------------------
 When the nodes in the network are converted to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the conversion to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is passed as argument. Note that if no argument is passed, the default is *const*.
+
+RTL ConvolutionInputGenerator
+=============================
+
+FINN implements convolution operations by pairing a ConvolutionInputGenerator (or "sliding window generator (SWG)") with an MVAU or VVAU (for depthwise convolution).
+This RTL version is an alternative to the original `HLS implementation <https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h>`_ and aims to improve on it in the following ways:
+
+* Support a wider range of hyperparameters without the fragmentation into 16+ separate HLS functions
+
+* Support additional degrees of parallelism (i.e., across the output window or multiple input samples) that are difficult to implement in HLS
+
+* Support additional features, such as dynamic feature map sizing
+
+* Improve resource efficiency
+
+
+The component is implemented by generating (System-)Verilog code for each individual instance, realized via the template + replacement dictionary mechanism found in other FINN components.
+Despite the HDL implementation, the component is managed by its own HLSCustomOp (!) named "ConvolutionInputGenerator_rtl". Naturally, HLS simulation & synthesis are not supported.
+
+The RTL SWG is currently disabled by default and can be enabled either in the corresponding HLS conversion transformation (:py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers.InferConvInpGen`) with `use_rtl_variant=True` or in the build configuration (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.force_rtl_conv_inp_gen` set to True).
+
+Implementation styles
+---------------------
+Depending on the amount of parallelism requested, one of two implementation styles is selected. The following table defines folding parameters (marked in bold text) and supported configurations.
+
+.. list-table:: Parallelism configurations
+
+   * - **SIMD**
+     - **parallel_window**
+     - **M**
+     - MMV_in
+     - MMV_out
+     - Style
+     - Notes
+   * - < C
+     - 0
+     - 1
+     - 1
+     - 1
+     - default
+     - depthwise-aware
+   * - C
+     - 0
+     - 1
+     - 1
+     - 1
+     - default
+     - depthwise-agnostic
+   * - C
+     - 1
+     - 1
+     - 1
+     - K
+     - parallel
+     - depthwise-agnostic
+   * - C
+     - 1
+     - M
+     - M
+     - M*K
+     - parallel
+     - Currently unsupported
+
+(With C = #Channels, MMV_in = input samples (or "pixels") per cycle, MMV_out = output samples (or "pixels") per cycle, K = kernel_width * kernel_height.)
+
+The following diagram shows the operating principle of both styles; the "parallel" variant is pictured for a 2x2 kernel without dilation.
+
+.. image:: img/rtl_swg_impl_styles.png
+   :align: center
+
+The main difference lies in the buffer structure. If the output width is equal to the input width ("default mode"), an addressable circular buffer is used, which can be implemented either in LUTRAM, BRAM, or URAM resources. If parallel access to multiple window elements is required ("parallel mode"), the SWG generates a fixed structure of registers and line buffers to avoid memory port limitations and exploding multiplexing logic, while still featuring LUT-saving BRAM/URAM implementation for the line buffers.
+
+The "default" style also supports a dynamic mode, which provides an interface to change feature map dimensions, stride, or dilation at run-time. See `this pull request <https://github.com/Xilinx/finn/pull/688>`_ description for more information.
+
+Folding
+-------
+The RTL SWG is supported by the basic automatic folding algorithm in FINN (:py:mod:`finn.transformation.fpgadataflow.set_folding.SetFolding`). Consider the following implications:
+
+**MVAU:** Although it is recommended to unfold SIMD first, SIMD and PE can be set independently. Full (and balanced) parallelism is achieved by using the SWG in parallel window mode and setting MVAU SIMD and PE to their maximum values (SIMD = MW = C_in * K, PE = MH = C_out).
+
+**VVAU:** While the VVAU HLS component supports SIMD unfolding independently from PE, the RTL SWG requires full unfolding across the channel dimension (SIMD of the SWG = PE of the VVAU) before enabling window-parallelism. Unlike the MVAU, the VVAU can't accept datawidth-converted input from a fully-parallel SWG in this case due to the depthwise data layout. As a result, the VVAU should be unfolded by PE first (up to PE = C), followed by SIMD (up to SIMD = K).
diff --git a/finn-rtllib/swg/swg_common.sv b/finn-rtllib/swg/swg_common.sv
new file mode 100644
index 0000000000000000000000000000000000000000..ff6778973c4d5d5663bc0c4f7043fca76ebdbf26
--- /dev/null
+++ b/finn-rtllib/swg/swg_common.sv
@@ -0,0 +1,255 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+// loop controller used for both the "default" and "parallel" implementation styles
+module swg_controller #(
+    int unsigned  LOOP_H_ITERATIONS,
+    int unsigned  LOOP_W_ITERATIONS,
+    int unsigned  LOOP_KH_ITERATIONS,
+    int unsigned  LOOP_KW_ITERATIONS,
+    int unsigned  LOOP_SIMD_ITERATIONS,
+
+    int unsigned  INCR_BITWIDTH,
+
+    bit IS_DEPTHWISE,
+
+    int HEAD_INCR_SIMD,
+    int HEAD_INCR_KW,
+    int HEAD_INCR_KH,
+    int HEAD_INCR_W,
+    int HEAD_INCR_H,
+    int TAIL_INCR_W,
+    int TAIL_INCR_H,
+    int TAIL_INCR_LAST,
+
+    parameter INNERMOST_STATE
+)(
+    input   logic  clk,
+    input   logic  rst_n,
+
+    input   logic  advance,
+    output  logic [INCR_BITWIDTH-1:0]  addr_incr,
+    output  logic [INCR_BITWIDTH-1:0]  tail_incr
+);
+
+    // state and counters
+    typedef enum logic [2:0] {
+        STATE_START,
+        STATE_LOOP_SIMD,
+        STATE_LOOP_KW,
+        STATE_LOOP_KH,
+        STATE_LOOP_W,
+        STATE_LOOP_H
+    }  state_e;
+    state_e  State = INNERMOST_STATE;
+    state_e  state_next;
+
+    logic signed [$clog2(LOOP_H_ITERATIONS   +2)+1-1:0]  Counter_loop_h    = LOOP_H_ITERATIONS;
+    logic signed [$clog2(LOOP_W_ITERATIONS   +2)+1-1:0]  Counter_loop_w    = LOOP_W_ITERATIONS;
+    logic signed [$clog2(LOOP_KH_ITERATIONS  +2)+1-1:0]  Counter_loop_kh   = LOOP_KH_ITERATIONS;
+    logic signed [$clog2(LOOP_KW_ITERATIONS  +2)+1-1:0]  Counter_loop_kw   = LOOP_KW_ITERATIONS;
+    logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0]  Counter_loop_simd = LOOP_SIMD_ITERATIONS;
+
+    // combinational logic for addr_incr generation
+    always_comb begin : blkHead
+        unique case (State)
+            STATE_START     : addr_incr = 0;
+            STATE_LOOP_SIMD : addr_incr = HEAD_INCR_SIMD;
+            STATE_LOOP_KW   : addr_incr = HEAD_INCR_KW;
+            STATE_LOOP_KH   : addr_incr = HEAD_INCR_KH;
+            STATE_LOOP_W    : addr_incr = HEAD_INCR_W;
+            STATE_LOOP_H    : addr_incr = HEAD_INCR_H;
+        endcase
+    end
+
+    // combinational logic for tail_incr generation
+    uwire  tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
+    assign tail_incr =
+        tail_incr_inner_condition? 1 :
+        Counter_loop_w >= 0?       TAIL_INCR_W :
+        Counter_loop_h >= 0?       TAIL_INCR_H :
+        /* else */                 TAIL_INCR_LAST;
+
+    // combinational next state logic
+    always_comb begin : blkState
+        state_next = State;
+        if(State != INNERMOST_STATE)  state_next = INNERMOST_STATE;
+        else begin
+            if(Counter_loop_simd < 0) begin
+                state_next =
+                    (Counter_loop_kw >= 0)? STATE_LOOP_KW :
+                    (Counter_loop_kh >= 0)? STATE_LOOP_KH :
+                    (Counter_loop_w  >= 0)? STATE_LOOP_W :
+                    (Counter_loop_h  >= 0)? STATE_LOOP_H :
+                    /* else */              STATE_START;
+            end
+        end
+    end : blkState
+
+    // sequential logic
+    always_ff @ (posedge clk) begin
+        if(!rst_n) begin
+            State <= INNERMOST_STATE;
+            Counter_loop_h    <= LOOP_H_ITERATIONS;
+            Counter_loop_w    <= LOOP_W_ITERATIONS;
+            Counter_loop_kh   <= LOOP_KH_ITERATIONS;
+            Counter_loop_kw   <= LOOP_KW_ITERATIONS;
+            Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+        end
+        else if(advance) begin
+            State <= state_next;
+            if (State == INNERMOST_STATE) begin
+                if(Counter_loop_simd >= 0)  Counter_loop_simd <= Counter_loop_simd-1;
+                else begin
+                    Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+                    if(Counter_loop_kw >= 0)  Counter_loop_kw <= Counter_loop_kw-1;
+                    else begin
+                        Counter_loop_kw <= LOOP_KW_ITERATIONS;
+                        if(Counter_loop_kh >= 0)  Counter_loop_kh <= Counter_loop_kh-1;
+                        else begin
+                            Counter_loop_kh <= LOOP_KH_ITERATIONS;
+                            if(Counter_loop_w >= 0)  Counter_loop_w <= Counter_loop_w-1;
+                            else begin
+                                Counter_loop_w <= LOOP_W_ITERATIONS;
+                                if(Counter_loop_h >= 0)  Counter_loop_h <= Counter_loop_h-1;
+                                else  Counter_loop_h <= LOOP_H_ITERATIONS;
+                            end
+                        end
+                    end
+                end
+            end
+        end
+    end
+
+endmodule :  swg_controller
+
+// buffer used in "default" implementation style
+module swg_cyclic_buffer_addressable #(
+    int unsigned  WIDTH,
+    int unsigned  DEPTH,
+    parameter RAM_STYLE = "auto"
+)(
+    input   logic  clk,
+
+    input   logic  write_enable,
+    input   logic [$clog2(DEPTH)-1:0] write_addr,
+    input   logic [WIDTH-1:0]  data_in,
+
+    input   logic  read_enable,
+    input   logic [$clog2(DEPTH)-1:0]  read_addr, // absolute (!) read address of cyclic buffer
+    output  logic [WIDTH-1:0]  data_out
+);
+
+    (*ram_style=RAM_STYLE*) logic [WIDTH-1:0] Ram[DEPTH];
+    logic [WIDTH-1:0]  Out = 'x;
+    always_ff @(posedge clk) begin
+        if (read_enable)  Out <= Ram[read_addr];
+        if (write_enable) Ram[write_addr] <= data_in;
+    end
+    assign  data_out = Out;
+
+endmodule : swg_cyclic_buffer_addressable
+
+// buffer used in "parallel" implementation style
+module swg_reg_buffer
+#(
+    int unsigned WIDTH = 1,
+    int unsigned DEPTH = 1
+)
+(
+    input logic clk,
+    input logic shift_enable,
+    input logic [WIDTH-1:0] shift_in,
+    output logic [WIDTH-1:0] shift_out,
+    output logic [WIDTH*DEPTH-1:0] data_out
+);
+
+logic [WIDTH-1:0] Data [DEPTH-1:0];
+
+assign shift_out = Data[DEPTH-1];
+
+for (genvar e=0; e<DEPTH; e++)
+    assign data_out[e*WIDTH +: WIDTH] = Data[e];
+
+always @ (posedge clk) begin
+    if (shift_enable) begin
+        for (int i=DEPTH-1; i>0; i--)
+            Data[i] <= Data[i-1];
+        Data[0] <= shift_in;
+    end
+end
+endmodule : swg_reg_buffer
+
+// buffer used in "parallel" implementation style
+module swg_ram_buffer
+#(
+    int unsigned WIDTH,
+    int unsigned DEPTH,
+    parameter RAM_STYLE = "auto"
+)
+(
+    input logic clk,
+    input logic rst_n,
+    input logic shift_enable,
+    input logic [WIDTH-1:0] shift_in,
+    output logic [WIDTH-1:0] shift_out
+);
+
+logic [WIDTH-1:0] Out_reg;
+assign shift_out = Out_reg;
+
+logic [$clog2(DEPTH)-1:0] Addr_w = 0;
+logic [$clog2(DEPTH)-1:0] Addr_r = 0;
+
+(*ram_style=RAM_STYLE*) logic [WIDTH-1:0] Ram [DEPTH-1:0];
+
+always_ff @(posedge clk) begin
+    if (rst_n == 1'b0) begin
+        Addr_w <= 0;
+        Addr_r <= 1;
+    end else begin
+        if (shift_enable) begin
+            Ram[Addr_w] <= shift_in;
+            Out_reg <= Ram[Addr_r];
+
+            if (Addr_w == DEPTH-1)
+                Addr_w <= 0;
+            else
+                Addr_w <= Addr_w + 1;
+
+            if (Addr_r == DEPTH-1)
+                Addr_r <= 0;
+            else
+                Addr_r <= Addr_r + 1;
+        end
+    end
+end
+endmodule : swg_ram_buffer
diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv
index 06e65e911100dd7d3d8879b014a6d59713eb9bbd..4970762172b5bcc1c418c5bbb60bdfee52568dd8 100644
--- a/finn-rtllib/swg/swg_template_default.sv
+++ b/finn-rtllib/swg/swg_template_default.sv
@@ -28,141 +28,6 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
-module $TOP_MODULE_NAME$_controller #(
-    int unsigned  LOOP_H_ITERATIONS    = $LOOP_H_ITERATIONS$,
-    int unsigned  LOOP_W_ITERATIONS    = $LOOP_W_ITERATIONS$,
-    int unsigned  LOOP_KH_ITERATIONS   = $LOOP_KH_ITERATIONS$,
-    int unsigned  LOOP_KW_ITERATIONS   = $LOOP_KW_ITERATIONS$,
-    int unsigned  LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$,
-
-    int unsigned  INCR_BITWIDTH = $INCR_BITWIDTH$,
-
-    bit IS_DEPTHWISE = $IS_DEPTHWISE$
-)(
-    input   logic  clk,
-    input   logic  rst_n,
-
-    input   logic  advance,
-    output  logic [INCR_BITWIDTH-1:0]  addr_incr,
-    output  logic [INCR_BITWIDTH-1:0]  tail_incr
-);
-
-    // state and counters
-    typedef enum logic [2:0] {
-        STATE_START,
-        STATE_LOOP_SIMD,
-        STATE_LOOP_KW,
-        STATE_LOOP_KH,
-        STATE_LOOP_W,
-        STATE_LOOP_H
-    }  state_e;
-    state_e  State = $INNERMOST_STATE$;
-    state_e  state_next;
-
-    logic signed [$clog2(LOOP_H_ITERATIONS   +2)+1-1:0]  Counter_loop_h    = LOOP_H_ITERATIONS;
-    logic signed [$clog2(LOOP_W_ITERATIONS   +2)+1-1:0]  Counter_loop_w    = LOOP_W_ITERATIONS;
-    logic signed [$clog2(LOOP_KH_ITERATIONS  +2)+1-1:0]  Counter_loop_kh   = LOOP_KH_ITERATIONS;
-    logic signed [$clog2(LOOP_KW_ITERATIONS  +2)+1-1:0]  Counter_loop_kw   = LOOP_KW_ITERATIONS;
-    logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0]  Counter_loop_simd = LOOP_SIMD_ITERATIONS;
-
-    // combinational logic for addr_incr generation
-    always_comb begin : blkHead
-        unique case (State)
-            0 : addr_incr = 0;
-            1 : addr_incr = $HEAD_INCR_SIMD$;
-            2 : addr_incr = $HEAD_INCR_KW$;
-            3 : addr_incr = $HEAD_INCR_KH$;
-            4 : addr_incr = $HEAD_INCR_W$;
-            5 : addr_incr = $HEAD_INCR_H$;
-        endcase
-    end
-
-    // combinational logic for tail_incr generation
-    uwire  tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
-    assign tail_incr =
-        tail_incr_inner_condition? 1 :
-        Counter_loop_w >= 0?       $TAIL_INCR_W$ :
-        Counter_loop_h >= 0?       $TAIL_INCR_H$ :
-        /* else */                 $TAIL_INCR_LAST$;
-
-    // combinational next state logic
-    always_comb begin : blkState
-        state_next = State;
-        if(State != $INNERMOST_STATE$)  state_next = $INNERMOST_STATE$;
-        else begin
-            if(Counter_loop_simd < 0) begin
-                state_next =
-                    (Counter_loop_kw >= 0)? STATE_LOOP_KW :
-                    (Counter_loop_kh >= 0)? STATE_LOOP_KH :
-                    (Counter_loop_w  >= 0)? STATE_LOOP_W :
-                    (Counter_loop_h  >= 0)? STATE_LOOP_H :
-                    /* else */              STATE_START;
-            end
-        end
-    end : blkState
-
-    // sequential logic
-    always_ff @ (posedge clk) begin
-        if(!rst_n) begin
-            State <= $INNERMOST_STATE$;
-            Counter_loop_h    <= LOOP_H_ITERATIONS;
-            Counter_loop_w    <= LOOP_W_ITERATIONS;
-            Counter_loop_kh   <= LOOP_KH_ITERATIONS;
-            Counter_loop_kw   <= LOOP_KW_ITERATIONS;
-            Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
-        end
-        else if(advance) begin
-            State <= state_next;
-            if (State == $INNERMOST_STATE$) begin
-                if(Counter_loop_simd >= 0)  Counter_loop_simd <= Counter_loop_simd-1;
-                else begin
-                    Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
-                    if(Counter_loop_kw >= 0)  Counter_loop_kw <= Counter_loop_kw-1;
-                    else begin
-                        Counter_loop_kw <= LOOP_KW_ITERATIONS;
-                        if(Counter_loop_kh >= 0)  Counter_loop_kh <= Counter_loop_kh-1;
-                        else begin
-                            Counter_loop_kh <= LOOP_KH_ITERATIONS;
-                            if(Counter_loop_w >= 0)  Counter_loop_w <= Counter_loop_w-1;
-                            else begin
-                                Counter_loop_w <= LOOP_W_ITERATIONS;
-                                if(Counter_loop_h >= 0)  Counter_loop_h <= Counter_loop_h-1;
-                                else  Counter_loop_h <= LOOP_H_ITERATIONS;
-                            end
-                        end
-                    end
-                end
-            end
-        end
-    end
-
-endmodule :  $TOP_MODULE_NAME$_controller
-
-module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
-    int unsigned  WIDTH,
-    int unsigned  DEPTH
-)(
-    input   logic  clk,
-
-    input   logic  write_enable,
-    input   logic [$clog2(DEPTH)-1:0] write_addr,
-    input   logic [WIDTH-1:0]  data_in,
-
-    input   logic  read_enable,
-    input   logic [$clog2(DEPTH)-1:0]  read_addr, // absolute (!) read address of cyclic buffer
-    output  logic [WIDTH-1:0]  data_out
-);
-
-    $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
-    logic [WIDTH-1:0]  Out = 'x;
-    always_ff @(posedge clk) begin
-        if (read_enable)  Out <= Ram[read_addr];
-        if (write_enable) Ram[write_addr] <= data_in;
-    end
-    assign  data_out = Out;
-
-endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
-
 module $TOP_MODULE_NAME$_impl #(
     int  BIT_WIDTH,
     int  SIMD,
@@ -197,9 +62,10 @@ module $TOP_MODULE_NAME$_impl #(
     uwire  window_buffer_read_enable;
     uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_write_addr;
     uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_read_addr;
-    $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+    swg_cyclic_buffer_addressable #(
         .WIDTH(BUF_IN_WIDTH),
-        .DEPTH(BUF_ELEM_TOTAL)
+        .DEPTH(BUF_ELEM_TOTAL),
+        .RAM_STYLE($RAM_STYLE$)
     ) window_buffer_inst (
         .clk(ap_clk),
 
@@ -216,7 +82,25 @@ module $TOP_MODULE_NAME$_impl #(
     uwire  advance_controller;
     uwire signed [INCR_BITWIDTH-1:0]  addr_incr;
     uwire        [INCR_BITWIDTH-1:0]  tail_incr;
-    $TOP_MODULE_NAME$_controller controller_inst (
+    swg_controller #(
+        .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$),
+        .LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$),
+        .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$),
+        .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$),
+        .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$),
+        .HEAD_INCR_SIMD($HEAD_INCR_SIMD$),
+        .HEAD_INCR_KW($HEAD_INCR_KW$),
+        .HEAD_INCR_KH($HEAD_INCR_KH$),
+        .HEAD_INCR_W($HEAD_INCR_W$),
+        .HEAD_INCR_H($HEAD_INCR_H$),
+        .TAIL_INCR_W($TAIL_INCR_W$),
+        .TAIL_INCR_H($TAIL_INCR_H$),
+        .TAIL_INCR_LAST($TAIL_INCR_LAST$),
+        .INCR_BITWIDTH($INCR_BITWIDTH$),
+        .IS_DEPTHWISE($IS_DEPTHWISE$),
+        .INNERMOST_STATE($INNERMOST_STATE$)
+    )
+    controller_inst (
         .clk(ap_clk),
         .rst_n(ap_rst_n),
         .advance(advance_controller),
diff --git a/finn-rtllib/swg/swg_template_default_dynamic.sv b/finn-rtllib/swg/swg_template_default_dynamic.sv
index eb53978b580a4753bbea6c8478f35912deb812b4..412f8689ba33ec248ba7ebd50ca201204b001b1a 100644
--- a/finn-rtllib/swg/swg_template_default_dynamic.sv
+++ b/finn-rtllib/swg/swg_template_default_dynamic.sv
@@ -152,31 +152,6 @@ module $TOP_MODULE_NAME$_controller #(
 
 endmodule :  $TOP_MODULE_NAME$_controller
 
-module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
-    int unsigned  WIDTH,
-    int unsigned  DEPTH
-)(
-    input   logic  clk,
-
-    input   logic  write_enable,
-    input   logic [$clog2(DEPTH)-1:0] write_addr,
-    input   logic [WIDTH-1:0]  data_in,
-
-    input   logic  read_enable,
-    input   logic [$clog2(DEPTH)-1:0]  read_addr, // absolute (!) read address of cyclic buffer
-    output  logic [WIDTH-1:0]  data_out
-);
-
-    $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
-    logic [WIDTH-1:0]  Out = 'x;
-    always_ff @(posedge clk) begin
-        if (read_enable)  Out <= Ram[read_addr];
-        if (write_enable) Ram[write_addr] <= data_in;
-    end
-    assign  data_out = Out;
-
-endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
-
 module $TOP_MODULE_NAME$_impl #(
     int  BIT_WIDTH,
     int  SIMD,
@@ -242,9 +217,10 @@ module $TOP_MODULE_NAME$_impl #(
     uwire  window_buffer_read_enable;
     uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_write_addr;
     uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_read_addr;
-    $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+    swg_cyclic_buffer_addressable #(
         .WIDTH(BUF_IN_WIDTH),
-        .DEPTH(BUF_ELEM_TOTAL)
+        .DEPTH(BUF_ELEM_TOTAL),
+        .RAM_STYLE($RAM_STYLE$)
     ) window_buffer_inst (
         .clk(ap_clk),
 
diff --git a/finn-rtllib/swg/swg_template_parallel.sv b/finn-rtllib/swg/swg_template_parallel.sv
new file mode 100644
index 0000000000000000000000000000000000000000..b55a51e4005a1a8332ebe74acb61eac10f246f7f
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_parallel.sv
@@ -0,0 +1,228 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$_wb
+#(
+    int unsigned IN_WIDTH          = 1, // bit-width*C*MMV_in
+    int unsigned OUT_ELEM_WIDTH    = 1, // bit-width*C
+    int unsigned OUT_WIDTH         = 1, // bit-width*C*MMV_out
+    int unsigned BUFFER_ELEM_TOTAL = 1
+)
+(
+    input logic clk,
+    input logic rst_n,
+    input logic shift_enable,
+    input logic [IN_WIDTH-1:0] data_in,
+    output logic [OUT_WIDTH-1:0] data_out
+);
+
+$GENERATE_REG_FIFOS$
+
+$GENERATE_BRAM_FIFOS$
+
+// fixed interconnect between linear buffers
+$GENERATE_BUFFER_CONNECTION$
+
+// fixed REG FIFO -> output mapping
+$GENERATE_OUTPUT_MAPPING$
+
+endmodule : $TOP_MODULE_NAME$_wb
+
+module $TOP_MODULE_NAME$_impl #(
+    int unsigned BIT_WIDTH,
+    int unsigned SIMD,
+    int unsigned MMV_IN,
+    int unsigned MMV_OUT,
+    int unsigned LAST_READ_ELEM = $LAST_READ_ELEM$,
+    int unsigned FIRST_WRITE_ELEM = $FIRST_WRITE_ELEM$,
+    int unsigned LAST_WRITE_ELEM = $LAST_WRITE_ELEM$,
+    int unsigned BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$,
+    int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$
+)(
+    input   logic  ap_clk,
+    input   logic  ap_rst_n,
+
+    input   logic  in0_V_V_TVALID,
+    output  logic  in0_V_V_TREADY,
+    input   logic [BIT_WIDTH * SIMD * MMV_IN-1:0]  in0_V_V_TDATA,
+
+    output  logic  out_V_V_TVALID,
+    input   logic  out_V_V_TREADY,
+    output  logic [BIT_WIDTH * SIMD * MMV_OUT-1:0]  out_V_V_TDATA
+);
+    // derived constants
+    localparam int unsigned  BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+    localparam int unsigned  BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
+    localparam int unsigned  BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+    // main buffer instantiation
+    uwire [BUF_IN_WIDTH -1:0] window_buffer_in;
+    uwire [BUF_OUT_WIDTH-1:0] window_buffer_out;
+    uwire window_buffer_shift_enable;
+    $TOP_MODULE_NAME$_wb
+    #(
+        .IN_WIDTH(BUF_IN_WIDTH),
+        .OUT_ELEM_WIDTH(BUF_OUT_ELEM_WIDTH),
+        .OUT_WIDTH(BUF_OUT_WIDTH),
+        .BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL)
+    )
+    window_buffer_inst
+    (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .data_in(window_buffer_in),
+        .shift_enable(window_buffer_shift_enable),
+        .data_out(window_buffer_out)
+    );
+
+    // controller instantiation
+    uwire  advance_controller;
+    uwire signed [INCR_BITWIDTH-1:0]  addr_incr;
+    uwire        [INCR_BITWIDTH-1:0]  tail_incr;
+    swg_controller #(
+        .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$),
+        .LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$),
+        .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$),
+        .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$),
+        .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$),
+        .HEAD_INCR_SIMD($HEAD_INCR_SIMD$),
+        .HEAD_INCR_KW($HEAD_INCR_KW$),
+        .HEAD_INCR_KH($HEAD_INCR_KH$),
+        .HEAD_INCR_W($HEAD_INCR_W$),
+        .HEAD_INCR_H($HEAD_INCR_H$),
+        .TAIL_INCR_W($TAIL_INCR_W$),
+        .TAIL_INCR_H($TAIL_INCR_H$),
+        .TAIL_INCR_LAST($TAIL_INCR_LAST$),
+        .INCR_BITWIDTH($INCR_BITWIDTH$),
+        .IS_DEPTHWISE($IS_DEPTHWISE$),
+        .INNERMOST_STATE($INNERMOST_STATE$)
+    )
+    controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr)
+    );
+
+    // counters/address registers
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0]  Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  Current_elem = FIRST_WRITE_ELEM;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  First_elem_next_window = 0;
+
+    // control registers/signals
+    logic  Writing_done  = 0;
+    logic  Write_done    = 0;
+    uwire  write_cmd     = !($signed(Current_elem) > Newest_buffered_elem) && !Writing_done;
+    uwire  write_ok      = write_cmd && (out_V_V_TREADY || Write_done);
+    uwire  write_blocked = write_cmd && !out_V_V_TREADY && !Write_done;
+
+    uwire  reading_done = Newest_buffered_elem == LAST_READ_ELEM;
+    uwire  read_cmd     =
+        !reading_done && ( // if there is still an input element left to read
+            Writing_done || ( // if writing is done (e.g. for skipped rows at FM end due to stride)
+                $signed(((Newest_buffered_elem - ($signed(BUF_ELEM_TOTAL) - 1)))) < $signed(First_elem_next_window) &&
+                $signed(((Newest_buffered_elem - ($signed(BUF_ELEM_TOTAL) - 1)))) < $signed(Current_elem)
+            ) // (over-)write to buffer if oldest buffered element will no longer be needed
+        );
+    uwire  read_ok      = read_cmd && in0_V_V_TVALID && !write_blocked;
+
+    //            includes waiting on W    if W-only cycle: wait only on W     no R/W to wait for
+    uwire advance       = read_ok        ||   (!read_cmd && write_ok)    || (!read_cmd && !write_cmd);
+
+    // assign buffer control
+    assign window_buffer_shift_enable = advance;
+    assign  advance_controller = write_ok;
+
+    // assign I/O ports
+    assign  window_buffer_in = in0_V_V_TDATA;
+    assign  out_V_V_TDATA = window_buffer_out;
+    assign  in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+    assign  out_V_V_TVALID = ap_rst_n && write_cmd && !Write_done; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    // write done logic
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Write_done <= 1'b0;
+        end
+        else begin
+            if (advance) begin
+                Write_done <= 1'b0; //reset flag
+            end else if (write_ok)  //successful W in this cycle, but R still outstanding
+                Write_done <= 1'b1; //write can happen even if read is blocked, but only for the current cycle!
+        end
+    end
+
+    // main process for advancing counters
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Newest_buffered_elem <= -1;
+            Current_elem <= FIRST_WRITE_ELEM;
+            First_elem_next_window <= 0;
+            Writing_done <= 0;
+        end
+        else begin
+            if (read_ok) begin
+                Newest_buffered_elem <= Newest_buffered_elem+1;
+
+                // check if this is the last read cycle (reading_done will be true afterwards)
+                if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin
+                    // start processing of next FM if writing is done already (possible due to unused input elements at the tail end)
+                    // todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM)
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= FIRST_WRITE_ELEM;
+                    First_elem_next_window <= 0;
+                    Writing_done <= 0;
+                end
+            end
+
+            if (write_ok) begin
+                First_elem_next_window <= First_elem_next_window + tail_incr;
+
+                // check if this is the last write cycle (Writing_done will be true afterwards)
+                if (Current_elem == LAST_WRITE_ELEM) begin
+                    Writing_done <= 1;
+
+                    if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin
+                        // start processing of next FM if reading is done already, or completes in the same cycle
+                        Newest_buffered_elem <= -1;
+                        Current_elem <= FIRST_WRITE_ELEM;
+                        First_elem_next_window <= 0;
+                        Writing_done <= 0;
+                    end
+                end
+                else
+                    Current_elem <= $signed(Current_elem) + addr_incr;
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_impl
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index e43a29d6327b779f37072175fb51defd0e76ddfd..a22b5adc9846af22833568065a0c31e6e9a4d111 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -416,6 +416,7 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi
         hw_attrs = [
             "PE",
             "SIMD",
+            "parallel_window",
             "ram_style",
             "resType",
             "mem_mode",
@@ -588,6 +589,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     hw_attrs = [
         "PE",
         "SIMD",
+        "parallel_window",
         "ram_style",
         "depth",
         "impl_style",
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
index 30861f01351d0f397762c04d3404b69b56e71167..173a1578417b76dfb8ae24c94f3d40616dbe0d55 100755
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -29,6 +29,7 @@
 import math
 import numpy as np
 import os
+import shutil
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.general import im2col
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
@@ -72,8 +73,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             "SIMD": ("i", True, 0),
             # additional parallelization parameter - not yet implemented
             "M": ("i", False, 1),
-            # alternative implementation style - not yet implemented
-            "parallel_window": ("i", False, 0, {0}),
+            # Enable parallel window output (requires full SIMD unfolding)
+            "parallel_window": ("i", False, 0, {0, 1}),
             "Stride": ("ints", True, []),  # [H, W] = [Y, X]
             "Dilation": ("ints", True, []),  # [H, W] = [Y, X]
             # FINN DataTypes for inputs, weights, outputs
@@ -81,7 +82,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             "outputDataType": ("s", True, ""),
             "depthwise": ("i", False, 0, {0, 1}),
             # Enable reprogrammable implementation to change FM dimensions,
-            # stride, or dilation during runtime
+            # stride, or dilation during runtime (requires parallel_window = 0)
             "dynamic_mode": ("i", False, 0, {0, 1}),
             # FPGA resource type for ConvolutionInputGenerator input buffer
             # auto -- let Vivado decide
@@ -188,6 +189,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             return self.get_instream_width()
 
     def get_number_input_values(self):
+        """Function to get the number of expected input values."""
         folded_ishape = self.get_folded_input_shape()
         num_input_elems = np.prod(folded_ishape[:-1])
         return num_input_elems
@@ -198,6 +200,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         return num_output_elems
 
     def get_1d_conv_attrs_normalized(self):
+        """Returns normalized spatial attributes, where H=1 for the 1D case."""
         # normalize FM dimensions so that:
         # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D].
         # The dummy ('1') dimension is the Y-dimension.
@@ -218,6 +221,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
 
     def get_buffer_depth(self):
+        """Returns total depth of the internal buffer, depending on
+        implementation style."""
         ifm_ch = self.get_nodeattr("IFMChannels")
         k = self.get_nodeattr("ConvKernelDim")
         ifm_dim = self.get_nodeattr("IFMDim")
@@ -233,13 +238,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         mmv_out = 1
         channel_factor = int(ifm_ch / simd)
 
+        # compute minimal buffer length (assuming it holds 1 complete window)
+        buffer_min_size = (
+            (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
+        ) * channel_factor
+
         impl_style = self.select_impl_style()
         if impl_style == "default":
-            # compute minimal buffer length (assuming it holds 1 complete window)
-            buffer_min_size = (
-                (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
-            ) * channel_factor
-
             # add additional buffer space in case of stride > 1
             # this minimizes cycle count as it allows an earlier pre-load of inputs
             buffer_depth = (
@@ -255,73 +260,89 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
                     * channel_factor,
                 )
             )
-        else:
-            buffer_depth = 0
-            raise Exception("Requested impl. style not implemented")
+        elif impl_style == "parallel":
+            buffer_depth = buffer_min_size + 1
         return buffer_depth
 
    def get_exp_cycles(self):
        """Returns the expected cycle count for processing one input frame,
        depending on the selected implementation style."""
        impl_style = self.select_impl_style()

        if impl_style == "parallel":
            # parallel style: cycle count equals the number of input stream
            # transactions plus a fixed overhead of 2 cycles
            exp_cycles = self.get_number_input_values() + 2
        elif impl_style == "default":
            simd = self.get_nodeattr("SIMD")
            ifm_ch = self.get_nodeattr("IFMChannels")
            k = self.get_nodeattr("ConvKernelDim")
            ifm_dim = self.get_nodeattr("IFMDim")
            ofm_dim = self.get_nodeattr("OFMDim")
            stride = self.get_nodeattr("Stride")
            dilation = self.get_nodeattr("Dilation")
            depthwise = self.get_nodeattr("depthwise")
            ifm_dim_h, ifm_dim_w = ifm_dim
            ofm_dim_h, ofm_dim_w = ofm_dim
            k_h, k_w = k
            stride_h, stride_w = stride
            dilation_h, dilation_w = dilation

            # number of cycles per "pixel": SIMD-folded channel groups
            channel_factor = int(ifm_ch / simd)
            if ifm_dim_h == 1 or ifm_dim_w == 1:
                # 1D case: use normalized attributes, so [H, W] = [1, W]
                (
                    ifm_ch,
                    [ifm_dim_h, ifm_dim_w],
                    [ofm_dim_h, ofm_dim_w],
                    [k_h, k_w],
                    [stride_h, stride_w],
                    [dilation_h, dilation_w],
                ) = self.get_1d_conv_attrs_normalized()

                if depthwise:
                    exp_cycles = (
                        +ofm_dim_w * k_w * channel_factor
                        + channel_factor * (k_w - 1) * (stride_w - 1)
                        - (k_w - 1)
                        + 2
                    )
                else:
                    exp_cycles = ofm_dim_w * k_w * channel_factor + 2
            else:
                # 2D case
                # minimal buffer occupancy before the first window is complete
                buffer_min_size = (
                    (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1
                ) * channel_factor
                # per output row: either window write-out or input read-in dominates
                cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor
                cycles_read_block = stride_w * ifm_dim_w * channel_factor
                max_cycles = max(cycles_write_block, cycles_read_block)
                if depthwise:
                    max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1)
                # initial buffering latency plus per-row processing
                exp_cycles = buffer_min_size + ofm_dim_h * max_cycles
                if depthwise:
                    # extra cycles to skip rows between vertical strides
                    exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor

        return int(exp_cycles)
 
     def bram_estimation(self):
         simd = self.get_nodeattr("SIMD")
         ram_style = self.get_nodeattr("ram_style")
+        impl_style = self.select_impl_style()
+        [k_h, k_w] = self.get_nodeattr("ConvKernelDim")
+        [ifm_dim_h, ifm_dim_w] = self.get_nodeattr("IFMDim")
+        [dilation_h, dilation_w] = self.get_nodeattr("Dilation")
 
-        # NOTE: Actual BRAM usage might be lower in some cases.
-        # This does not account for the exact Vivado behavior yet.
-        buffer_width = simd * self.get_input_datatype().bitwidth()
-        buffer_depth = self.get_buffer_depth()
         if ram_style == "block" or ram_style == "auto":
+            buffer_width = simd * self.get_input_datatype().bitwidth()
+            if impl_style == "default":
+                buffer_depth = self.get_buffer_depth()
+                buffer_count = 1
+            elif impl_style == "parallel":
+                if ifm_dim_h == 1 or ifm_dim_w == 1:
+                    return 0  # 1D case (no line buffers needed)
+                kernel_width = (k_w - 1) * dilation_w + 1
+                buffer_depth = (ifm_dim_w - kernel_width) + ifm_dim_w * (dilation_h - 1)
+                buffer_count = k_h - 1
+
+            # NOTE: Actual BRAM usage might be lower in some cases
+            # due to imperfect modeling of Vivado behavior
             if buffer_depth <= 512:
                 ram_width = 36
             elif buffer_depth <= 1024:
@@ -356,7 +377,9 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
                 remainder_cascade_width = math.ceil(buffer_width / remainder_width)
                 cascade_savings = ram_cascade_width - remainder_cascade_width
 
-            return int(ram_cascade_depth * ram_cascade_width - cascade_savings)
+            return int(
+                (ram_cascade_depth * ram_cascade_width - cascade_savings) * buffer_count
+            )
         else:
             return 0
 
@@ -374,15 +397,28 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
     def uram_estimation(self):
         simd = self.get_nodeattr("SIMD")
         ram_style = self.get_nodeattr("ram_style")
-        buffer_width = simd * self.get_input_datatype().bitwidth()
-        buffer_depth = self.get_buffer_depth()
+        impl_style = self.select_impl_style()
+        [k_h, k_w] = self.get_nodeattr("ConvKernelDim")
+        [ifm_dim_h, ifm_dim_w] = self.get_nodeattr("IFMDim")
+        [dilation_h, dilation_w] = self.get_nodeattr("Dilation")
 
         if ram_style == "ultra":
+            buffer_width = simd * self.get_input_datatype().bitwidth()
+            if impl_style == "default":
+                buffer_depth = self.get_buffer_depth()
+                buffer_count = 1
+            elif impl_style == "parallel":
+                if ifm_dim_h == 1 or ifm_dim_w == 1:
+                    return 0  # 1D case (no line buffers needed)
+                kernel_width = (k_w - 1) * dilation_w + 1
+                buffer_depth = (ifm_dim_w - kernel_width) + ifm_dim_w * (dilation_h - 1)
+                buffer_count = k_h - 1
+
             ram_depth = 4096
             ram_width = 72
             ram_cascade_depth = math.ceil(buffer_depth / ram_depth)
             ram_cascade_width = math.ceil(buffer_width / ram_width)
-            return int(ram_cascade_depth * ram_cascade_width)
+            return int(ram_cascade_depth * ram_cascade_width * buffer_count)
         else:
             return 0
 
@@ -457,8 +493,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
 
     def prepare_codegen_default(self):
-        # Default implementation style for MMV_out = 1: addressable cyclic buffer
-        # Computing incremental addressing scheme directly..
+        """Fills code generation dict for the default implementation style by computing
+        the incremental addressing scheme for the circular buffer."""
         if self.get_nodeattr("dynamic_mode"):
             template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv"
         else:
@@ -528,13 +564,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             )
             addr_incr_end_simd = -buffer_min_size + (channel_factor + 1)
 
-        # sanity check
+        # sanity check for wrap logic
         assert not (
             abs(addr_incr_end_window) > buffer_actual_size
-        ), "ERROR: W increment > buffer size, wrap logic doesn't account for this"
+        ), "ERROR: W increment > buffer size, try setting parallel_window=1"
         assert not (
             abs(addr_incr_end_row) > buffer_actual_size
-        ), "ERROR: H increment > buffer size, wrap logic doesn't account for this"
+        ), "ERROR: H increment > buffer size, try setting parallel_window=1"
 
         # set certain threshold indices to detect when reading/writing finishes
         code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)]
@@ -581,13 +617,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             # skip innermost SIMD loop completely
             if loop_kw_iterations == 1:
                 # skip innermost KW loop completely
-                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"]
+                code_gen_dict["$INNERMOST_STATE$"] = [str(3)]  # STATE_LOOP_KH
                 loop_kh_iterations -= 1  # -1 because state is initial state
             else:
-                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"]
+                code_gen_dict["$INNERMOST_STATE$"] = [str(2)]  # STATE_LOOP_KW
                 loop_kw_iterations -= 1  # -1 because state is initial state
         else:
-            code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"]
+            code_gen_dict["$INNERMOST_STATE$"] = [str(1)]  # STATE_LOOP_SIMD
             loop_simd_iterations -= 1  # -1 because state is initial state
 
         cntr_bitwidth = math.ceil(
@@ -639,7 +675,275 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
 
         return template_path, code_gen_dict
 
    def prepare_codegen_parallel(self):
        """Fills code generation dict for the parallel implementation style by computing
        the loop controller configuration and partitioning the fixed buffer into
        shift-registers (for parallel read access) and line buffers (for efficient
        LUTRAM/BRAM/URAM implementation)."""
        template_path = (
            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_parallel.sv"
        )
        code_gen_dict = {}

        # fetch all relevant node attributes
        ifm_ch = self.get_nodeattr("IFMChannels")
        k = self.get_nodeattr("ConvKernelDim")
        ifm_dim = self.get_nodeattr("IFMDim")
        stride = self.get_nodeattr("Stride")
        dilation = self.get_nodeattr("Dilation")
        simd = self.get_nodeattr("SIMD")
        M = self.get_nodeattr("M")

        k_h, k_w = k
        h, w = ifm_dim
        pad = [0, 0, 0, 0]  # padding happens in separate padding node for now
        stride_h, stride_w = stride
        dilation_h, dilation_w = dilation
        pad_h = pad[0] + pad[2]
        pad_w = pad[1] + pad[3]
        out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h)
        out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w)
        # parallel style emits the full k_h*k_w window per cycle (times M)
        mmv_in = M * 1
        mmv_out = M * k_h * k_w
        channel_factor = int(ifm_ch / simd)

        # compute minimal buffer length (assuming it holds 1 complete window)
        buffer_min_size = (
            (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
        ) * channel_factor

        buffer_actual_size = self.get_buffer_depth()
        code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)]

        # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation
        # or cols/rows that are skipped due to imperfect stride<->dim combination
        kernel_width = (k_w - 1) * dilation_w + 1
        kernel_height = (k_h - 1) * dilation_h + 1
        skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w)
        skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h)

        # set certain threshold indices to detect when reading/writing finishes
        code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)]
        code_gen_dict["$LAST_WRITE_ELEM$"] = [
            str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1)
        ]

        # re-use default controller loop structure
        code_gen_dict["$IS_DEPTHWISE$"] = ["0"]
        loop_h_iterations = out_dim_h
        loop_w_iterations = out_dim_w  # now the innermost loop
        loop_kh_iterations = 1
        loop_kw_iterations = 1
        loop_simd_iterations = 1

        if loop_w_iterations == 1:
            code_gen_dict["$INNERMOST_STATE$"] = [str(5)]  # STATE_LOOP_H
            loop_h_iterations -= 1  # -1 because state is initial state
        else:
            code_gen_dict["$INNERMOST_STATE$"] = [str(4)]  # STATE_LOOP_W
            loop_w_iterations -= 1  # -1 because state is initial state

        # set head and tail address increment values
        addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1
        addr_incr_end_row = (
            -buffer_min_size
            + ((skip_columns + kernel_width) * channel_factor)  # remaining line
            + ((stride_h - 1) * w * channel_factor)  # skip lines
            + 1
        )

        tail_incr_w = addr_incr_end_window + buffer_min_size - 1
        tail_incr_h = addr_incr_end_row + buffer_min_size - 1
        tail_incr_last_window = stride_w

        # KW/KH/SIMD loops collapse to single iterations in parallel mode,
        # so their head increments are simply 1; W/H reuse the tail values
        addr_incr_end_simd = 1
        addr_incr_end_window_elem = 1
        addr_incr_end_window_row = 1
        addr_incr_end_window = tail_incr_w
        addr_incr_end_row = tail_incr_h

        # add init value for CURRENT_ELEM counter = last elem of first window
        code_gen_dict["$FIRST_WRITE_ELEM$"] = [str(buffer_min_size - 1)]

        # sized to hold the largest loop iteration count
        cntr_bitwidth = math.ceil(
            math.log2(
                max(
                    loop_h_iterations - 2 + 1,
                    loop_w_iterations - 2 + 1,
                    loop_kh_iterations - 2 + 1,
                    loop_kw_iterations - 2 + 1,
                    loop_simd_iterations - 2 + 1,
                )
            )
        )
        code_gen_dict["$CNTR_BITWIDTH$"] = [str(cntr_bitwidth)]
        code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 2)]
        code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 2)]
        code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 2)]
        code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 2)]
        code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 2)]

        # signed increments: +1 bit for the sign, sized to the largest magnitude
        incr_bitwidth = 1 + math.ceil(
            math.log2(
                max(
                    abs(addr_incr_end_simd) + 1,
                    abs(addr_incr_end_window_elem) + 1,
                    abs(addr_incr_end_window_row) + 1,
                    abs(addr_incr_end_window) + 1,
                    abs(addr_incr_end_row) + 1,
                    abs(tail_incr_w) + 1,
                    abs(tail_incr_h) + 1,
                    abs(tail_incr_last_window) + 1,
                )
            )
        )
        code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)]
        code_gen_dict["$HEAD_INCR_SIMD$"] = [str(addr_incr_end_simd)]
        code_gen_dict["$HEAD_INCR_KW$"] = [str(addr_incr_end_window_elem)]
        code_gen_dict["$HEAD_INCR_KH$"] = [str(addr_incr_end_window_row)]
        code_gen_dict["$HEAD_INCR_W$"] = [str(addr_incr_end_window)]
        code_gen_dict["$HEAD_INCR_H$"] = [str(addr_incr_end_row)]
        code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)]
        code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)]
        code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)]

        code_gen_dict["$SIMD$"] = [str(simd)]
        code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
        code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]

        # prepare buffer partitioning into "reg_fifos" and "bram_fifos"
        # use normalized ([H,W]=[1,W]) dimensions for 1D case
        (
            ifm_ch,
            [ifm_dim_h, ifm_dim_w],
            [ofm_dim_h, ofm_dim_w],
            [k_h, k_w],
            [stride_h, stride_w],
            [dilation_h, dilation_w],
        ) = self.get_1d_conv_attrs_normalized()

        # one shift-register FIFO per kernel row; consecutive rows are joined
        # by RAM-based line buffers covering the rest of the image line
        reg_fifos = []
        bram_fifos_depth = []

        px_idx = 0
        for ky in range(k_h):
            reg_fifo = []
            for kx in range(k_w):
                reg_fifo.append(px_idx)
                px_idx += 1
                if kx < (k_w - 1):
                    # -1 marks dilation gap positions: buffered, but skipped
                    # by the output mapping below
                    reg_fifo.extend([-1] * (dilation_w - 1))
                    px_idx += dilation_w - 1
            reg_fifos.append(reg_fifo)

            if ky < (k_h - 1):
                line_buffer_len = (w - kernel_width) + w * (dilation_h - 1)
                bram_fifos_depth.append(line_buffer_len)
                px_idx += line_buffer_len

        code_gen_dict["$GENERATE_REG_FIFOS$"] = []
        for i, reg_fifo in enumerate(reg_fifos):
            code_gen_dict["$GENERATE_REG_FIFOS$"].append(
                """
                wire [IN_WIDTH-1:0] reg_fifo_{id}_in;
                wire [IN_WIDTH-1:0] reg_fifo_{id}_out;
                wire [IN_WIDTH*{len}-1:0] reg_fifo_{id};
                swg_reg_buffer
                #(
                .WIDTH(IN_WIDTH),
                .DEPTH({len})
                )
                reg_buffer_inst_{id}
                (
                    .clk(clk),
                    .shift_enable(shift_enable),
                    .shift_in(reg_fifo_{id}_in),
                    .shift_out(reg_fifo_{id}_out),
                    .data_out(reg_fifo_{id})
                );""".format(
                    id=i,
                    len=len(reg_fifo),
                )
            )

        code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
        for i, bram_fifo_depth in enumerate(bram_fifos_depth):
            code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
                """
                wire [IN_WIDTH-1:0] bram_fifo_{id}_in;
                wire [IN_WIDTH-1:0] bram_fifo_{id}_out;
                swg_ram_buffer
                #(
                .WIDTH(IN_WIDTH),
                .DEPTH({len}),
                .RAM_STYLE("{ram_style}")
                )
                ram_buffer_inst_{id}
                (
                    .clk(clk),
                    .rst_n(rst_n),
                    .shift_enable(shift_enable),
                    .shift_in(bram_fifo_{id}_in),
                    .shift_out(bram_fifo_{id}_out)
                );""".format(
                    id=i,
                    len=bram_fifo_depth,
                    ram_style=self.get_nodeattr("ram_style"),
                )
            )

        # wire every non-gap buffer position to its slot in the output vector
        code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
        out_idx = mmv_out - 1
        for fifo_id, reg_fifo in enumerate(reg_fifos):
            for fifo_idx, access_idx in enumerate(reg_fifo):
                if access_idx != -1:
                    code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
                        """assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH]
                        = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+
                        OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];""".format(
                            out_idx=out_idx,
                            fifo_id=fifo_id,
                            access_idx=len(reg_fifo)
                            - 1
                            - int((max(reg_fifo) - access_idx) / M),
                            mmv_idx=(max(reg_fifo) - access_idx) % M,
                            mmv=M,
                        )
                    )
                    # reversal: out_idx=0 -> oldest buffer element -> highest access_idx
                    out_idx = out_idx - 1
        assert out_idx == -1, "ERROR: Not all output vector elements connected"

        # chain the buffers: input -> reg fifo 0 -> line buffer 0 -> reg fifo 1 -> ...
        code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = []
        for i in range(len(reg_fifos)):
            if i == 0:
                # first FIFO containing newest elements -> input comes from input reg
                code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
                    """assign reg_fifo_{fifo_id}_in = data_in;""".format(
                        fifo_id=i,
                    )
                )
            else:
                # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer)
                input_fifo_id = i - 1
                code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
                    """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;
                    """.format(
                        fifo_id=i, input_fifo_id=input_fifo_id
                    )
                )
        for i in range(len(bram_fifos_depth)):
            input_fifo_id = i
            code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
                """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;
                """.format(
                    fifo_id=i, input_fifo_id=input_fifo_id
                )
            )

        return template_path, code_gen_dict
+
     def select_impl_style(self):
+        """Selects implementation style based on folding configuration."""
         simd = self.get_nodeattr("SIMD")
         M = self.get_nodeattr("M")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -685,17 +989,20 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         else:
             impl_style = "default"
 
-        assert (
-            impl_style == "default"
-        ), "ERROR: Parallel window mode not yet implemented"
         return impl_style
 
     def generate_hdl(self):
+        """Generates HDL code and wrapper for the IP, depending on required
+        implementation style."""
         impl_style = self.select_impl_style()
 
         # prepare code generation by filling out dictionaries
         if impl_style == "default":
             template_path, code_gen_dict = self.prepare_codegen_default()
+        elif impl_style == "parallel":
+            template_path, code_gen_dict = self.prepare_codegen_parallel()
+            if self.get_nodeattr("dynamic_mode"):
+                raise Exception("Dynamic mode is not compatible with parallel_window")
         else:
             raise Exception("Requested impl. style not implemented")
 
@@ -706,10 +1013,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
         code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())]
         ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "auto":
-            code_gen_dict["$RAM_STYLE$"] = [""]
-        else:
-            code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)]
+        code_gen_dict["$RAM_STYLE$"] = ['"{}"'.format(ram_style)]
 
         # apply code generation to templates
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -756,6 +1060,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             ) as f:
                 f.write(template_axilite)
 
+        # Copy static source file for common core components
+        shutil.copy2(
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_common.sv", code_gen_dir
+        )
+
         # set ipgen_path and ip_path so that HLS-Synth transformation
         # and stich_ip transformation do not complain
         self.set_nodeattr("ipgen_path", code_gen_dir)
@@ -775,6 +1084,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         verilog_files = [
             self.get_nodeattr("gen_top_module") + "_wrapper.v",
             self.get_nodeattr("gen_top_module") + "_impl.sv",
+            "swg_common.sv",
         ]
         if self.get_nodeattr("dynamic_mode"):
             verilog_files.append(self.get_nodeattr("gen_top_module") + "_axilite.v")
@@ -798,6 +1108,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         sourcefiles = [
             self.get_nodeattr("gen_top_module") + "_wrapper.v",
             self.get_nodeattr("gen_top_module") + "_impl.sv",
+            "swg_common.sv",
         ]
 
         if self.get_nodeattr("dynamic_mode"):
@@ -835,6 +1146,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         apply (e.g. component must be synthesized for largest buffer size)."""
         # NOTE: For better driver integration, this functionality could be packaged
         # as a standalone function in the future
+        if self.select_impl_style() != "default":
+            raise Exception("Impl. style is incompatible with dynamic mode")
 
         if ifm_dim is None:
             ifm_dim = self.get_nodeattr("IFMDim")
@@ -887,44 +1200,53 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         return config
 
    def code_generation_ipgen(self, model, fpgapart, clk):
        """Generates (System-)Verilog code for IP generation (instead of HLS code)."""
        self.generate_hdl()

    # The remaining overrides are HLS-flow hooks inherited from HLSCustomOp;
    # they have no counterpart for this RTL-backed node and are left as no-ops.

    def ipgen_singlenode_code(self):
        """Not implemented (RTL component)."""
        pass

    def code_generation_cppsim(self, model):
        """Not implemented (RTL component)."""
        pass

    def compile_singlenode_code(self):
        """Not implemented (RTL component)."""
        pass

    def global_includes(self):
        """Not implemented (RTL component)."""
        pass

    def defines(self, var):
        """Not implemented (RTL component)."""
        pass

    def read_npy_data(self):
        """Not implemented (RTL component)."""
        pass

    def strm_decl(self):
        """Not implemented (RTL component)."""
        pass

    def docompute(self):
        """Not implemented (RTL component)."""
        pass

    def dataoutstrm(self):
        """Not implemented (RTL component)."""
        pass

    def save_as_npy(self):
        """Not implemented (RTL component)."""
        pass

    def blackboxfunction(self):
        """Not implemented (RTL component)."""
        pass

    def pragmas(self):
        """Not implemented (RTL component)."""
        pass
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index eaafebebf5457548a14bada635d4fcb55eb9390d..fcfe9e7727b60bd33fea7a56d312cf4789c0189a 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -140,19 +140,7 @@ class InferConvInpGen(Transformation):
                     k_h > 1 and k_w == 1 and ifm_dim_w == 1
                 )
 
-                # Ensure that RTL variant is not inserted for unsupported configuration
-                is_rtl_variant_compatible = True
-                if is_kernel_pointwise:
-                    is_rtl_variant_compatible = False
-                    if self.use_rtl_variant:
-                        warnings.warn(
-                            """%s : RTL ConvInpGen requested for unsupported
-                                configuration. Falling back to HLS implementation."""
-                            % n.name
-                        )
-
-                if self.use_rtl_variant and is_rtl_variant_compatible:
-
+                if self.use_rtl_variant:
                     ConvInpGen_node = helper.make_node(
                         "ConvolutionInputGenerator_rtl",
                         [ConvInpGen_input],
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 2301fccdd4fff6310340ffe1dd8de7732a4f9bd4..0a466afe13fa3b96b7bcfefede16cfdf2cb5449c 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -75,7 +75,9 @@ class SetFolding(Transformation):
     * the producer of the node is expected to be a ConvolutionInputGenerator
       with depthwise=1, whose SIMD value will be set equal to the PE value of
       its consumer node
-
+    * the VVAU also supports SIMD ("input window") parallelism in addition to
+      PE ("channels"), but current ConvInpGen limitations require PE to be fully
+      unfolded before SIMD is increased
     """
 
     def __init__(
@@ -106,7 +108,9 @@ class SetFolding(Transformation):
             "Thresholding_Batch",
         ]
         # these ops use SIMD parallelism, up to a max value of NumChannels
-        # ConvolutionInputGenerator has a special case when depthwise=1
+        # ConvolutionInputGenerator* has a special case when depthwise=1
+        # ConvolutionInputGenerator_rtl supports additional parallelism by
+        # setting parallel_window=1 mode after maxing out SIMD
         simd_ops = [
             "DownSampler",
             "FMPadding_Batch",
@@ -154,15 +158,36 @@ class SetFolding(Transformation):
                 max_pe = node_inst.get_nodeattr("Labels")
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
             elif op_type in depthwise_op_exceptions:
+                # init/reset SIMD of VVAU
+                if op_type == "VectorVectorActivation":
+                    node_inst.set_nodeattr("SIMD", 1)
                 max_pe = node_inst.get_nodeattr("Channels")
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
+                # increase SIMD for VVAU once PE is exhausted
+                pe = node_inst.get_nodeattr("PE")
+                cyc = node_inst.get_exp_cycles()
+                if (
+                    op_type == "VectorVectorActivation"
+                    and pe == max_pe
+                    and cyc > self.target_cycles_per_frame
+                ):
+                    max_simd = np.prod(node_inst.get_nodeattr("Kernel"))
+                    self.optimize_attribute_val(node_inst, max_simd, "SIMD")
                 # also set the folding of the upsteam DW SWU
                 # which must be identical to this node
                 swu_node = model.find_producer(node.input[0])
                 if swu_node.op_type.startswith("ConvolutionInputGenerator"):
                     swu_node_inst = getCustomOp(swu_node)
-                    pe = node_inst.get_nodeattr("PE")
                     swu_node_inst.set_nodeattr("SIMD", pe)
+                    # enable parallel_window mode of RTL SWG if needed
+                    if swu_node.op_type == "ConvolutionInputGenerator_rtl":
+                        if (
+                            op_type == "VectorVectorActivation"
+                            and node_inst.get_nodeattr("SIMD") > 1
+                        ):
+                            swu_node_inst.set_nodeattr("parallel_window", 1)
+                        else:
+                            swu_node_inst.set_nodeattr("parallel_window", 0)
                 else:
                     if op_type == "VectorVectorActivation":
                         ksize = np.prod(node_inst.get_nodeattr("Kernel"))
@@ -179,7 +204,19 @@ class SetFolding(Transformation):
                     depthwise = node_inst.get_nodeattr("depthwise")
                     if depthwise == 0:
                         max_simd = node_inst.get_nodeattr("IFMChannels")
+                        # init/reset parallel_window mode of RTL SWG
+                        if op_type == "ConvolutionInputGenerator_rtl":
+                            node_inst.set_nodeattr("parallel_window", 0)
                         self.optimize_attribute_val(node_inst, max_simd, "SIMD")
+                        # enable parallel_window mode of RTL SWG if needed
+                        simd = node_inst.get_nodeattr("SIMD")
+                        cyc = node_inst.get_exp_cycles()
+                        if (
+                            op_type == "ConvolutionInputGenerator_rtl"
+                            and simd == max_simd
+                            and cyc > self.target_cycles_per_frame
+                        ):
+                            node_inst.set_nodeattr("parallel_window", 1)
                     else:
                         # depthwise SWGs are handled separately
                         continue
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index de31ef0f125cb96ea82f953eadb9d5ccf7aab16c..7b2793712d41d2a73545c1b5632071334a2ac694 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -73,9 +73,6 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod
     if use_rtl_swg and exec_mode == "cppsim":
         pytest.skip("cppsim not supported for RTL SWG")
 
-    if use_rtl_swg and kernel_size == 1:
-        pytest.skip("1x1 kernel not supported by current RTL SWG")
-
     if depthwise is True:
         group = out_chn = in_chn
         conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
@@ -164,7 +161,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod
     inp_dict = {model.graph.input[0].name: x}
     assert oxe.compare_execution(model, new_model, inp_dict)
 
-    if kernel_size == 1 and stride > 1 and pad == 0:
+    if not use_rtl_swg and kernel_size == 1 and stride > 1 and pad == 0:
         assert new_model.graph.node[1].op_type == "DownSampler"
         if exec_mode == "rtlsim":
             node = new_model.get_nodes_by_op_type("DownSampler")[0]
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
index 58fc5ec04cc471b0e8f201e235ac9bd033e3f5c4..e8236c0c6b8c7fbb9e4394fb3cada785f9c050ac 100755
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
@@ -140,9 +140,9 @@ def prepare_inputs(input_tensor):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType["UINT4"]])
 # kernel size
-@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]])
+@pytest.mark.parametrize("k", [[3, 3], [1, 5]])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]])
+@pytest.mark.parametrize("ifm_dim", [[13, 13], [1, 21]])
 # input channels
 @pytest.mark.parametrize("ifm_ch", [6])
 # Stride
@@ -152,9 +152,9 @@ def prepare_inputs(input_tensor):
 # depthwise
 @pytest.mark.parametrize("dw", [0, 1])
 # input channel parallelism ("SIMD")
-@pytest.mark.parametrize("simd", [1, 2, 3, 6])
+@pytest.mark.parametrize("simd", [1, 3, 6])
 # parallel_window enable (MMV_out = M*K)
-@pytest.mark.parametrize("parallel_window", [0])
+@pytest.mark.parametrize("parallel_window", [0, 1])
 # in/out MMV ("M")
 @pytest.mark.parametrize("m", [1])
 # Flip dimensions
@@ -198,12 +198,13 @@ def test_fpgadataflow_slidingwindow_rtl(
         pytest.skip(
             "Illegal convolution configuration: kernel or stride > FM dimension"
         )
-    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
-        k_w == 1 and (stride_w != 1 or dilation_w != 1)
-    ):
+    if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1):
         pytest.skip(
-            """Illegal convolution configuration:
-            stride or dilation defined for unitary kernel dim"""
+            "Illegal convolution configuration: dilation for unitary kernel dim"
+        )
+    if ((stride_h > k_h) or (stride_w > k_w)) and not parallel_window:
+        pytest.skip(
+            "Not all combinations for stride > k edge case supported in default mode"
         )
     if k_h == 1 and k_w == 1 and simd != ifm_ch:
         pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)")