diff --git a/finn-rtllib/swg/swg_hdl_template.v b/finn-rtllib/swg/swg_hdl_template.v
index 88ef58531a8eb396c9777aa6cc2df89e60f74c5f..98ea1cf9cdef64635f35a8d2473fc0943151be89 100755
--- a/finn-rtllib/swg/swg_hdl_template.v
+++ b/finn-rtllib/swg/swg_hdl_template.v
@@ -1,4 +1,4 @@
-`timescale 1 ns / 1 ps 
+`timescale 1 ns / 1 ps
 
 module $TOP_MODULE_NAME$_controller
 (
@@ -14,7 +14,7 @@ output cmd_read;
 output cmd_write;
 
 ////code generation part:
-//mapping of R/W command values to each state (START, MAIN_1, MAIN_2, INTER_1, INTER_2, END_1, END_2)            
+//mapping of R/W command values to each state (START, MAIN_1, MAIN_2, INTER_1, INTER_2, END_1, END_2)
 localparam [0:6] READ_CMD_MAP = $READ_CMD_MAP$;
 localparam [0:6] WRITE_CMD_MAP = $WRITE_CMD_MAP$;
 
@@ -37,7 +37,7 @@ integer counter_loop_main;
 integer counter_loop_inter;
 
 assign cmd_read = READ_CMD_MAP[state_next]; //read command indicates read in *upcoming* cycle, due to how schedule is constructed
-assign cmd_write = WRITE_CMD_MAP[state]; 
+assign cmd_write = WRITE_CMD_MAP[state];
 
 reg cycle_last;
 wire cycle_advance;
@@ -66,7 +66,7 @@ always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin
                         //there might not be an end sequence -> restart immediately
                         if (LOOP_END_1_COUNTER != 0)
                             state_next = STATE_END_1;
-                        else 
+                        else
                             state_next = STATE_START;
                     end
                 end
@@ -77,7 +77,7 @@ always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin
             if (counter_current == LOOP_INTER_1_COUNTER-1) begin
                 if (LOOP_INTER_2_COUNTER != 0)
                     state_next = STATE_LOOP_INTER_2;
-                else 
+                else
                     state_next = STATE_LOOP_MAIN_1;
             end
         end
@@ -141,6 +141,113 @@ always @ (posedge CLK) begin
 end
 endmodule //controller
 
+module $TOP_MODULE_NAME$_reg_buffer
+#(
+    parameter WIDTH = 1,
+    parameter DEPTH = 1
+)
+(
+    CLK,
+    shift_enable,
+    shift_in,
+    shift_out,
+    data_out
+);
+
+input CLK, shift_enable;
+input [WIDTH-1:0] shift_in;
+output [WIDTH-1:0] shift_out;
+output [WIDTH*DEPTH-1:0] data_out;
+
+//UG901 template for SRL inference:
+// 32-bit Shift Register
+// Rising edge clock
+// Active high clock enable
+// For-loop based template
+// File: shift_registers_1.v
+//
+//module shift_registers_1 (clk, clken, SI, SO);
+//parameter WIDTH = 32; 
+//input clk, clken, SI; 
+//output SO;
+//reg [WIDTH-1:0] shreg;
+//
+//integer i;
+//always @(posedge clk)
+//begin
+//  if (clken)
+//    begin
+//    for (i = 0; i < WIDTH-1; i = i+1)
+//        shreg[i+1] <= shreg[i];
+//      shreg[0] <= SI; 
+//    end
+//end
+//assign SO = shreg[WIDTH-1];
+//endmodule
+
+reg [WIDTH-1:0] data [DEPTH-1:0];
+
+assign shift_out = data[DEPTH-1];
+
+for (genvar e=0; e<DEPTH; e=e+1)
+    assign data_out[e*WIDTH +: WIDTH] = data[e];
+
+always @ (posedge CLK) begin
+    if (shift_enable) begin
+        for (integer i=DEPTH-1; i>0; i=i-1)
+            data[i] <= data[i-1];
+        data[0] <= shift_in;
+    end
+end
+endmodule //reg_buffer
+
+module $TOP_MODULE_NAME$_ram_buffer
+#(
+    parameter WIDTH = 1,
+    parameter DEPTH = 1
+)
+(
+    CLK,
+    RST,
+    shift_enable,
+    shift_in,
+    shift_out
+);
+
+input CLK, RST, shift_enable;
+input [WIDTH-1:0] shift_in;
+output [WIDTH-1:0] shift_out;
+
+reg [WIDTH-1:0] out_reg;
+assign shift_out = out_reg;
+
+integer addr_w, addr_r; //todo: minimize width (as reg), make r addr depend on w
+
+(* ram_style = "block" *) reg [WIDTH-1:0] ram [DEPTH-1:0];
+
+always @(posedge CLK) begin 
+    if (RST == 1'b0) begin
+        addr_w <= 0;
+        addr_r <= 1;
+    end else begin
+        if (shift_enable) begin
+            ram[addr_w] <= shift_in;
+            out_reg <= ram[addr_r];
+
+            if (addr_w == DEPTH-1)
+                addr_w <= 0;
+            else
+                addr_w <= addr_w + 1;
+
+            if (addr_r == DEPTH-1)
+                addr_r <= 0;
+            else
+                addr_r <= addr_r + 1;
+        end
+    end
+end
+endmodule //ram_buffer
+
 module $TOP_MODULE_NAME$_wb
 #(
     parameter IN_WIDTH = 1, //bit-width*C*MMV_in
@@ -150,12 +257,13 @@ module $TOP_MODULE_NAME$_wb
 )
 (
     CLK,
+    RST,
     data_in,
     shift_enable,
     data_out
 );
 
-input CLK;
+input CLK, RST;
 input [IN_WIDTH-1:0] data_in;
 input shift_enable;
 output [OUT_WIDTH-1:0] data_out;
@@ -163,24 +271,20 @@ output [OUT_WIDTH-1:0] data_out;
 //Input REG to enable simultaneous R/W
 reg [IN_WIDTH-1:0] reg_input;
 
-//REG FIFOs
 $GENERATE_REG_FIFOS$
 
-//BRAM FIFOs
-//todo: generate real BRAM shift buffers if these get too large
 $GENERATE_BRAM_FIFOS$
 
+//Fixed interconnect between linear buffers
+$GENERATE_BUFFER_CONNECTION$
+
 //Fixed REG FIFO <-> output mapping
 $GENERATE_OUTPUT_MAPPING$
 
-//main process
+//input register logic
 integer i;
 always @ (posedge CLK) begin
     if (shift_enable) begin
-        //shift logic
-        $GENERATE_SHIFT_LOGIC$
-
-        //shift in new data
         reg_input <= data_in;
     end
 end
@@ -234,6 +338,7 @@ $TOP_MODULE_NAME$_wb
 window_buffer_inst
 (
     .CLK(ap_clk),
+    .RST(ap_rst_n),
     .data_in(window_buffer_in),
     .shift_enable(window_buffer_shift_enable),
     .data_out(window_buffer_out)
@@ -291,9 +396,9 @@ always @ (posedge ap_clk) begin
             //count cycle (completed R or W or both (depending on current cycle))
             if (cycle == CYCLES_TOTAL-1)
                 cycle <= 0;
-            else 
-                cycle <= cycle+1; 
-        
+            else
+                cycle <= cycle+1;
+
         end else if (write_ok) // successful W in this cycle, but R still outstanding
             write_done <= 1'b1; //write can happen even if read is blocked, but only for the current cycle!
     end
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
index a54dea91670ae46246bf7a47b1d2270f837faa11..cfd6572a8dd6e5b047c79e9d959f62a39663e166 100755
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -505,7 +505,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         # example:
         # 0: only consecutive access patterns will be implemented in regs, rest in BRAM line buffers
         # 2: [0, 3, 6] access pattern is still allowed and will be implemented with 1 7-position shift reg
-        REG_BRAM_THRESHOLD = 9999
+        REG_BRAM_THRESHOLD = 8
         #--------------------
 
         in_shape = (n,c,h,w) #NCHW
@@ -932,11 +932,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
                     bram_fifos_depth.append(math.ceil((distance-1)/M)) # really ceil?
                     # start with new REG FIFO
                     reg_fifos.append(current)
-                    reg_fifos_depth.append(math.ceil((max(current)+1)/M))
+                    #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) ToDo: fix for M again
+                    reg_fifos_depth.append(len(current))
                     current = []
                     current.append(access_idx)
         reg_fifos.append(current)
-        reg_fifos_depth.append(math.ceil((max(current)+1)/M))
+        #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) ToDo fix for M again
+        reg_fifos_depth.append(len(current))
 
         f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD)
         f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos))
@@ -947,17 +949,43 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         code_gen_dict["$GENERATE_REG_FIFOS$"] = []
         for i in range(len(reg_fifos)):
             code_gen_dict["$GENERATE_REG_FIFOS$"].append(
-                """parameter reg_fifo_{id}_len = {len};
-                reg [IN_WIDTH-1:0] reg_fifo_{id} [reg_fifo_{id}_len-1:0];
-                """.format(id=i, len=reg_fifos_depth[i]))
-        
-        #todo: generate actual bram shift buffers instead of regs
+                """
+                wire [IN_WIDTH-1:0] reg_fifo_{id}_in;
+                wire [IN_WIDTH-1:0] reg_fifo_{id}_out;
+                wire [IN_WIDTH*{len}-1:0] reg_fifo_{id};
+                {name}_reg_buffer
+                #(
+                .WIDTH(IN_WIDTH),
+                .DEPTH({len})
+                )
+                reg_buffer_inst_{id}
+                (
+                    .CLK(CLK),
+                    .shift_enable(shift_enable),
+                    .shift_in(reg_fifo_{id}_in),
+                    .shift_out(reg_fifo_{id}_out),
+                    .data_out(reg_fifo_{id})
+                );""".format(name=self.get_verilog_top_module_name(), id=i, len=reg_fifos_depth[i]))
+
         code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
         for i in range(len(bram_fifos)):
             code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
-                """parameter bram_fifo_{id}_len = {len};
-                reg [IN_WIDTH-1:0] bram_fifo_{id} [bram_fifo_{id}_len-1:0];
-                """.format(id=i, len=bram_fifos_depth[i]))
+                """
+                wire [IN_WIDTH-1:0] bram_fifo_{id}_in;
+                wire [IN_WIDTH-1:0] bram_fifo_{id}_out;
+                {name}_ram_buffer
+                #(
+                .WIDTH(IN_WIDTH),
+                .DEPTH({len})
+                )
+                ram_buffer_inst_{id}
+                (
+                    .CLK(CLK),
+                    .RST(RST),
+                    .shift_enable(shift_enable),
+                    .shift_in(bram_fifo_{id}_in),
+                    .shift_out(bram_fifo_{id}_out)
+                );""".format(name=self.get_verilog_top_module_name(), id=i, len=bram_fifos_depth[i]))
 
         code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
         out_idx = mmv_out-1
@@ -970,46 +998,32 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
                     #    )
                     #)
                     code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
-                        "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}][OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format(
+                        "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format(
                             out_idx=out_idx, fifo_id=fifo_id, 
                             access_idx=reg_fifos_depth[fifo_id]-1-int((max(reg_fifo)-access_idx)/M), 
-                            mmv_idx=(max(reg_fifo)-access_idx)%M
+                            mmv_idx=(max(reg_fifo)-access_idx)%M,
+                            mmv = M
                         )
                     )
                     # reversal: out_idx=0 -> oldest buffer element -> highest access_idx
                     out_idx = out_idx-1
         assert out_idx==-1, "ERROR: Not all output vector elements connected"
 
-        code_gen_dict["$GENERATE_SHIFT_LOGIC$"] = []
+        code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = []
         for i in range(len(reg_fifos)):
             if i == 0:
                 # first FIFO containing newest elements -> input comes from input reg
-                code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append(
-                    """for (i=reg_fifo_{fifo_id}_len-1; i>0; i=i-1)
-                        reg_fifo_{fifo_id}[i] <= reg_fifo_{fifo_id}[i-1];
-                    reg_fifo_{fifo_id}[0] <= reg_input;""".format(
-                        fifo_id=i,
-                    )
-                )
+                code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
+                    """assign reg_fifo_{fifo_id}_in = reg_input;""".format(fifo_id=i,))
             else:
                 # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer)
                 input_fifo_id = i-1
-                code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append(
-                    """for (i=reg_fifo_{fifo_id}_len-1; i>0; i=i-1)
-                        reg_fifo_{fifo_id}[i] <= reg_fifo_{fifo_id}[i-1];
-                    reg_fifo_{fifo_id}[0] <= bram_fifo_{input_fifo_id} [bram_fifo_{input_fifo_id}_len-1];""".format(
-                        fifo_id=i, input_fifo_id=input_fifo_id
-                    )
-                )
+                code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
+                    """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
         for i in range(len(bram_fifos)):
             input_fifo_id = i
-            code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append(
-                """for (i=bram_fifo_{fifo_id}_len-1; i>0; i=i-1)
-                    bram_fifo_{fifo_id}[i] <= bram_fifo_{fifo_id}[i-1];
-                bram_fifo_{fifo_id}[0] <= reg_fifo_{input_fifo_id} [reg_fifo_{input_fifo_id}_len-1];""".format(
-                    fifo_id=i, input_fifo_id=input_fifo_id
-                )
-            )
+            code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append(
+                """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id))
 
         # Generate read schedule (when data is read from input, written to buffer)
         # code_gen_dict["$GENERATE_READ_SCHEDULE$"] = []