diff --git a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v
index a1c8e066d7290699575c94b0ac939d5a3fb27b19..28acb301a583f7437c580744bae7bdc4aef76337 100644
--- a/finn-rtllib/memstream/hdl/memstream.v
+++ b/finn-rtllib/memstream/hdl/memstream.v
@@ -1,465 +1,467 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module memstream
-#(
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
-    parameter CONFIG_EN = 1,
-    parameter NSTREAMS = 6,//1 up to 6
-
-    parameter MEM_DEPTH = 13824,
-    parameter MEM_WIDTH = 32,
-    parameter MEM_INIT = "./",
-    
-    //widths per stream
-	parameter STRM0_WIDTH = 32,
-	parameter STRM1_WIDTH = 32,
-	parameter STRM2_WIDTH = 32,
-	parameter STRM3_WIDTH = 32,
-	parameter STRM4_WIDTH = 32,
-	parameter STRM5_WIDTH = 32,
-
-	//depths per stream
-	parameter STRM0_DEPTH = 2304,
-	parameter STRM1_DEPTH = 2304,
-	parameter STRM2_DEPTH = 2304,
-	parameter STRM3_DEPTH = 2304,
-	parameter STRM4_DEPTH = 2304,
-	parameter STRM5_DEPTH = 2304,
-
-	//offsets for each stream
-	parameter STRM0_OFFSET = 0,
-	parameter STRM1_OFFSET = 2304,
-	parameter STRM2_OFFSET = 4608,
-	parameter STRM3_OFFSET = 6912,
-	parameter STRM4_OFFSET = 9216,
-	parameter STRM5_OFFSET = 11520
-)
-
-(
-    input aclk,
-    input aresetn,
-
-    //optional configuration interface compatible with ap_memory
-	input [31:0] config_address,
-	input config_ce,
-	input config_we,
-	input [31:0] config_d0,
-	output [31:0] config_q0,
-       
-    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
-    input m_axis_0_afull,
-    input m_axis_0_tready,
-    output m_axis_0_tvalid,
-    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
-    
-    input m_axis_1_afull,
-    input m_axis_1_tready,
-    output m_axis_1_tvalid,
-    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
-    
-    input m_axis_2_afull,
-    input m_axis_2_tready,
-    output m_axis_2_tvalid,
-    output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
-    
-    input m_axis_3_afull,
-    input m_axis_3_tready,
-    output m_axis_3_tvalid,
-    output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
-    
-    input m_axis_4_afull,
-    input m_axis_4_tready,
-    output m_axis_4_tvalid,
-    output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
-    
-    input m_axis_5_afull,
-    input m_axis_5_tready,
-    output m_axis_5_tvalid,
-    output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
-    
-
-);
-
-//calculate number of RAMB18 blocks we need depth-wise
-localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
-
-//calculate width of address for each block
-localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
-
-//determine whether a stream needs to multiplex between memory blocks
-localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
-localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
-localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
-localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
-localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
-localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
-
-//determine what the base block of each stream is
-localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
-localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
-localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
-localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
-localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
-localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
-
-//determine what the end block of each stream is
-localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
-localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
-localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
-localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
-localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
-localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
-
-//determine the number of blocks spanned by each stream
-localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
-localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
-localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
-localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
-localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
-localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
-
-//TODO: check that memory width is equal to the widest stream
-//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module memstream
+#(
+//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
+    parameter CONFIG_EN = 1,
+    parameter NSTREAMS = 6,//1 up to 6
+
+    parameter MEM_DEPTH = 13824,
+    parameter MEM_WIDTH = 32,
+    parameter MEM_INIT = "./",
+    parameter RAM_STYLE = "auto",
+
+    //widths per stream
+	parameter STRM0_WIDTH = 32,
+	parameter STRM1_WIDTH = 32,
+	parameter STRM2_WIDTH = 32,
+	parameter STRM3_WIDTH = 32,
+	parameter STRM4_WIDTH = 32,
+	parameter STRM5_WIDTH = 32,
+
+	//depths per stream
+	parameter STRM0_DEPTH = 2304,
+	parameter STRM1_DEPTH = 2304,
+	parameter STRM2_DEPTH = 2304,
+	parameter STRM3_DEPTH = 2304,
+	parameter STRM4_DEPTH = 2304,
+	parameter STRM5_DEPTH = 2304,
+
+	//offsets for each stream
+	parameter STRM0_OFFSET = 0,
+	parameter STRM1_OFFSET = 2304,
+	parameter STRM2_OFFSET = 4608,
+	parameter STRM3_OFFSET = 6912,
+	parameter STRM4_OFFSET = 9216,
+	parameter STRM5_OFFSET = 11520
+)
+
+(
+    input aclk,
+    input aresetn,
+
+    //optional configuration interface compatible with ap_memory
+	input [31:0] config_address,
+	input config_ce,
+	input config_we,
+	input [31:0] config_d0,
+	output [31:0] config_q0,
+
+    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
+    input m_axis_0_afull,
+    input m_axis_0_tready,
+    output m_axis_0_tvalid,
+    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
+
+    input m_axis_1_afull,
+    input m_axis_1_tready,
+    output m_axis_1_tvalid,
+    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
+
+    input m_axis_2_afull,
+    input m_axis_2_tready,
+    output m_axis_2_tvalid,
+    output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
+
+    input m_axis_3_afull,
+    input m_axis_3_tready,
+    output m_axis_3_tvalid,
+    output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
+
+    input m_axis_4_afull,
+    input m_axis_4_tready,
+    output m_axis_4_tvalid,
+    output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
+
+    input m_axis_5_afull,
+    input m_axis_5_tready,
+    output m_axis_5_tvalid,
+    output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
+
+
+);
+
+//calculate number of RAMB18 blocks we need depth-wise
+localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
+
+//calculate width of address for each block
+localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
+
+//determine whether a stream needs to multiplex between memory blocks
+localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
+localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
+localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
+localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
+localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
+localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
+
+//determine what the base block of each stream is
+localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
+localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
+localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
+localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
+localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
+localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
+
+//determine what the end block of each stream is
+localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
+localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
+localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
+localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
+localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
+localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
+
+//determine the number of blocks spanned by each stream
+localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
+localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
+localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
+localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
+localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
+localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
+
+//TODO: check that memory width is equal to the widest stream
+//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
 initial begin
     if((NSTREAMS < 1) | (NSTREAMS > 6)) begin
         $display("Invalid setting for NSTREAMS, please set in range [1,6]");
         $finish();
     end
 end
-
-//invert reset
-wire rst;
-assign rst = ~aresetn;
-
-//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
-//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth
-
-reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
-
-reg strm0_incr_en;
-reg strm1_incr_en;
-reg strm2_incr_en;
-reg strm3_incr_en;
-reg strm4_incr_en;
-reg strm5_incr_en;
-
-wire strm0_rst;
-wire strm1_rst;
-wire strm2_rst;
-wire strm3_rst;
-wire strm4_rst;
-wire strm5_rst;
-
-reg strm0_ready;
-reg strm1_ready;
-reg strm2_ready;
-reg strm3_ready;
-reg strm4_ready;
-reg strm5_ready;
-
-//arbiter: work on one stream at a time
-//multiplex each port between (up to) half of the streams 
-reg [1:0] current_stream_porta = 0;
-reg [1:0] current_stream_portb = 0;
-
-always @(posedge aclk) begin
-    if(rst)
-        current_stream_porta <= 0;
-    else case(current_stream_porta)
-        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
-        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
-        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
-    endcase
-    if(rst)
-        current_stream_portb <= 0;
-    else case(current_stream_portb)
-        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
-        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
-        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
-    endcase
-end
-
-always @(posedge aclk) begin
-    if(rst) begin
-        strm0_incr_en <= 0;
-        strm1_incr_en <= 0;
-        strm2_incr_en <= 0;
-        strm3_incr_en <= 0;
-        strm4_incr_en <= 0;
-        strm5_incr_en <= 0;
-    end else begin
-        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
-        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
-        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
-        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
-        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
-        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
-    end
-end
-
-assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
-assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
-assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
-assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
-assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
-assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
-
-always @(posedge aclk) begin
-    strm0_ready <= ~m_axis_0_afull;
-    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
-    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
-    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
-    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
-    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
-end
-
-//one address counter per stream; more LUTs but keeps routing short and local
-always @(posedge aclk) begin
-    if(strm0_rst | rst)
-        strm0_addr <= STRM0_OFFSET;
-    else if(strm0_incr_en)
-        strm0_addr <= strm0_addr + 1;
-    if(strm1_rst | rst)
-        strm1_addr <= STRM1_OFFSET;
-    else if(strm1_incr_en)
-        strm1_addr <= strm1_addr + 1;
-    if(strm2_rst | rst)
-        strm2_addr <= STRM2_OFFSET;
-    else if(strm2_incr_en)
-        strm2_addr <= strm2_addr + 1;
-    if(strm3_rst | rst)
-        strm3_addr <= STRM3_OFFSET;
-    else if(strm3_incr_en)
-        strm3_addr <= strm3_addr + 1;
-    if(strm4_rst | rst)
-        strm4_addr <= STRM4_OFFSET;
-    else if(strm4_incr_en)
-        strm4_addr <= strm4_addr + 1;
-    if(strm5_rst | rst)
-        strm5_addr <= STRM5_OFFSET;
-    else if(strm5_incr_en)
-        strm5_addr <= strm5_addr + 1;
-end
-
-reg [$clog2(MEM_DEPTH)-1:0] addra;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
-
-reg [$clog2(MEM_DEPTH)-1:0] addrb;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
-
-wire [NMEMBLOCKS-1:0] we;
-
-reg [1:0] addr_select_porta;
-reg [1:0] addr_select_portb;
-
-//multiplex addresses of various streams into address ports of memory
-always @(posedge aclk) begin
-    addr_select_porta <= current_stream_porta;
-    case(addr_select_porta)
-        0: addra <= strm0_addr;
-        1: addra <= strm2_addr;
-        2: addra <= strm4_addr;
-    endcase
-    addr_select_portb <= current_stream_portb;
-    case(addr_select_portb)
-        0: addrb <= strm1_addr;
-        1: addrb <= strm3_addr;
-        2: addrb <= strm5_addr;
-    endcase
-end
-
-genvar g;
-generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
-
-assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
-
-ramb18_wf_dualport
-#(
-    .ID(g),
-	.DWIDTH(MEM_WIDTH),
-	.AWIDTH(BLOCKADRWIDTH),
-	.MEM_INIT(MEM_INIT)
-)
-ram
-(
-	.clk(aclk),
-	
-	.wea(we[g]),
-	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
-	.wdataa(config_d0),
-	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
-
-	.web(1'b0),
-	.addrb(addrb[BLOCKADRWIDTH-1:0]),
-	.wdatab('d0),
-	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
-);
-
-end
-endgenerate
-
-integer i;
-
-generate if(NMEMBLOCKS > 1) begin: multiblock
-
-wire [MEM_WIDTH-1:0] rdqmux[5:0];
-
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
-
-always @(posedge aclk) begin
-    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    for(i=0; i<2; i=i+1) begin
-		rdblocka[i+1] <= rdblocka[i];
-		rdblockb[i+1] <= rdblockb[i];
-    end
-end
-
-if(NSTREAMS >= 1) begin: en_strm0
-	if(STRM0_MUX == 1) begin: mux0
-		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
-	end else begin: nomux0
-		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 2) begin: en_strm1
-	if(STRM1_MUX == 1) begin: mux1
-		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
-	end else begin: nomux1
-		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 3) begin: en_strm2
-	if(STRM2_MUX == 1) begin: mux2
-		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
-	end else begin: nomux2
-		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 4) begin: en_strm3
-	if(STRM3_MUX == 1) begin: mux3
-		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
-	end else begin: nomux3
-		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 5) begin: en_strm4
-	if(STRM4_MUX == 1) begin: mux4
-		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
-	end else begin: nomux4
-		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 6) begin: en_strm5
-	if(STRM5_MUX == 1) begin: mux5
-		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
-	end else begin: nomux5
-		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
-end
-
-end else begin: singleblock
-
-if(NSTREAMS >= 1) begin: en_strm0_direct
-    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
-end
-if(NSTREAMS >= 2) begin: en_strm1_direct
-	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
-end
-if(NSTREAMS >= 3) begin: en_strm2_direct
-	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
-end
-if(NSTREAMS >= 4) begin: en_strm3_direct
-	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
-end
-if(NSTREAMS >= 5) begin: en_strm4_direct
-	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
-end
-if(NSTREAMS >= 6) begin: en_strm5_direct
-	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
-end
-
-end
-endgenerate
-
-//output to AXI Streams
-reg tvalid_pipe0[2:0];
-reg tvalid_pipe1[2:0];
-reg tvalid_pipe2[2:0];
-reg tvalid_pipe3[2:0];
-reg tvalid_pipe4[2:0];
-reg tvalid_pipe5[2:0];
-
-assign m_axis_0_tvalid = tvalid_pipe0[2];
-assign m_axis_1_tvalid = tvalid_pipe1[2];
-assign m_axis_2_tvalid = tvalid_pipe2[2];
-assign m_axis_3_tvalid = tvalid_pipe3[2];
-assign m_axis_4_tvalid = tvalid_pipe4[2];
-assign m_axis_5_tvalid = tvalid_pipe5[2];
-
-
-always @(posedge aclk) begin
-    tvalid_pipe0[0] <= strm0_incr_en;
-    tvalid_pipe1[0] <= strm1_incr_en;
-    tvalid_pipe2[0] <= strm2_incr_en;
-    tvalid_pipe3[0] <= strm3_incr_en;
-    tvalid_pipe4[0] <= strm4_incr_en;
-    tvalid_pipe5[0] <= strm5_incr_en;
-    for(i=0; i<2; i=i+1) begin: srl
-        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
-        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
-        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
-        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
-        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
-        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
-    end
-end
-
-assign config_q0 = 0;
-
-endmodule
\ No newline at end of file
+
+//invert reset
+wire rst;
+assign rst = ~aresetn;
+
+//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
+//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth
+
+reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
+
+reg strm0_incr_en;
+reg strm1_incr_en;
+reg strm2_incr_en;
+reg strm3_incr_en;
+reg strm4_incr_en;
+reg strm5_incr_en;
+
+wire strm0_rst;
+wire strm1_rst;
+wire strm2_rst;
+wire strm3_rst;
+wire strm4_rst;
+wire strm5_rst;
+
+reg strm0_ready;
+reg strm1_ready;
+reg strm2_ready;
+reg strm3_ready;
+reg strm4_ready;
+reg strm5_ready;
+
+//arbiter: work on one stream at a time
+//multiplex each port between (up to) half of the streams
+reg [1:0] current_stream_porta = 0;
+reg [1:0] current_stream_portb = 0;
+
+always @(posedge aclk) begin
+    if(rst)
+        current_stream_porta <= 0;
+    else case(current_stream_porta)
+        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
+        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
+        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
+    endcase
+    if(rst)
+        current_stream_portb <= 0;
+    else case(current_stream_portb)
+        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
+        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
+        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
+    endcase
+end
+
+always @(posedge aclk) begin
+    if(rst) begin
+        strm0_incr_en <= 0;
+        strm1_incr_en <= 0;
+        strm2_incr_en <= 0;
+        strm3_incr_en <= 0;
+        strm4_incr_en <= 0;
+        strm5_incr_en <= 0;
+    end else begin
+        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
+        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
+        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
+        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
+        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
+        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
+    end
+end
+
+assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
+assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
+assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
+assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
+assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
+assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
+
+always @(posedge aclk) begin
+    strm0_ready <= ~m_axis_0_afull;
+    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
+    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
+    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
+    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
+    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
+end
+
+//one address counter per stream; more LUTs but keeps routing short and local
+always @(posedge aclk) begin
+    if(strm0_rst | rst)
+        strm0_addr <= STRM0_OFFSET;
+    else if(strm0_incr_en)
+        strm0_addr <= strm0_addr + 1;
+    if(strm1_rst | rst)
+        strm1_addr <= STRM1_OFFSET;
+    else if(strm1_incr_en)
+        strm1_addr <= strm1_addr + 1;
+    if(strm2_rst | rst)
+        strm2_addr <= STRM2_OFFSET;
+    else if(strm2_incr_en)
+        strm2_addr <= strm2_addr + 1;
+    if(strm3_rst | rst)
+        strm3_addr <= STRM3_OFFSET;
+    else if(strm3_incr_en)
+        strm3_addr <= strm3_addr + 1;
+    if(strm4_rst | rst)
+        strm4_addr <= STRM4_OFFSET;
+    else if(strm4_incr_en)
+        strm4_addr <= strm4_addr + 1;
+    if(strm5_rst | rst)
+        strm5_addr <= STRM5_OFFSET;
+    else if(strm5_incr_en)
+        strm5_addr <= strm5_addr + 1;
+end
+
+reg [$clog2(MEM_DEPTH)-1:0] addra;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
+
+reg [$clog2(MEM_DEPTH)-1:0] addrb;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
+
+wire [NMEMBLOCKS-1:0] we;
+
+reg [1:0] addr_select_porta;
+reg [1:0] addr_select_portb;
+
+//multiplex addresses of various streams into address ports of memory
+always @(posedge aclk) begin
+    addr_select_porta <= current_stream_porta;
+    case(addr_select_porta)
+        0: addra <= strm0_addr;
+        1: addra <= strm2_addr;
+        2: addra <= strm4_addr;
+    endcase
+    addr_select_portb <= current_stream_portb;
+    case(addr_select_portb)
+        0: addrb <= strm1_addr;
+        1: addrb <= strm3_addr;
+        2: addrb <= strm5_addr;
+    endcase
+end
+
+genvar g;
+generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
+
+assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
+
+ramb18_wf_dualport
+#(
+    .ID(g),
+	.DWIDTH(MEM_WIDTH),
+	.AWIDTH(BLOCKADRWIDTH),
+	.MEM_INIT(MEM_INIT),
+  .RAM_STYLE(RAM_STYLE)
+)
+ram
+(
+	.clk(aclk),
+
+	.wea(we[g]),
+	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
+	.wdataa(config_d0),
+	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
+
+	.web(1'b0),
+	.addrb(addrb[BLOCKADRWIDTH-1:0]),
+	.wdatab('d0),
+	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
+);
+
+end
+endgenerate
+
+integer i;
+
+generate if(NMEMBLOCKS > 1) begin: multiblock
+
+wire [MEM_WIDTH-1:0] rdqmux[5:0];
+
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
+
+always @(posedge aclk) begin
+    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    for(i=0; i<2; i=i+1) begin
+		rdblocka[i+1] <= rdblocka[i];
+		rdblockb[i+1] <= rdblockb[i];
+    end
+end
+
+if(NSTREAMS >= 1) begin: en_strm0
+	if(STRM0_MUX == 1) begin: mux0
+		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
+	end else begin: nomux0
+		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 2) begin: en_strm1
+	if(STRM1_MUX == 1) begin: mux1
+		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
+	end else begin: nomux1
+		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 3) begin: en_strm2
+	if(STRM2_MUX == 1) begin: mux2
+		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
+	end else begin: nomux2
+		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 4) begin: en_strm3
+	if(STRM3_MUX == 1) begin: mux3
+		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
+	end else begin: nomux3
+		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 5) begin: en_strm4
+	if(STRM4_MUX == 1) begin: mux4
+		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
+	end else begin: nomux4
+		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 6) begin: en_strm5
+	if(STRM5_MUX == 1) begin: mux5
+		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
+	end else begin: nomux5
+		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
+end
+
+end else begin: singleblock
+
+if(NSTREAMS >= 1) begin: en_strm0_direct
+    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
+end
+if(NSTREAMS >= 2) begin: en_strm1_direct
+	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
+end
+if(NSTREAMS >= 3) begin: en_strm2_direct
+	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
+end
+if(NSTREAMS >= 4) begin: en_strm3_direct
+	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
+end
+if(NSTREAMS >= 5) begin: en_strm4_direct
+	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
+end
+if(NSTREAMS >= 6) begin: en_strm5_direct
+	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
+end
+
+end
+endgenerate
+
+//output to AXI Streams
+reg tvalid_pipe0[2:0];
+reg tvalid_pipe1[2:0];
+reg tvalid_pipe2[2:0];
+reg tvalid_pipe3[2:0];
+reg tvalid_pipe4[2:0];
+reg tvalid_pipe5[2:0];
+
+assign m_axis_0_tvalid = tvalid_pipe0[2];
+assign m_axis_1_tvalid = tvalid_pipe1[2];
+assign m_axis_2_tvalid = tvalid_pipe2[2];
+assign m_axis_3_tvalid = tvalid_pipe3[2];
+assign m_axis_4_tvalid = tvalid_pipe4[2];
+assign m_axis_5_tvalid = tvalid_pipe5[2];
+
+
+always @(posedge aclk) begin
+    tvalid_pipe0[0] <= strm0_incr_en;
+    tvalid_pipe1[0] <= strm1_incr_en;
+    tvalid_pipe2[0] <= strm2_incr_en;
+    tvalid_pipe3[0] <= strm3_incr_en;
+    tvalid_pipe4[0] <= strm4_incr_en;
+    tvalid_pipe5[0] <= strm5_incr_en;
+    for(i=0; i<2; i=i+1) begin: srl
+        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
+        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
+        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
+        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
+        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
+        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
+    end
+end
+
+assign config_q0 = 0;
+
+endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
index e107c400565f9f500e7902117a1eb2223b7dbc4c..4219d0f1c74bddff690b0d0cb21ce6a448c01c97 100644
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
@@ -33,7 +33,8 @@ module ramb18_wf_dualport
     parameter ID = 0,
 	parameter DWIDTH = 18,
 	parameter AWIDTH = 10,
-	parameter MEM_INIT = ""
+	parameter MEM_INIT = "",
+  parameter RAM_STYLE = "auto"
 )
 (
 	input clk,
@@ -49,7 +50,7 @@ module ramb18_wf_dualport
 	output reg [DWIDTH-1:0] rdqb
 );
 
-(* ram_style = "block" *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
+(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
 reg [DWIDTH-1:0] rdataa;
 reg [DWIDTH-1:0] rdatab;
 
diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb
index 2f8e0207fb5856182ca77f6df03f3cef572eeaab..584190258e5f3879be49de21231f9940462e24ed 100644
--- a/notebooks/end2end_example/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/tfc_end2end_example.ipynb
@@ -743,10 +743,9 @@
        " 'outputDataType': ('s', True, ''),\n",
        " 'binaryXnorMode': ('i', False, 0),\n",
        " 'noActivation': ('i', False, 0),\n",
-       " 'inFIFODepth': ('i', False, 0),\n",
-       " 'outFIFODepth': ('i', False, 0),\n",
        " 'numInputVectors': ('ints', False, [1]),\n",
        " 'mem_mode': ('s', False, 'const'),\n",
+       " 'ram_style': ('s', False, 'auto'),\n",
        " 'backend': ('s', True, 'fpgadataflow'),\n",
        " 'code_gen_dir_npysim': ('s', False, ''),\n",
        " 'code_gen_dir_ipgen': ('s', False, ''),\n",
@@ -759,7 +758,10 @@
        " 'rtlsim_trace': ('s', False, ''),\n",
        " 'res_estimate': ('s', False, ''),\n",
        " 'res_hls': ('s', False, ''),\n",
-       " 'res_synth': ('s', False, '')}"
+       " 'res_synth': ('s', False, ''),\n",
+       " 'rtlsim_so': ('s', False, ''),\n",
+       " 'inFIFODepth': ('i', False, 2),\n",
+       " 'outFIFODepth': ('i', False, 2)}"
       ]
      },
      "execution_count": 15,
@@ -800,30 +802,50 @@
     "fc0w.set_nodeattr(\"PE\", 16)\n",
     "fc0w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
+    "fc1w.set_nodeattr(\"inFIFODepth\", 4)\n",
     "fc1w.set_nodeattr(\"SIMD\", 16)\n",
     "fc1w.set_nodeattr(\"PE\", 16)\n",
     "fc1w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
+    "fc2w.set_nodeattr(\"inFIFODepth\", 4)\n",
     "fc2w.set_nodeattr(\"SIMD\", 16)\n",
     "fc2w.set_nodeattr(\"PE\", 16)\n",
     "fc2w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
+    "fc3w.set_nodeattr(\"inFIFODepth\", 4)\n",
     "fc3w.set_nodeattr(\"SIMD\", 16)\n",
     "fc3w.set_nodeattr(\"PE\", 10)\n",
-    "fc3w.set_nodeattr(\"outFIFODepth\", 50)"
+    "fc3w.set_nodeattr(\"outFIFODepth\", 50)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Finally, we will run the `InsertTLastMarker` transformation to get a `TLastMarker` node at the output of this graph, which is necessary to run the DMA engines correctly. "
+    "After setting the FIFO node attributes, we can insert FIFO nodes inbetween the fpgadataflow nodes and in the beginning and end of the graph. This can be done using the transformation `InsertFIFO`."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO\n",
+    "model = model.transform(InsertFIFO())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we will run the `InsertTLastMarker` transformation to get a `TLastMarker` node at the output of this graph, which is necessary to run the DMA engines correctly. Using netron we can observe that now the nodes contain the set folding, inbetween the nodes are FIFOs inserted and the last node is the `TLastMarker` node we insert in the following."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -1033,10 +1055,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "hls_syn_StreamingFCLayer_Batch_0.tcl  thresh.h\r\n",
-      "ipgen.sh\t\t\t      top_StreamingFCLayer_Batch_0.cpp\r\n",
-      "params.h\t\t\t      vivado_hls.log\r\n",
-      "project_StreamingFCLayer_Batch_0\r\n"
+      "project_StreamingFIFO_0\r\n"
      ]
     }
    ],
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index c90ccefc1581416e33b006d93e0fa0c70e7bf762..38c2ed6638bef5cd709a7da83218145de91dce0a 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -31,7 +31,7 @@ import numpy as np
 import os
 import subprocess
 from finn.custom_op import CustomOp
-from finn.util.basic import CppBuilder
+from finn.util.basic import CppBuilder, make_build_dir
 from finn.util.fpgadataflow import (
     IPGenBuilder,
     pyverilate_get_liveness_threshold_cycles,
@@ -82,6 +82,9 @@ class HLSCustomOp(CustomOp):
             "res_hls": ("s", False, ""),
             "res_synth": ("s", False, ""),
             "rtlsim_so": ("s", False, ""),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 2),
+            "outFIFODepth": ("i", False, 2),
         }
 
     def get_verilog_top_module_name(self):
@@ -119,6 +122,7 @@ class HLSCustomOp(CustomOp):
         # build the Verilator emu library
         sim = PyVerilator.build(
             verilog_file,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
             verilog_path=[
                 "{}/project_{}/sol1/impl/verilog/".format(
                     code_gen_dir, self.onnx_node.name
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 43f85bb56f44591ecefd79ffe82bc2bd88030780..f04ee7ca7830760f4ed2804b8b71f8fe5d29325f 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -80,9 +80,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "binaryXnorMode": ("i", False, 0),
             # no-activation mode (produce accumulators)
             "noActivation": ("i", False, 0),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -92,6 +89,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # const -- embedded weights, default, long compile/synth times
             # decoupled -- streaming weights
             "mem_mode": ("s", False, "const"),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": ("s", False, "auto"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -534,8 +537,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             """Saves weights into .dat file"""
             # convert weight values into hexstring
             weight_width = self.get_weightstream_width()
+            # pad to nearest 4 bits to get hex strings
+            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
             weight_tensor_unflipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_unflipped, export_wdt, weight_width, prefix=""
+                weight_tensor_unflipped, export_wdt, weight_width_padded, prefix=""
             )
             weight_stream_len = np.prod(weight_tensor_unflipped.shape)
             factor = math.ceil(weight_stream_len / 1024)
@@ -990,6 +995,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$MEM_DEPTH$"] = [
                 str(roundup_to_integer_multiple(self.calc_wmem(), 1024))
             ]
+            self.code_gen_dict["$RAM_STYLE$"] = [self.get_nodeattr("ram_style")]
 
             template = self.decoupled_wrapper
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 8fcb1fe43a3927a7d49b6e041727a54cc384942f..6e004c47b1e13d95efa356b6b8984688f54027cc 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -30,7 +30,6 @@ import numpy as np
 from shutil import copy
 import subprocess
 
-from pyverilator import PyVerilator
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
@@ -86,12 +85,23 @@ class StreamingFIFO(HLSCustomOp):
     def verify_node(self):
         pass
 
+    def get_verilog_top_module_name(self):
+        "Return the Verilog top module name for this node."
+
+        node = self.onnx_node
+        prefixed_top_name = "%s" % (node.name)
+        return prefixed_top_name
+
     def code_generation_ipgen(self, model, fpgapart, clk):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        # copy Q_srl.v from finn-rtllib to code gen directory
+        verilog_dir = "{}/project_{}/sol1/impl/verilog".format(
+            code_gen_dir, self.onnx_node.name
+        )
+        os.makedirs(verilog_dir)
+        # copy Q_srl.v from finn-rtllib to verilog directory
         memstream_dir = "/workspace/finn/finn-rtllib/memstream/hdl/"
         Q_file = os.path.join(memstream_dir, "Q_srl.v")
-        copy(Q_file, code_gen_dir)
+        copy(Q_file, verilog_dir)
 
         # empty code gen dictionary for new entries
         self.code_gen_dict.clear()
@@ -112,39 +122,42 @@ class StreamingFIFO(HLSCustomOp):
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(self.code_gen_dict[key])
             template = template.replace(key, code_gen_line)
-        f = open(os.path.join(code_gen_dir, "{}.v".format(self.onnx_node.name)), "w",)
+        f = open(os.path.join(verilog_dir, "{}.v".format(self.onnx_node.name,)), "w",)
         f.write(template)
         f.close()
         self.code_gen_dict.clear()
 
     def ipgen_singlenode_code(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_dir = "{}/project_{}/sol1/impl/verilog".format(
+            code_gen_dir, self.onnx_node.name
+        )
         # prepare the IP packaging tcl template
         template = templates.ip_package_tcl
         self.code_gen_dict.clear()
         self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
-        self.code_gen_dict["$VERILOG_DIR$"] = [code_gen_dir]
+        self.code_gen_dict["$VERILOG_DIR$"] = [verilog_dir]
         for key in self.code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(self.code_gen_dict[key])
             template = template.replace(key, code_gen_line)
-        f = open(os.path.join(code_gen_dir, "package_ip.tcl"), "w")
+        f = open(os.path.join(verilog_dir, "package_ip.tcl"), "w")
         f.write(template)
         f.close()
         # create a shell script and call Vivado to invoke the IP pkg script
-        make_project_sh = code_gen_dir + "/make_ip.sh"
+        make_project_sh = verilog_dir + "/make_ip.sh"
         working_dir = os.environ["PWD"]
         with open(make_project_sh, "w") as f:
             f.write("#!/bin/bash \n")
-            f.write("cd {}\n".format(code_gen_dir))
+            f.write("cd {}\n".format(verilog_dir))
             f.write("vivado -mode batch -source package_ip.tcl\n")
             f.write("cd {}\n".format(working_dir))
         bash_command = ["bash", make_project_sh]
         process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
         process_compile.communicate()
         # set ipgen_path and ip_path to point to the new packaged IP
-        self.set_nodeattr("ipgen_path", code_gen_dir)
-        self.set_nodeattr("ip_path", code_gen_dir)
+        self.set_nodeattr("ipgen_path", verilog_dir)
+        self.set_nodeattr("ip_path", verilog_dir)
         vlnv = "xilinx.com:hls:%s:1.0" % (self.onnx_node.name)
         self.set_nodeattr("ip_vlnv", vlnv)
         self.code_gen_dict.clear()
@@ -224,38 +237,30 @@ class StreamingFIFO(HLSCustomOp):
             np.save(
                 os.path.join(code_gen_dir, "input_0.npy"), reshaped_input,
             )
-            verilog_file = os.path.join(
-                code_gen_dir, "{}.v".format(self.onnx_node.name)
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = DataType[self.get_nodeattr("dataType")]
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
             )
-            if os.path.isfile(verilog_file):
-                nbits = self.get_instream_width(axi_strm_padding=True)
-                inp = npy_to_rtlsim_input(
-                    "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-                )
-                sim = PyVerilator.build(verilog_file, verilog_path=[code_gen_dir],)
-                super().reset_rtlsim(sim)
-                super().toggle_clk(sim)
-                output = self.rtlsim(sim, inp)
-                odt = DataType[self.get_nodeattr("dataType")]
-                target_bits = odt.bitwidth()
-                packed_bits = self.get_outstream_width(axi_strm_padding=True)
-                out_npy_path = "{}/output.npy".format(code_gen_dir)
-                out_shape = self.get_folded_output_shape()
-                rtlsim_output_to_npy(
-                    output, out_npy_path, odt, out_shape, packed_bits, target_bits
-                )
-
-                # load and reshape output
-                output = np.load(out_npy_path)
-                oshape = self.get_normal_output_shape()
-                output = np.asarray([output], dtype=np.float32).reshape(*oshape)
-                context[node.output[0]] = output
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
 
-            else:
-                raise Exception(
-                    """Found no verilog files for this node,
-                    did you run the codegen_ipgen transformation?"""
-                )
+        else:
+            raise Exception("Test")
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index ee0c0d487e6a666c2ff22e9c6f7f533ab00742a0..6313bb79c21231c4be5b242558da5ac40fb2aa78 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -188,6 +188,7 @@ memstream
 .MEM_DEPTH($MEM_DEPTH$),
 .MEM_WIDTH($WEIGHT_WIDTH$),
 .MEM_INIT("./"),
+.RAM_STYLE("$RAM_STYLE$"),
 
 //widths per stream
 .STRM0_WIDTH($WEIGHT_WIDTH$),
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index c7efb95c8df4fbe83c210f7a3f0832f3e2a3d18d..b80f2fbc1d9893d304f28c3494c44a69a1db052e 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -35,7 +35,6 @@ class InsertFIFO(Transformation):
 
     def apply(self, model):
         # default depth for FIFOs
-        default_depth = 2
         graph = model.graph
         node_ind = -1
         graph_modified = False
@@ -51,6 +50,18 @@ class InsertFIFO(Transformation):
                     fld_shape = n0.get_folded_output_shape()
                     dtype = n0.get_output_datatype()
 
+                    # check if outFIFOdepth attribute of first node
+                    # and inFIFOdepth attribute of consumer node is equal
+                    n0_depth = n0.get_nodeattr("outFIFODepth")
+                    n1 = getCustomOp(consumer)
+                    n1_depth = n1.get_nodeattr("inFIFODepth")
+                    if n0_depth == n1_depth:
+                        fifo_depth = n0_depth
+                    elif n0_depth != n1_depth:
+                        fifo_depth = max(n0_depth, n1_depth)
+                        n0.set_nodeattr("outFIFODepth", fifo_depth)
+                        n1.set_nodeattr("inFIFODepth", fifo_depth)
+
                     # create fifo node
                     fifo_output_tensor = oh.make_tensor_value_info(
                         model.make_new_valueinfo_name(),
@@ -65,7 +76,7 @@ class InsertFIFO(Transformation):
                         [fifo_output_tensor.name],
                         domain="finn",
                         backend="fpgadataflow",
-                        depth=default_depth,
+                        depth=fifo_depth,
                         folded_shape=fld_shape,
                         dataType=str(dtype.name),
                     )
@@ -84,6 +95,7 @@ class InsertFIFO(Transformation):
                 # determine fifo node attributes
                 fld_shape = n0.get_folded_input_shape()
                 dtype = n0.get_input_datatype()
+                fifo_depth = n0.get_nodeattr("inFIFODepth")
 
                 # create fifo node
                 fifo_output_tensor = oh.make_tensor_value_info(
@@ -99,7 +111,7 @@ class InsertFIFO(Transformation):
                     [fifo_output_tensor.name],
                     domain="finn",
                     backend="fpgadataflow",
-                    depth=default_depth,
+                    depth=fifo_depth,
                     folded_shape=fld_shape,
                     dataType=str(dtype.name),
                 )
@@ -109,7 +121,7 @@ class InsertFIFO(Transformation):
                 # set fifo output tensor as new input tensor of second node
                 n.input[0] = fifo_output_tensor.name
 
-            # insert FIFO as first node
+            # insert FIFO as last node
             if graph.node[-1].op_type != "StreamingFIFO":
                 n = graph.node[-1]
                 assert (
@@ -121,6 +133,7 @@ class InsertFIFO(Transformation):
                 # determine fifo node attributes
                 fld_shape = n0.get_folded_output_shape()
                 dtype = n0.get_output_datatype()
+                fifo_depth = n0.get_nodeattr("inFIFODepth")
 
                 # create fifo node
                 fifo_input_tensor = oh.make_tensor_value_info(
@@ -136,7 +149,7 @@ class InsertFIFO(Transformation):
                     [graph_out_name],
                     domain="finn",
                     backend="fpgadataflow",
-                    depth=default_depth,
+                    depth=fifo_depth,
                     folded_shape=fld_shape,
                     dataType=str(dtype.name),
                 )
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index 5b29ddbcd1dcc4f771dbc4eb633bf7c1ecb6b3aa..e84532d8d24909cc5add09fbc623a13c955ffb72 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -33,7 +33,7 @@ try:
     from pyverilator import PyVerilator
 except ModuleNotFoundError:
     PyVerilator = None
-from finn.util.basic import get_by_name
+from finn.util.basic import get_by_name, make_build_dir
 
 
 class IPGenBuilder:
@@ -85,7 +85,10 @@ def pyverilate_stitched_ip(model):
 
     all_verilog_dirs = list(map(file_to_dir, all_verilog_srcs))
     top_verilog = model.get_metadata_prop("wrapper_filename")
-    sim = PyVerilator.build(top_verilog, verilog_path=all_verilog_dirs)
+    build_dir = make_build_dir("pyverilator_ipstitched_")
+    sim = PyVerilator.build(
+        top_verilog, verilog_path=all_verilog_dirs, build_dir=build_dir
+    )
     return sim
 
 
diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
index de3ed973af1b13cc7368a738046bec35e0eb8669..d1b7ca0ff2ed640e01d2f8231aa4fd3ea3b6189f 100644
--- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
+++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
@@ -139,12 +139,15 @@ def test_end2end_tfc_w1a1_fold_and_tlastmarker():
     fc0w.set_nodeattr("SIMD", 196)
     fc0w.set_nodeattr("PE", 16)
     fc0w.set_nodeattr("outFIFODepth", 64)
+    fc1w.set_nodeattr("inFIFODepth", 64)
     fc1w.set_nodeattr("SIMD", 16)
     fc1w.set_nodeattr("PE", 16)
     fc1w.set_nodeattr("outFIFODepth", 64)
+    fc2w.set_nodeattr("inFIFODepth", 64)
     fc2w.set_nodeattr("SIMD", 16)
     fc2w.set_nodeattr("PE", 16)
     fc2w.set_nodeattr("outFIFODepth", 64)
+    fc3w.set_nodeattr("inFIFODepth", 64)
     fc3w.set_nodeattr("SIMD", 16)
     fc3w.set_nodeattr("PE", 10)
     fc3w.set_nodeattr("outFIFODepth", 10)