From 8fc4ca476ac51d7603e8b1b65798b77655d3a84a Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Tue, 28 Apr 2020 21:47:55 +0100
Subject: [PATCH] [StreamingFC] enable decoupled_mem_mode attribute setting

let the user manage which layer uses what kind of mem resource
defaults to auto
---
 finn-rtllib/memstream/hdl/memstream.v         | 920 +++++++++---------
 .../memstream/hdl/ramb18_wf_dualport.v        |   5 +-
 .../fpgadataflow/streamingfclayer_batch.py    |   9 +
 src/finn/custom_op/fpgadataflow/templates.py  |   1 +
 4 files changed, 474 insertions(+), 461 deletions(-)

diff --git a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v
index a1c8e066d..28acb301a 100644
--- a/finn-rtllib/memstream/hdl/memstream.v
+++ b/finn-rtllib/memstream/hdl/memstream.v
@@ -1,465 +1,467 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module memstream
-#(
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
-    parameter CONFIG_EN = 1,
-    parameter NSTREAMS = 6,//1 up to 6
-
-    parameter MEM_DEPTH = 13824,
-    parameter MEM_WIDTH = 32,
-    parameter MEM_INIT = "./",
-    
-    //widths per stream
-	parameter STRM0_WIDTH = 32,
-	parameter STRM1_WIDTH = 32,
-	parameter STRM2_WIDTH = 32,
-	parameter STRM3_WIDTH = 32,
-	parameter STRM4_WIDTH = 32,
-	parameter STRM5_WIDTH = 32,
-
-	//depths per stream
-	parameter STRM0_DEPTH = 2304,
-	parameter STRM1_DEPTH = 2304,
-	parameter STRM2_DEPTH = 2304,
-	parameter STRM3_DEPTH = 2304,
-	parameter STRM4_DEPTH = 2304,
-	parameter STRM5_DEPTH = 2304,
-
-	//offsets for each stream
-	parameter STRM0_OFFSET = 0,
-	parameter STRM1_OFFSET = 2304,
-	parameter STRM2_OFFSET = 4608,
-	parameter STRM3_OFFSET = 6912,
-	parameter STRM4_OFFSET = 9216,
-	parameter STRM5_OFFSET = 11520
-)
-
-(
-    input aclk,
-    input aresetn,
-
-    //optional configuration interface compatible with ap_memory
-	input [31:0] config_address,
-	input config_ce,
-	input config_we,
-	input [31:0] config_d0,
-	output [31:0] config_q0,
-       
-    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
-    input m_axis_0_afull,
-    input m_axis_0_tready,
-    output m_axis_0_tvalid,
-    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
-    
-    input m_axis_1_afull,
-    input m_axis_1_tready,
-    output m_axis_1_tvalid,
-    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
-    
-    input m_axis_2_afull,
-    input m_axis_2_tready,
-    output m_axis_2_tvalid,
-    output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
-    
-    input m_axis_3_afull,
-    input m_axis_3_tready,
-    output m_axis_3_tvalid,
-    output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
-    
-    input m_axis_4_afull,
-    input m_axis_4_tready,
-    output m_axis_4_tvalid,
-    output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
-    
-    input m_axis_5_afull,
-    input m_axis_5_tready,
-    output m_axis_5_tvalid,
-    output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
-    
-
-);
-
-//calculate number of RAMB18 blocks we need depth-wise
-localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
-
-//calculate width of address for each block
-localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
-
-//determine whether a stream needs to multiplex between memory blocks
-localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
-localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
-localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
-localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
-localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
-localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
-
-//determine what the base block of each stream is
-localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
-localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
-localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
-localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
-localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
-localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
-
-//determine what the end block of each stream is
-localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
-localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
-localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
-localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
-localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
-localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
-
-//determine the number of blocks spanned by each stream
-localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
-localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
-localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
-localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
-localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
-localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
-
-//TODO: check that memory width is equal to the widest stream
-//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module memstream
+#(
+//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
+    parameter CONFIG_EN = 1,
+    parameter NSTREAMS = 6,//1 up to 6
+
+    parameter MEM_DEPTH = 13824,
+    parameter MEM_WIDTH = 32,
+    parameter MEM_INIT = "./",
+    parameter RAM_STYLE = "auto",
+
+    //widths per stream
+	parameter STRM0_WIDTH = 32,
+	parameter STRM1_WIDTH = 32,
+	parameter STRM2_WIDTH = 32,
+	parameter STRM3_WIDTH = 32,
+	parameter STRM4_WIDTH = 32,
+	parameter STRM5_WIDTH = 32,
+
+	//depths per stream
+	parameter STRM0_DEPTH = 2304,
+	parameter STRM1_DEPTH = 2304,
+	parameter STRM2_DEPTH = 2304,
+	parameter STRM3_DEPTH = 2304,
+	parameter STRM4_DEPTH = 2304,
+	parameter STRM5_DEPTH = 2304,
+
+	//offsets for each stream
+	parameter STRM0_OFFSET = 0,
+	parameter STRM1_OFFSET = 2304,
+	parameter STRM2_OFFSET = 4608,
+	parameter STRM3_OFFSET = 6912,
+	parameter STRM4_OFFSET = 9216,
+	parameter STRM5_OFFSET = 11520
+)
+
+(
+    input aclk,
+    input aresetn,
+
+    //optional configuration interface compatible with ap_memory
+	input [31:0] config_address,
+	input config_ce,
+	input config_we,
+	input [31:0] config_d0,
+	output [31:0] config_q0,
+
+    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
+    input m_axis_0_afull,
+    input m_axis_0_tready,
+    output m_axis_0_tvalid,
+    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
+
+    input m_axis_1_afull,
+    input m_axis_1_tready,
+    output m_axis_1_tvalid,
+    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
+
+    input m_axis_2_afull,
+    input m_axis_2_tready,
+    output m_axis_2_tvalid,
+    output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
+
+    input m_axis_3_afull,
+    input m_axis_3_tready,
+    output m_axis_3_tvalid,
+    output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
+
+    input m_axis_4_afull,
+    input m_axis_4_tready,
+    output m_axis_4_tvalid,
+    output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
+
+    input m_axis_5_afull,
+    input m_axis_5_tready,
+    output m_axis_5_tvalid,
+    output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
+
+
+);
+
+//calculate number of RAMB18 blocks we need depth-wise
+localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
+
+//calculate width of address for each block
+localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
+
+//determine whether a stream needs to multiplex between memory blocks
+localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
+localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
+localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
+localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
+localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
+localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
+
+//determine what the base block of each stream is
+localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
+localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
+localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
+localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
+localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
+localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
+
+//determine what the end block of each stream is
+localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
+localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
+localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
+localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
+localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
+localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
+
+//determine the number of blocks spanned by each stream
+localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
+localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
+localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
+localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
+localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
+localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
+
+//TODO: check that memory width is equal to the widest stream
+//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
 initial begin
     if((NSTREAMS < 1) | (NSTREAMS > 6)) begin
         $display("Invalid setting for NSTREAMS, please set in range [1,6]");
         $finish();
     end
 end
-
-//invert reset
-wire rst;
-assign rst = ~aresetn;
-
-//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
-//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth
-
-reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
-
-reg strm0_incr_en;
-reg strm1_incr_en;
-reg strm2_incr_en;
-reg strm3_incr_en;
-reg strm4_incr_en;
-reg strm5_incr_en;
-
-wire strm0_rst;
-wire strm1_rst;
-wire strm2_rst;
-wire strm3_rst;
-wire strm4_rst;
-wire strm5_rst;
-
-reg strm0_ready;
-reg strm1_ready;
-reg strm2_ready;
-reg strm3_ready;
-reg strm4_ready;
-reg strm5_ready;
-
-//arbiter: work on one stream at a time
-//multiplex each port between (up to) half of the streams 
-reg [1:0] current_stream_porta = 0;
-reg [1:0] current_stream_portb = 0;
-
-always @(posedge aclk) begin
-    if(rst)
-        current_stream_porta <= 0;
-    else case(current_stream_porta)
-        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
-        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
-        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
-    endcase
-    if(rst)
-        current_stream_portb <= 0;
-    else case(current_stream_portb)
-        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
-        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
-        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
-    endcase
-end
-
-always @(posedge aclk) begin
-    if(rst) begin
-        strm0_incr_en <= 0;
-        strm1_incr_en <= 0;
-        strm2_incr_en <= 0;
-        strm3_incr_en <= 0;
-        strm4_incr_en <= 0;
-        strm5_incr_en <= 0;
-    end else begin
-        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
-        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
-        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
-        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
-        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
-        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
-    end
-end
-
-assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
-assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
-assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
-assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
-assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
-assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
-
-always @(posedge aclk) begin
-    strm0_ready <= ~m_axis_0_afull;
-    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
-    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
-    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
-    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
-    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
-end
-
-//one address counter per stream; more LUTs but keeps routing short and local
-always @(posedge aclk) begin
-    if(strm0_rst | rst)
-        strm0_addr <= STRM0_OFFSET;
-    else if(strm0_incr_en)
-        strm0_addr <= strm0_addr + 1;
-    if(strm1_rst | rst)
-        strm1_addr <= STRM1_OFFSET;
-    else if(strm1_incr_en)
-        strm1_addr <= strm1_addr + 1;
-    if(strm2_rst | rst)
-        strm2_addr <= STRM2_OFFSET;
-    else if(strm2_incr_en)
-        strm2_addr <= strm2_addr + 1;
-    if(strm3_rst | rst)
-        strm3_addr <= STRM3_OFFSET;
-    else if(strm3_incr_en)
-        strm3_addr <= strm3_addr + 1;
-    if(strm4_rst | rst)
-        strm4_addr <= STRM4_OFFSET;
-    else if(strm4_incr_en)
-        strm4_addr <= strm4_addr + 1;
-    if(strm5_rst | rst)
-        strm5_addr <= STRM5_OFFSET;
-    else if(strm5_incr_en)
-        strm5_addr <= strm5_addr + 1;
-end
-
-reg [$clog2(MEM_DEPTH)-1:0] addra;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
-
-reg [$clog2(MEM_DEPTH)-1:0] addrb;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
-
-wire [NMEMBLOCKS-1:0] we;
-
-reg [1:0] addr_select_porta;
-reg [1:0] addr_select_portb;
-
-//multiplex addresses of various streams into address ports of memory
-always @(posedge aclk) begin
-    addr_select_porta <= current_stream_porta;
-    case(addr_select_porta)
-        0: addra <= strm0_addr;
-        1: addra <= strm2_addr;
-        2: addra <= strm4_addr;
-    endcase
-    addr_select_portb <= current_stream_portb;
-    case(addr_select_portb)
-        0: addrb <= strm1_addr;
-        1: addrb <= strm3_addr;
-        2: addrb <= strm5_addr;
-    endcase
-end
-
-genvar g;
-generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
-
-assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
-
-ramb18_wf_dualport
-#(
-    .ID(g),
-	.DWIDTH(MEM_WIDTH),
-	.AWIDTH(BLOCKADRWIDTH),
-	.MEM_INIT(MEM_INIT)
-)
-ram
-(
-	.clk(aclk),
-	
-	.wea(we[g]),
-	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
-	.wdataa(config_d0),
-	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
-
-	.web(1'b0),
-	.addrb(addrb[BLOCKADRWIDTH-1:0]),
-	.wdatab('d0),
-	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
-);
-
-end
-endgenerate
-
-integer i;
-
-generate if(NMEMBLOCKS > 1) begin: multiblock
-
-wire [MEM_WIDTH-1:0] rdqmux[5:0];
-
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
-
-always @(posedge aclk) begin
-    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    for(i=0; i<2; i=i+1) begin
-		rdblocka[i+1] <= rdblocka[i];
-		rdblockb[i+1] <= rdblockb[i];
-    end
-end
-
-if(NSTREAMS >= 1) begin: en_strm0
-	if(STRM0_MUX == 1) begin: mux0
-		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
-	end else begin: nomux0
-		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 2) begin: en_strm1
-	if(STRM1_MUX == 1) begin: mux1
-		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
-	end else begin: nomux1
-		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 3) begin: en_strm2
-	if(STRM2_MUX == 1) begin: mux2
-		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
-	end else begin: nomux2
-		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 4) begin: en_strm3
-	if(STRM3_MUX == 1) begin: mux3
-		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
-	end else begin: nomux3
-		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 5) begin: en_strm4
-	if(STRM4_MUX == 1) begin: mux4
-		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
-	end else begin: nomux4
-		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 6) begin: en_strm5
-	if(STRM5_MUX == 1) begin: mux5
-		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
-	end else begin: nomux5
-		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
-end
-
-end else begin: singleblock
-
-if(NSTREAMS >= 1) begin: en_strm0_direct
-    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
-end
-if(NSTREAMS >= 2) begin: en_strm1_direct
-	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
-end
-if(NSTREAMS >= 3) begin: en_strm2_direct
-	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
-end
-if(NSTREAMS >= 4) begin: en_strm3_direct
-	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
-end
-if(NSTREAMS >= 5) begin: en_strm4_direct
-	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
-end
-if(NSTREAMS >= 6) begin: en_strm5_direct
-	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
-end
-
-end
-endgenerate
-
-//output to AXI Streams
-reg tvalid_pipe0[2:0];
-reg tvalid_pipe1[2:0];
-reg tvalid_pipe2[2:0];
-reg tvalid_pipe3[2:0];
-reg tvalid_pipe4[2:0];
-reg tvalid_pipe5[2:0];
-
-assign m_axis_0_tvalid = tvalid_pipe0[2];
-assign m_axis_1_tvalid = tvalid_pipe1[2];
-assign m_axis_2_tvalid = tvalid_pipe2[2];
-assign m_axis_3_tvalid = tvalid_pipe3[2];
-assign m_axis_4_tvalid = tvalid_pipe4[2];
-assign m_axis_5_tvalid = tvalid_pipe5[2];
-
-
-always @(posedge aclk) begin
-    tvalid_pipe0[0] <= strm0_incr_en;
-    tvalid_pipe1[0] <= strm1_incr_en;
-    tvalid_pipe2[0] <= strm2_incr_en;
-    tvalid_pipe3[0] <= strm3_incr_en;
-    tvalid_pipe4[0] <= strm4_incr_en;
-    tvalid_pipe5[0] <= strm5_incr_en;
-    for(i=0; i<2; i=i+1) begin: srl
-        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
-        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
-        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
-        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
-        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
-        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
-    end
-end
-
-assign config_q0 = 0;
-
-endmodule
\ No newline at end of file
+
+//invert reset
+wire rst;
+assign rst = ~aresetn;
+
+//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
+//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth
+
+reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
+
+reg strm0_incr_en;
+reg strm1_incr_en;
+reg strm2_incr_en;
+reg strm3_incr_en;
+reg strm4_incr_en;
+reg strm5_incr_en;
+
+wire strm0_rst;
+wire strm1_rst;
+wire strm2_rst;
+wire strm3_rst;
+wire strm4_rst;
+wire strm5_rst;
+
+reg strm0_ready;
+reg strm1_ready;
+reg strm2_ready;
+reg strm3_ready;
+reg strm4_ready;
+reg strm5_ready;
+
+//arbiter: work on one stream at a time
+//multiplex each port between (up to) half of the streams
+reg [1:0] current_stream_porta = 0;
+reg [1:0] current_stream_portb = 0;
+
+always @(posedge aclk) begin
+    if(rst)
+        current_stream_porta <= 0;
+    else case(current_stream_porta)
+        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
+        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
+        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
+    endcase
+    if(rst)
+        current_stream_portb <= 0;
+    else case(current_stream_portb)
+        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
+        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
+        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
+    endcase
+end
+
+always @(posedge aclk) begin
+    if(rst) begin
+        strm0_incr_en <= 0;
+        strm1_incr_en <= 0;
+        strm2_incr_en <= 0;
+        strm3_incr_en <= 0;
+        strm4_incr_en <= 0;
+        strm5_incr_en <= 0;
+    end else begin
+        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
+        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
+        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
+        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
+        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
+        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
+    end
+end
+
+assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
+assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
+assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
+assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
+assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
+assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
+
+always @(posedge aclk) begin
+    strm0_ready <= ~m_axis_0_afull;
+    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
+    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
+    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
+    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
+    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
+end
+
+//one address counter per stream; more LUTs but keeps routing short and local
+always @(posedge aclk) begin
+    if(strm0_rst | rst)
+        strm0_addr <= STRM0_OFFSET;
+    else if(strm0_incr_en)
+        strm0_addr <= strm0_addr + 1;
+    if(strm1_rst | rst)
+        strm1_addr <= STRM1_OFFSET;
+    else if(strm1_incr_en)
+        strm1_addr <= strm1_addr + 1;
+    if(strm2_rst | rst)
+        strm2_addr <= STRM2_OFFSET;
+    else if(strm2_incr_en)
+        strm2_addr <= strm2_addr + 1;
+    if(strm3_rst | rst)
+        strm3_addr <= STRM3_OFFSET;
+    else if(strm3_incr_en)
+        strm3_addr <= strm3_addr + 1;
+    if(strm4_rst | rst)
+        strm4_addr <= STRM4_OFFSET;
+    else if(strm4_incr_en)
+        strm4_addr <= strm4_addr + 1;
+    if(strm5_rst | rst)
+        strm5_addr <= STRM5_OFFSET;
+    else if(strm5_incr_en)
+        strm5_addr <= strm5_addr + 1;
+end
+
+reg [$clog2(MEM_DEPTH)-1:0] addra;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
+
+reg [$clog2(MEM_DEPTH)-1:0] addrb;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
+
+wire [NMEMBLOCKS-1:0] we;
+
+reg [1:0] addr_select_porta;
+reg [1:0] addr_select_portb;
+
+//multiplex addresses of various streams into address ports of memory
+always @(posedge aclk) begin
+    addr_select_porta <= current_stream_porta;
+    case(addr_select_porta)
+        0: addra <= strm0_addr;
+        1: addra <= strm2_addr;
+        2: addra <= strm4_addr;
+    endcase
+    addr_select_portb <= current_stream_portb;
+    case(addr_select_portb)
+        0: addrb <= strm1_addr;
+        1: addrb <= strm3_addr;
+        2: addrb <= strm5_addr;
+    endcase
+end
+
+genvar g;
+generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
+
+assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
+
+ramb18_wf_dualport
+#(
+    .ID(g),
+	.DWIDTH(MEM_WIDTH),
+	.AWIDTH(BLOCKADRWIDTH),
+	.MEM_INIT(MEM_INIT),
+  .RAM_STYLE(RAM_STYLE)
+)
+ram
+(
+	.clk(aclk),
+
+	.wea(we[g]),
+	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
+	.wdataa(config_d0),
+	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
+
+	.web(1'b0),
+	.addrb(addrb[BLOCKADRWIDTH-1:0]),
+	.wdatab('d0),
+	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
+);
+
+end
+endgenerate
+
+integer i;
+
+generate if(NMEMBLOCKS > 1) begin: multiblock
+
+wire [MEM_WIDTH-1:0] rdqmux[5:0];
+
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
+
+always @(posedge aclk) begin
+    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    for(i=0; i<2; i=i+1) begin
+		rdblocka[i+1] <= rdblocka[i];
+		rdblockb[i+1] <= rdblockb[i];
+    end
+end
+
+if(NSTREAMS >= 1) begin: en_strm0
+	if(STRM0_MUX == 1) begin: mux0
+		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
+	end else begin: nomux0
+		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 2) begin: en_strm1
+	if(STRM1_MUX == 1) begin: mux1
+		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
+	end else begin: nomux1
+		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 3) begin: en_strm2
+	if(STRM2_MUX == 1) begin: mux2
+		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
+	end else begin: nomux2
+		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 4) begin: en_strm3
+	if(STRM3_MUX == 1) begin: mux3
+		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
+	end else begin: nomux3
+		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 5) begin: en_strm4
+	if(STRM4_MUX == 1) begin: mux4
+		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
+	end else begin: nomux4
+		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 6) begin: en_strm5
+	if(STRM5_MUX == 1) begin: mux5
+		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
+	end else begin: nomux5
+		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
+end
+
+end else begin: singleblock
+
+if(NSTREAMS >= 1) begin: en_strm0_direct
+    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
+end
+if(NSTREAMS >= 2) begin: en_strm1_direct
+	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
+end
+if(NSTREAMS >= 3) begin: en_strm2_direct
+	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
+end
+if(NSTREAMS >= 4) begin: en_strm3_direct
+	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
+end
+if(NSTREAMS >= 5) begin: en_strm4_direct
+	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
+end
+if(NSTREAMS >= 6) begin: en_strm5_direct
+	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
+end
+
+end
+endgenerate
+
+//output to AXI Streams
+reg tvalid_pipe0[2:0];
+reg tvalid_pipe1[2:0];
+reg tvalid_pipe2[2:0];
+reg tvalid_pipe3[2:0];
+reg tvalid_pipe4[2:0];
+reg tvalid_pipe5[2:0];
+
+assign m_axis_0_tvalid = tvalid_pipe0[2];
+assign m_axis_1_tvalid = tvalid_pipe1[2];
+assign m_axis_2_tvalid = tvalid_pipe2[2];
+assign m_axis_3_tvalid = tvalid_pipe3[2];
+assign m_axis_4_tvalid = tvalid_pipe4[2];
+assign m_axis_5_tvalid = tvalid_pipe5[2];
+
+
+always @(posedge aclk) begin
+    tvalid_pipe0[0] <= strm0_incr_en;
+    tvalid_pipe1[0] <= strm1_incr_en;
+    tvalid_pipe2[0] <= strm2_incr_en;
+    tvalid_pipe3[0] <= strm3_incr_en;
+    tvalid_pipe4[0] <= strm4_incr_en;
+    tvalid_pipe5[0] <= strm5_incr_en;
+    for(i=0; i<2; i=i+1) begin: srl
+        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
+        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
+        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
+        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
+        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
+        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
+    end
+end
+
+assign config_q0 = 0;
+
+endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
index e107c4005..4219d0f1c 100644
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
@@ -33,7 +33,8 @@ module ramb18_wf_dualport
     parameter ID = 0,
 	parameter DWIDTH = 18,
 	parameter AWIDTH = 10,
-	parameter MEM_INIT = ""
+	parameter MEM_INIT = "",
+  parameter RAM_STYLE = "auto"
 )
 (
 	input clk,
@@ -49,7 +50,7 @@ module ramb18_wf_dualport
 	output reg [DWIDTH-1:0] rdqb
 );
 
-(* ram_style = "block" *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
+(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
 reg [DWIDTH-1:0] rdataa;
 reg [DWIDTH-1:0] rdatab;
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 2fcb03f48..6d63f3b08 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -92,6 +92,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # const -- embedded weights, default, long compile/synth times
             # decoupled -- streaming weights
             "mem_mode": ("s", False, "const"),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "decoupled_ram_style": ("s", False, "auto"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -983,6 +989,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$MEM_DEPTH$"] = [
                 str(roundup_to_integer_multiple(self.calc_wmem(), 1024))
             ]
+            self.code_gen_dict["$RAM_STYLE$"] = [
+                self.get_nodeattr("decoupled_ram_style")
+            ]
 
             template = self.decoupled_wrapper
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 726065cc0..d5d8a61c8 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -188,6 +188,7 @@ memstream
 .MEM_DEPTH($MEM_DEPTH$),
 .MEM_WIDTH($WEIGHT_WIDTH$),
 .MEM_INIT("./"),
+.RAM_STYLE($RAM_STYLE$),
 
 //widths per stream
 .STRM0_WIDTH($WEIGHT_WIDTH$),
-- 
GitLab