diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml
index 7910a8284dad3674b8665136506a60c498e0547f..3daf36628178766fb440987d2ac2f4c1b37353de 100644
--- a/finn-rtllib/memstream/component.xml
+++ b/finn-rtllib/memstream/component.xml
@@ -260,7 +260,7 @@
         <spirit:parameters>
           <spirit:parameter>
             <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>ba6d3300</spirit:value>
+            <spirit:value>083f6ff3</spirit:value>
           </spirit:parameter>
         </spirit:parameters>
       </spirit:view>
@@ -276,7 +276,7 @@
         <spirit:parameters>
           <spirit:parameter>
             <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>54f61a0e</spirit:value>
+            <spirit:value>7f67dadd</spirit:value>
           </spirit:parameter>
         </spirit:parameters>
       </spirit:view>
@@ -290,7 +290,7 @@
         <spirit:parameters>
           <spirit:parameter>
             <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>92c3ebfc</spirit:value>
+            <spirit:value>d714c73b</spirit:value>
           </spirit:parameter>
         </spirit:parameters>
       </spirit:view>
@@ -780,6 +780,11 @@
         <spirit:displayName>Mem Init</spirit:displayName>
         <spirit:value spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.MEM_INIT">./</spirit:value>
       </spirit:modelParameter>
+      <spirit:modelParameter spirit:dataType="string">
+        <spirit:name>RAM_STYLE</spirit:name>
+        <spirit:displayName>Ram Style</spirit:displayName>
+        <spirit:value spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.RAM_STYLE">auto</spirit:value>
+      </spirit:modelParameter>
       <spirit:modelParameter spirit:dataType="integer">
         <spirit:name>STRM0_WIDTH</spirit:name>
         <spirit:displayName>Strm0 Width</spirit:displayName>
@@ -873,6 +878,12 @@
     </spirit:modelParameters>
   </spirit:model>
   <spirit:choices>
+    <spirit:choice>
+      <spirit:name>choice_list_44c459b8</spirit:name>
+      <spirit:enumeration>auto</spirit:enumeration>
+      <spirit:enumeration>block</spirit:enumeration>
+      <spirit:enumeration>distributed</spirit:enumeration>
+    </spirit:choice>
     <spirit:choice>
       <spirit:name>choice_list_9d8b0d81</spirit:name>
       <spirit:enumeration>ACTIVE_HIGH</spirit:enumeration>
@@ -891,9 +902,26 @@
         <spirit:fileType>verilogSource</spirit:fileType>
       </spirit:file>
       <spirit:file>
-        <spirit:name>hdl/ramb18.v</spirit:name>
+        <spirit:name>hdl/memstream_singleblock.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
-        <spirit:userFileType>CHECKSUM_13578c44</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
+      </spirit:file>
+      <spirit:file>
+        <spirit:name>hdl/memstream_multiblock.v</spirit:name>
+        <spirit:fileType>verilogSource</spirit:fileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
+      </spirit:file>
+      <spirit:file>
+        <spirit:name>hdl/ramb18_wf_dualport.v</spirit:name>
+        <spirit:fileType>verilogSource</spirit:fileType>
+        <spirit:userFileType>CHECKSUM_9425c051</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
+      </spirit:file>
+      <spirit:file>
+        <spirit:name>hdl/ramb18_sdp.v</spirit:name>
+        <spirit:fileType>verilogSource</spirit:fileType>
+        <spirit:userFileType>CHECKSUM_9e2eda76</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
     </spirit:fileSet>
     <spirit:fileSet>
@@ -908,7 +936,7 @@
       <spirit:file>
         <spirit:name>xgui/memstream_v1_0.tcl</spirit:name>
         <spirit:fileType>tclSource</spirit:fileType>
-        <spirit:userFileType>CHECKSUM_92c3ebfc</spirit:userFileType>
+        <spirit:userFileType>CHECKSUM_d714c73b</spirit:userFileType>
         <spirit:userFileType>XGUI_VERSION_2</spirit:userFileType>
       </spirit:file>
     </spirit:fileSet>
@@ -1034,22 +1062,17 @@
       <spirit:name>Component_Name</spirit:name>
       <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.Component_Name" spirit:order="1">memstream_v1_0</spirit:value>
     </spirit:parameter>
+    <spirit:parameter>
+      <spirit:name>RAM_STYLE</spirit:name>
+      <spirit:displayName>Ram Style</spirit:displayName>
+      <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.RAM_STYLE" spirit:choiceRef="choice_list_44c459b8">auto</spirit:value>
+    </spirit:parameter>
   </spirit:parameters>
   <spirit:vendorExtensions>
     <xilinx:coreExtensions>
       <xilinx:supportedFamilies>
         <xilinx:family xilinx:lifeCycle="Production">zynq</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">artix7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">artix7l</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">kintex7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">kintex7l</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">kintexu</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">kintexuplus</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">spartan7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">aartix7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">aspartan7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">azynq</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">zynquplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">zynquplus</xilinx:family>
         <xilinx:family xilinx:lifeCycle="Production">virtexuplus</xilinx:family>
         <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family>
       </xilinx:supportedFamilies>
@@ -1057,27 +1080,18 @@
         <xilinx:taxonomy>/UserIP</xilinx:taxonomy>
       </xilinx:taxonomies>
       <xilinx:displayName>memstream_v1_0</xilinx:displayName>
+      <xilinx:autoFamilySupportLevel>level_0</xilinx:autoFamilySupportLevel>
       <xilinx:definitionSource>package_project</xilinx:definitionSource>
-      <xilinx:coreRevision>2</xilinx:coreRevision>
-      <xilinx:coreCreationDateTime>2019-11-04T19:37:20Z</xilinx:coreCreationDateTime>
-      <xilinx:tags>
-        <xilinx:tag xilinx:name="nopcore"/>
-        <xilinx:tag xilinx:name="ui.data.coregen.dd@7a3d79be_ARCHIVE_LOCATION">c:/Users/lucianp/Documents/git/finn-rtllib/memstream</xilinx:tag>
-        <xilinx:tag xilinx:name="ui.data.coregen.dd@6ca546af_ARCHIVE_LOCATION">c:/Users/lucianp/Documents/git/finn-rtllib/memstream</xilinx:tag>
-        <xilinx:tag xilinx:name="ui.data.coregen.dd@2bb0c52f_ARCHIVE_LOCATION">c:/Users/lucianp/Documents/git/finn-rtllib/memstream</xilinx:tag>
-        <xilinx:tag xilinx:name="ui.data.coregen.dd@1f6f8fe4_ARCHIVE_LOCATION">c:/Users/lucianp/Documents/git/finn-rtllib/memstream</xilinx:tag>
-        <xilinx:tag xilinx:name="ui.data.coregen.dd@79ecbc44_ARCHIVE_LOCATION">c:/Users/lucianp/Documents/git/finn-rtllib/memstream</xilinx:tag>
-        <xilinx:tag xilinx:name="ui.data.coregen.dd@22fd683_ARCHIVE_LOCATION">c:/Users/lucianp/Documents/git/finn-rtllib/memstream</xilinx:tag>
-        <xilinx:tag xilinx:name="ui.data.coregen.dd@2c00346d_ARCHIVE_LOCATION">c:/Users/lucianp/Documents/git/finn-rtllib/memstream</xilinx:tag>
-      </xilinx:tags>
+      <xilinx:coreRevision>9</xilinx:coreRevision>
+      <xilinx:coreCreationDateTime>2020-08-21T11:26:48Z</xilinx:coreCreationDateTime>
     </xilinx:coreExtensions>
     <xilinx:packagingInfo>
-      <xilinx:xilinxVersion>2019.1.3</xilinx:xilinxVersion>
+      <xilinx:xilinxVersion>2020.1</xilinx:xilinxVersion>
       <xilinx:checksum xilinx:scope="busInterfaces" xilinx:value="6d8b2551"/>
-      <xilinx:checksum xilinx:scope="fileGroups" xilinx:value="5e0c4694"/>
+      <xilinx:checksum xilinx:scope="fileGroups" xilinx:value="fe9e02ac"/>
       <xilinx:checksum xilinx:scope="ports" xilinx:value="cabd7433"/>
-      <xilinx:checksum xilinx:scope="hdlParameters" xilinx:value="f63127c8"/>
-      <xilinx:checksum xilinx:scope="parameters" xilinx:value="5365a08b"/>
+      <xilinx:checksum xilinx:scope="hdlParameters" xilinx:value="29c70cc4"/>
+      <xilinx:checksum xilinx:scope="parameters" xilinx:value="858b58f8"/>
     </xilinx:packagingInfo>
   </spirit:vendorExtensions>
 </spirit:component>
diff --git a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v
index 28acb301a583f7437c580744bae7bdc4aef76337..961103e4ca1261ab0109ad9db291a1a66f9c0915 100644
--- a/finn-rtllib/memstream/hdl/memstream.v
+++ b/finn-rtllib/memstream/hdl/memstream.v
@@ -109,359 +109,141 @@ module memstream
 
 );
 
-//calculate number of RAMB18 blocks we need depth-wise
-localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
-
-//calculate width of address for each block
-localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
-
-//determine whether a stream needs to multiplex between memory blocks
-localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
-localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
-localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
-localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
-localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
-localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
-
-//determine what the base block of each stream is
-localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
-localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
-localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
-localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
-localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
-localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
-
-//determine what the end block of each stream is
-localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
-localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
-localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
-localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
-localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
-localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
-
-//determine the number of blocks spanned by each stream
-localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
-localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
-localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
-localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
-localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
-localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
-
-//TODO: check that memory width is equal to the widest stream
-//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
-initial begin
-    if((NSTREAMS < 1) | (NSTREAMS > 6)) begin
-        $display("Invalid setting for NSTREAMS, please set in range [1,6]");
-        $finish();
-    end
-end
-
-//invert reset
-wire rst;
-assign rst = ~aresetn;
-
-//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
-//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth
-
-reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
-
-reg strm0_incr_en;
-reg strm1_incr_en;
-reg strm2_incr_en;
-reg strm3_incr_en;
-reg strm4_incr_en;
-reg strm5_incr_en;
-
-wire strm0_rst;
-wire strm1_rst;
-wire strm2_rst;
-wire strm3_rst;
-wire strm4_rst;
-wire strm5_rst;
-
-reg strm0_ready;
-reg strm1_ready;
-reg strm2_ready;
-reg strm3_ready;
-reg strm4_ready;
-reg strm5_ready;
-
-//arbiter: work on one stream at a time
-//multiplex each port between (up to) half of the streams
-reg [1:0] current_stream_porta = 0;
-reg [1:0] current_stream_portb = 0;
-
-always @(posedge aclk) begin
-    if(rst)
-        current_stream_porta <= 0;
-    else case(current_stream_porta)
-        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
-        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
-        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
-    endcase
-    if(rst)
-        current_stream_portb <= 0;
-    else case(current_stream_portb)
-        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
-        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
-        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
-    endcase
-end
+generate
+if(NSTREAMS <= 2) begin: singleblock
 
-always @(posedge aclk) begin
-    if(rst) begin
-        strm0_incr_en <= 0;
-        strm1_incr_en <= 0;
-        strm2_incr_en <= 0;
-        strm3_incr_en <= 0;
-        strm4_incr_en <= 0;
-        strm5_incr_en <= 0;
-    end else begin
-        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
-        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
-        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
-        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
-        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
-        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
-    end
-end
-
-assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
-assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
-assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
-assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
-assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
-assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
-
-always @(posedge aclk) begin
-    strm0_ready <= ~m_axis_0_afull;
-    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
-    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
-    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
-    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
-    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
-end
 
-//one address counter per stream; more LUTs but keeps routing short and local
-always @(posedge aclk) begin
-    if(strm0_rst | rst)
-        strm0_addr <= STRM0_OFFSET;
-    else if(strm0_incr_en)
-        strm0_addr <= strm0_addr + 1;
-    if(strm1_rst | rst)
-        strm1_addr <= STRM1_OFFSET;
-    else if(strm1_incr_en)
-        strm1_addr <= strm1_addr + 1;
-    if(strm2_rst | rst)
-        strm2_addr <= STRM2_OFFSET;
-    else if(strm2_incr_en)
-        strm2_addr <= strm2_addr + 1;
-    if(strm3_rst | rst)
-        strm3_addr <= STRM3_OFFSET;
-    else if(strm3_incr_en)
-        strm3_addr <= strm3_addr + 1;
-    if(strm4_rst | rst)
-        strm4_addr <= STRM4_OFFSET;
-    else if(strm4_incr_en)
-        strm4_addr <= strm4_addr + 1;
-    if(strm5_rst | rst)
-        strm5_addr <= STRM5_OFFSET;
-    else if(strm5_incr_en)
-        strm5_addr <= strm5_addr + 1;
-end
-
-reg [$clog2(MEM_DEPTH)-1:0] addra;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
-
-reg [$clog2(MEM_DEPTH)-1:0] addrb;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
-
-wire [NMEMBLOCKS-1:0] we;
-
-reg [1:0] addr_select_porta;
-reg [1:0] addr_select_portb;
-
-//multiplex addresses of various streams into address ports of memory
-always @(posedge aclk) begin
-    addr_select_porta <= current_stream_porta;
-    case(addr_select_porta)
-        0: addra <= strm0_addr;
-        1: addra <= strm2_addr;
-        2: addra <= strm4_addr;
-    endcase
-    addr_select_portb <= current_stream_portb;
-    case(addr_select_portb)
-        0: addrb <= strm1_addr;
-        1: addrb <= strm3_addr;
-        2: addrb <= strm5_addr;
-    endcase
-end
+memstream_singleblock
+#(
+    .CONFIG_EN(CONFIG_EN),
+    .NSTREAMS(NSTREAMS),
+    .MEM_DEPTH(MEM_DEPTH),
+    .MEM_WIDTH(MEM_WIDTH),
+    .MEM_INIT(MEM_INIT),
+    .RAM_STYLE(RAM_STYLE),
 
-genvar g;
-generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
+    //widths per stream
+    .STRM0_WIDTH(STRM0_WIDTH),
+    .STRM1_WIDTH(STRM1_WIDTH),
 
-assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
+    //depths per stream
+    .STRM0_DEPTH(STRM0_DEPTH),
+    .STRM1_DEPTH(STRM1_DEPTH),
 
-ramb18_wf_dualport
-#(
-    .ID(g),
-	.DWIDTH(MEM_WIDTH),
-	.AWIDTH(BLOCKADRWIDTH),
-	.MEM_INIT(MEM_INIT),
-  .RAM_STYLE(RAM_STYLE)
+    //offsets for each stream
+    .STRM0_OFFSET(STRM0_OFFSET),
+    .STRM1_OFFSET(STRM1_OFFSET)
 )
-ram
+mem
 (
-	.clk(aclk),
-
-	.wea(we[g]),
-	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
-	.wdataa(config_d0),
-	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
-
-	.web(1'b0),
-	.addrb(addrb[BLOCKADRWIDTH-1:0]),
-	.wdatab('d0),
-	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
+    .aclk(aclk),
+    .aresetn(aresetn),
+
+    .config_address(config_address),
+    .config_ce(config_ce),
+    .config_we(config_we),
+    .config_d0(config_d0),
+    .config_q0(config_q0),
+
+    .m_axis_0_tready(m_axis_0_tready),
+    .m_axis_0_tvalid(m_axis_0_tvalid),
+    .m_axis_0_tdata(m_axis_0_tdata),
+
+    .m_axis_1_tready(m_axis_1_tready),
+    .m_axis_1_tvalid(m_axis_1_tvalid),
+    .m_axis_1_tdata(m_axis_1_tdata)
 );
 
-end
-endgenerate
-
-integer i;
-
-generate if(NMEMBLOCKS > 1) begin: multiblock
-
-wire [MEM_WIDTH-1:0] rdqmux[5:0];
-
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
-
-always @(posedge aclk) begin
-    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    for(i=0; i<2; i=i+1) begin
-		rdblocka[i+1] <= rdblocka[i];
-		rdblockb[i+1] <= rdblockb[i];
-    end
-end
-
-if(NSTREAMS >= 1) begin: en_strm0
-	if(STRM0_MUX == 1) begin: mux0
-		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
-	end else begin: nomux0
-		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 2) begin: en_strm1
-	if(STRM1_MUX == 1) begin: mux1
-		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
-	end else begin: nomux1
-		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
-end
+assign m_axis_2_tvalid = 0;
+assign m_axis_2_tdata = 0;
+assign m_axis_3_tvalid = 0;
+assign m_axis_3_tdata = 0;
+assign m_axis_4_tvalid = 0;
+assign m_axis_4_tdata = 0;
+assign m_axis_5_tvalid = 0;
+assign m_axis_5_tdata = 0;
 
-if(NSTREAMS >= 3) begin: en_strm2
-	if(STRM2_MUX == 1) begin: mux2
-		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
-	end else begin: nomux2
-		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
-end
+end else begin: multiblock
 
-if(NSTREAMS >= 4) begin: en_strm3
-	if(STRM3_MUX == 1) begin: mux3
-		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
-	end else begin: nomux3
-		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
-end
 
-if(NSTREAMS >= 5) begin: en_strm4
-	if(STRM4_MUX == 1) begin: mux4
-		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
-	end else begin: nomux4
-		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
-end
+memstream_multiblock
+#(
+    .CONFIG_EN(CONFIG_EN),
+    .NSTREAMS(NSTREAMS),
+    .MEM_DEPTH(MEM_DEPTH),
+    .MEM_WIDTH(MEM_WIDTH),
+    .MEM_INIT(MEM_INIT),
+    .RAM_STYLE(RAM_STYLE),
 
-if(NSTREAMS >= 6) begin: en_strm5
-	if(STRM5_MUX == 1) begin: mux5
-		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
-	end else begin: nomux5
-		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
-end
+    //widths per stream
+    .STRM0_WIDTH(STRM0_WIDTH),
+    .STRM1_WIDTH(STRM1_WIDTH),
+    .STRM2_WIDTH(STRM2_WIDTH),
+    .STRM3_WIDTH(STRM3_WIDTH),
+    .STRM4_WIDTH(STRM4_WIDTH),
+    .STRM5_WIDTH(STRM5_WIDTH),
+
+    //depths per stream
+    .STRM0_DEPTH(STRM0_DEPTH),
+    .STRM1_DEPTH(STRM1_DEPTH),
+    .STRM2_DEPTH(STRM2_DEPTH),
+    .STRM3_DEPTH(STRM3_DEPTH),
+    .STRM4_DEPTH(STRM4_DEPTH),
+    .STRM5_DEPTH(STRM5_DEPTH),
+
+    //offsets for each stream
+    .STRM0_OFFSET(STRM0_OFFSET),
+    .STRM1_OFFSET(STRM1_OFFSET),
+    .STRM2_OFFSET(STRM2_OFFSET),
+    .STRM3_OFFSET(STRM3_OFFSET),
+    .STRM4_OFFSET(STRM4_OFFSET),
+    .STRM5_OFFSET(STRM5_OFFSET)
+)
+mem
+(
+    .aclk(aclk),
+    .aresetn(aresetn),
+
+    .config_address(config_address),
+    .config_ce(config_ce),
+    .config_we(config_we),
+    .config_d0(config_d0),
+    .config_q0(config_q0),
+
+    .m_axis_0_afull(m_axis_0_afull),
+    .m_axis_0_tready(m_axis_0_tready),
+    .m_axis_0_tvalid(m_axis_0_tvalid),
+    .m_axis_0_tdata(m_axis_0_tdata),
+
+    .m_axis_1_afull(m_axis_1_afull),
+    .m_axis_1_tready(m_axis_1_tready),
+    .m_axis_1_tvalid(m_axis_1_tvalid),
+    .m_axis_1_tdata(m_axis_1_tdata),
+
+    .m_axis_2_afull(m_axis_2_afull),
+    .m_axis_2_tready(m_axis_2_tready),
+    .m_axis_2_tvalid(m_axis_2_tvalid),
+    .m_axis_2_tdata(m_axis_2_tdata),
+
+    .m_axis_3_afull(m_axis_3_afull),
+    .m_axis_3_tready(m_axis_3_tready),
+    .m_axis_3_tvalid(m_axis_3_tvalid),
+    .m_axis_3_tdata(m_axis_3_tdata),
+
+    .m_axis_4_afull(m_axis_4_afull),
+    .m_axis_4_tready(m_axis_4_tready),
+    .m_axis_4_tvalid(m_axis_4_tvalid),
+    .m_axis_4_tdata(m_axis_4_tdata),
+
+    .m_axis_5_afull(m_axis_5_afull),
+    .m_axis_5_tready(m_axis_5_tready),
+    .m_axis_5_tvalid(m_axis_5_tvalid),
+    .m_axis_5_tdata(m_axis_5_tdata)
 
-end else begin: singleblock
+);
 
-if(NSTREAMS >= 1) begin: en_strm0_direct
-    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
-end
-if(NSTREAMS >= 2) begin: en_strm1_direct
-	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
-end
-if(NSTREAMS >= 3) begin: en_strm2_direct
-	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
-end
-if(NSTREAMS >= 4) begin: en_strm3_direct
-	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
-end
-if(NSTREAMS >= 5) begin: en_strm4_direct
-	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
-end
-if(NSTREAMS >= 6) begin: en_strm5_direct
-	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
-end
 
 end
 endgenerate
 
-//output to AXI Streams
-reg tvalid_pipe0[2:0];
-reg tvalid_pipe1[2:0];
-reg tvalid_pipe2[2:0];
-reg tvalid_pipe3[2:0];
-reg tvalid_pipe4[2:0];
-reg tvalid_pipe5[2:0];
-
-assign m_axis_0_tvalid = tvalid_pipe0[2];
-assign m_axis_1_tvalid = tvalid_pipe1[2];
-assign m_axis_2_tvalid = tvalid_pipe2[2];
-assign m_axis_3_tvalid = tvalid_pipe3[2];
-assign m_axis_4_tvalid = tvalid_pipe4[2];
-assign m_axis_5_tvalid = tvalid_pipe5[2];
-
-
-always @(posedge aclk) begin
-    tvalid_pipe0[0] <= strm0_incr_en;
-    tvalid_pipe1[0] <= strm1_incr_en;
-    tvalid_pipe2[0] <= strm2_incr_en;
-    tvalid_pipe3[0] <= strm3_incr_en;
-    tvalid_pipe4[0] <= strm4_incr_en;
-    tvalid_pipe5[0] <= strm5_incr_en;
-    for(i=0; i<2; i=i+1) begin: srl
-        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
-        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
-        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
-        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
-        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
-        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
-    end
-end
-
-assign config_q0 = 0;
-
 endmodule
diff --git a/finn-rtllib/memstream/hdl/memstream_multiblock.v b/finn-rtllib/memstream/hdl/memstream_multiblock.v
new file mode 100644
index 0000000000000000000000000000000000000000..017088b8c1572bb3baa2a5a46336509187a762ab
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/memstream_multiblock.v
@@ -0,0 +1,471 @@
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module memstream_multiblock
+#(
+//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, and set per-stream widths, depths and offsets in memory
+    parameter CONFIG_EN = 1,
+    parameter NSTREAMS = 6,//1 up to 6
+
+    parameter MEM_DEPTH = 13824,
+    parameter MEM_WIDTH = 32,
+    parameter MEM_INIT = "./",
+    parameter RAM_STYLE = "auto",
+
+    //widths per stream
+	parameter STRM0_WIDTH = 32,
+	parameter STRM1_WIDTH = 32,
+	parameter STRM2_WIDTH = 32,
+	parameter STRM3_WIDTH = 32,
+	parameter STRM4_WIDTH = 32,
+	parameter STRM5_WIDTH = 32,
+
+	//depths per stream
+	parameter STRM0_DEPTH = 2304,
+	parameter STRM1_DEPTH = 2304,
+	parameter STRM2_DEPTH = 2304,
+	parameter STRM3_DEPTH = 2304,
+	parameter STRM4_DEPTH = 2304,
+	parameter STRM5_DEPTH = 2304,
+
+	//offsets for each stream
+	parameter STRM0_OFFSET = 0,
+	parameter STRM1_OFFSET = 2304,
+	parameter STRM2_OFFSET = 4608,
+	parameter STRM3_OFFSET = 6912,
+	parameter STRM4_OFFSET = 9216,
+	parameter STRM5_OFFSET = 11520
+)
+
+(
+    input aclk,
+    input aresetn,
+
+    //optional configuration interface compatible with ap_memory
+	input [31:0] config_address,
+	input config_ce,
+	input config_we,
+	input [31:0] config_d0,
+	output [31:0] config_q0,
+
+    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
+    input m_axis_0_afull,
+    input m_axis_0_tready,
+    output m_axis_0_tvalid,
+    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
+
+    input m_axis_1_afull,
+    input m_axis_1_tready,
+    output m_axis_1_tvalid,
+    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
+
+    input m_axis_2_afull,
+    input m_axis_2_tready,
+    output m_axis_2_tvalid,
+    output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
+
+    input m_axis_3_afull,
+    input m_axis_3_tready,
+    output m_axis_3_tvalid,
+    output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
+
+    input m_axis_4_afull,
+    input m_axis_4_tready,
+    output m_axis_4_tvalid,
+    output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
+
+    input m_axis_5_afull,
+    input m_axis_5_tready,
+    output m_axis_5_tvalid,
+    output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
+
+
+);
+
+//calculate number of RAMB18 blocks we need depth-wise, at 1024 words per block
+localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
+
+//calculate width of address for each block: 10 bits when there is more than one block, otherwise just enough bits for MEM_DEPTH
+localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
+
+//determine whether a stream needs to multiplex between memory blocks: true when OFFSET and OFFSET+DEPTH fall into different 1024-word blocks
+localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024)); //NOTE(review): uses OFFSET+DEPTH while STRMx_END_BLOCK uses OFFSET+DEPTH-1, so a stream ending exactly on a block boundary gets MUX=1 with NBLOCKS=1 - confirm intended
+localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
+localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
+localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
+localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
+localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
+
+//determine what the base block of each stream is (index of the 1024-word block containing the stream's first word)
+localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
+localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
+localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
+localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
+localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
+localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
+
+//determine what the end block of each stream is (index of the block containing the stream's last word, OFFSET+DEPTH-1)
+localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
+localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
+localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
+localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
+localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
+localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
+
+//determine the number of blocks spanned by each stream (base block through end block, inclusive)
+localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
+localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
+localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
+localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
+localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
+localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
+
+//TODO: check that memory width is equal to the widest stream
+//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
+initial begin //parameter sanity check: only NSTREAMS is validated here
+    if((NSTREAMS < 1) | (NSTREAMS > 6)) begin //reject stream counts outside [1,6]
+        $display("Invalid setting for NSTREAMS, please set in range [1,6]");
+        $finish(); //stop the run on an invalid configuration
+    end
+end
+
+//invert reset: aresetn is active-low, rst is the active-high version used by the logic below
+wire rst;
+assign rst = ~aresetn;
+
+//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
+//solution: use prog-full to make sure the stream has as many free slots as the read pipeline is deep
+
+reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET; //per-stream address registers, each initialised to its stream's offset
+reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
+
+reg strm0_incr_en; //per-stream enables: set when the stream is selected on its port and ready
+reg strm1_incr_en;
+reg strm2_incr_en;
+reg strm3_incr_en;
+reg strm4_incr_en;
+reg strm5_incr_en;
+
+wire strm0_rst; //per-stream flags: enable asserted while the address is at the stream's last word
+wire strm1_rst;
+wire strm2_rst;
+wire strm3_rst;
+wire strm4_rst;
+wire strm5_rst;
+
+reg strm0_ready; //per-stream ready flags consumed by the arbiter
+reg strm1_ready;
+reg strm2_ready;
+reg strm3_ready;
+reg strm4_ready;
+reg strm5_ready;
+
+//arbiter: work on one stream at a time
+//multiplex each port between (up to) half of the streams
+reg [1:0] current_stream_porta = 0;
+reg [1:0] current_stream_portb = 0;
+
+always @(posedge aclk) begin
+    if(rst)
+        current_stream_porta <= 0;
+    else case(current_stream_porta)
+        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
+        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
+        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
+    endcase
+    if(rst)
+        current_stream_portb <= 0;
+    else case(current_stream_portb)
+        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
+        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
+        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
+    endcase
+end
+
+always @(posedge aclk) begin
+    if(rst) begin
+        strm0_incr_en <= 0;
+        strm1_incr_en <= 0;
+        strm2_incr_en <= 0;
+        strm3_incr_en <= 0;
+        strm4_incr_en <= 0;
+        strm5_incr_en <= 0;
+    end else begin
+        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
+        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
+        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
+        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
+        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
+        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
+    end
+end
+
+assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
+assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
+assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
+assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
+assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
+assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
+
+always @(posedge aclk) begin
+    strm0_ready <= ~m_axis_0_afull;
+    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
+    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
+    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
+    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
+    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
+end
+
+//one address counter per stream; more LUTs but keeps routing short and local
+always @(posedge aclk) begin
+    if(strm0_rst | rst)
+        strm0_addr <= STRM0_OFFSET;
+    else if(strm0_incr_en)
+        strm0_addr <= strm0_addr + 1;
+    if(strm1_rst | rst)
+        strm1_addr <= STRM1_OFFSET;
+    else if(strm1_incr_en)
+        strm1_addr <= strm1_addr + 1;
+    if(strm2_rst | rst)
+        strm2_addr <= STRM2_OFFSET;
+    else if(strm2_incr_en)
+        strm2_addr <= strm2_addr + 1;
+    if(strm3_rst | rst)
+        strm3_addr <= STRM3_OFFSET;
+    else if(strm3_incr_en)
+        strm3_addr <= strm3_addr + 1;
+    if(strm4_rst | rst)
+        strm4_addr <= STRM4_OFFSET;
+    else if(strm4_incr_en)
+        strm4_addr <= strm4_addr + 1;
+    if(strm5_rst | rst)
+        strm5_addr <= STRM5_OFFSET;
+    else if(strm5_incr_en)
+        strm5_addr <= strm5_addr + 1;
+end
+
+reg [$clog2(MEM_DEPTH)-1:0] addra;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
+
+reg [$clog2(MEM_DEPTH)-1:0] addrb;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
+
+wire [NMEMBLOCKS-1:0] we;
+
+reg [1:0] addr_select_porta;
+reg [1:0] addr_select_portb;
+
+//multiplex addresses of various streams into address ports of memory
+always @(posedge aclk) begin
+    addr_select_porta <= current_stream_porta;
+    case(addr_select_porta)
+        0: addra <= strm0_addr;
+        1: addra <= strm2_addr;
+        2: addra <= strm4_addr;
+    endcase
+    addr_select_portb <= current_stream_portb;
+    case(addr_select_portb)
+        0: addrb <= strm1_addr;
+        1: addrb <= strm3_addr;
+        2: addrb <= strm5_addr;
+    endcase
+end
+
+genvar g;
+generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
+
+assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
+
+ramb18_wf_dualport
+#(
+    .ID(g),
+	.DWIDTH(MEM_WIDTH),
+	.AWIDTH(BLOCKADRWIDTH),
+	.MEM_INIT(MEM_INIT),
+  .RAM_STYLE(RAM_STYLE)
+)
+ram
+(
+	.clk(aclk),
+
+	.wea(we[g]),
+    .ena(1'b1),
+    .enqa(1'b1),
+	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
+	.wdataa(config_d0),
+	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
+
+	.web(1'b0),
+    .enb(1'b1),
+    .enqb(1'b1),
+	.addrb(addrb[BLOCKADRWIDTH-1:0]),
+	.wdatab('d0),
+	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
+);
+
+end
+endgenerate
+
+integer i;
+
+generate if(NMEMBLOCKS > 1) begin: multiblock
+
+wire [MEM_WIDTH-1:0] rdqmux[5:0];
+
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
+
+always @(posedge aclk) begin
+    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    for(i=0; i<2; i=i+1) begin
+		rdblocka[i+1] <= rdblocka[i];
+		rdblockb[i+1] <= rdblockb[i];
+    end
+end
+
+if(NSTREAMS >= 1) begin: en_strm0
+	if(STRM0_MUX == 1) begin: mux0
+		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
+	end else begin: nomux0
+		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 2) begin: en_strm1
+	if(STRM1_MUX == 1) begin: mux1
+		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
+	end else begin: nomux1
+		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 3) begin: en_strm2
+	if(STRM2_MUX == 1) begin: mux2
+		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
+	end else begin: nomux2
+		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 4) begin: en_strm3
+	if(STRM3_MUX == 1) begin: mux3
+		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
+	end else begin: nomux3
+		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 5) begin: en_strm4
+	if(STRM4_MUX == 1) begin: mux4
+		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
+	end else begin: nomux4
+		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 6) begin: en_strm5
+	if(STRM5_MUX == 1) begin: mux5
+		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
+	end else begin: nomux5
+		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
+end
+
+end else begin: singleblock
+
+if(NSTREAMS >= 1) begin: en_strm0_direct
+    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
+end
+if(NSTREAMS >= 2) begin: en_strm1_direct
+	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
+end
+if(NSTREAMS >= 3) begin: en_strm2_direct
+	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
+end
+if(NSTREAMS >= 4) begin: en_strm3_direct
+	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
+end
+if(NSTREAMS >= 5) begin: en_strm4_direct
+	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
+end
+if(NSTREAMS >= 6) begin: en_strm5_direct
+	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
+end
+
+end
+endgenerate
+
+//output to AXI Streams
+reg tvalid_pipe0[2:0];
+reg tvalid_pipe1[2:0];
+reg tvalid_pipe2[2:0];
+reg tvalid_pipe3[2:0];
+reg tvalid_pipe4[2:0];
+reg tvalid_pipe5[2:0];
+
+assign m_axis_0_tvalid = tvalid_pipe0[2];
+assign m_axis_1_tvalid = tvalid_pipe1[2];
+assign m_axis_2_tvalid = tvalid_pipe2[2];
+assign m_axis_3_tvalid = tvalid_pipe3[2];
+assign m_axis_4_tvalid = tvalid_pipe4[2];
+assign m_axis_5_tvalid = tvalid_pipe5[2];
+
+
+always @(posedge aclk) begin
+    tvalid_pipe0[0] <= strm0_incr_en;
+    tvalid_pipe1[0] <= strm1_incr_en;
+    tvalid_pipe2[0] <= strm2_incr_en;
+    tvalid_pipe3[0] <= strm3_incr_en;
+    tvalid_pipe4[0] <= strm4_incr_en;
+    tvalid_pipe5[0] <= strm5_incr_en;
+    for(i=0; i<2; i=i+1) begin: srl
+        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
+        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
+        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
+        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
+        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
+        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
+    end
+end
+
+assign config_q0 = 0;
+
+endmodule
diff --git a/finn-rtllib/memstream/hdl/memstream_singleblock.v b/finn-rtllib/memstream/hdl/memstream_singleblock.v
new file mode 100644
index 0000000000000000000000000000000000000000..53a71a91bc0561e275791ebcf55e2c4653331b1d
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/memstream_singleblock.v
@@ -0,0 +1,229 @@
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+    Implements a lightweight streamer for up to 2 streams in a single block of memory
+*/
+
+module memstream_singleblock
+#(
+    parameter CONFIG_EN = 1,//1 allows writes through the config port, any other value blocks them
+    parameter NSTREAMS = 2,//1 up to 2
+
+    parameter MEM_DEPTH = 512,
+    parameter MEM_WIDTH = 32,
+    parameter MEM_INIT = "./",
+    parameter RAM_STYLE = "auto",
+
+    //widths per stream
+    parameter STRM0_WIDTH = 32,
+    parameter STRM1_WIDTH = 32,
+
+    //depths per stream
+    parameter STRM0_DEPTH = 256,
+    parameter STRM1_DEPTH = 256,
+
+    //offsets for each stream
+    parameter STRM0_OFFSET = 0,
+    parameter STRM1_OFFSET = 256
+)
+
+(
+    input aclk,
+    input aresetn,
+
+    //optional configuration interface compatible with ap_memory
+    input [31:0] config_address,
+    input config_ce,
+    input config_we,
+    input [MEM_WIDTH-1:0] config_d0,
+    output [MEM_WIDTH-1:0] config_q0,
+
+    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
+    input m_axis_0_tready,
+    output m_axis_0_tvalid,
+    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
+
+    input m_axis_1_tready,
+    output m_axis_1_tvalid,
+    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata
+
+);
+
+//TODO: check that memory width is equal to the widest stream
+//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
+initial begin
+    if((NSTREAMS < 1) | (NSTREAMS > 2)) begin
+        $display("Invalid setting for NSTREAMS, please set in range [1,2]");
+        $finish();
+    end
+end
+
+//invert reset
+wire rst;
+assign rst = ~aresetn;
+
+//config write strobe: needs CONFIG_EN == 1 and chip enable together with write enable
+wire config_wr = (CONFIG_EN == 1) & config_ce & config_we;
+
+//a stream advances whenever its output is being accepted or is not yet valid
+wire strm0_incr_en = m_axis_0_tready | ~m_axis_0_tvalid;
+wire strm1_incr_en = m_axis_1_tready | ~m_axis_1_tvalid;
+
+generate
+if(MEM_DEPTH > 1) begin: use_ram
+
+//width of memory address; at least 1 bit because MEM_DEPTH > 1 in this branch
+localparam BLOCKADRWIDTH = $clog2(MEM_DEPTH);
+
+reg [BLOCKADRWIDTH-1:0] strm0_addr = STRM0_OFFSET;
+wire strm0_rst;
+assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));//wrap after the last word
+
+//one address counter per stream; more LUTs but keeps routing short and local
+always @(posedge aclk) begin
+    if(strm0_rst | rst)
+        strm0_addr <= STRM0_OFFSET;
+    else if(strm0_incr_en)
+        strm0_addr <= strm0_addr + 1;
+end
+
+if(NSTREAMS == 1) begin: sdp
+//one stream: port A takes config writes, port B feeds stream 0
+ramb18_sdp
+#(
+    .ID(0),
+    .DWIDTH(MEM_WIDTH),
+    .AWIDTH(BLOCKADRWIDTH),
+    .DEPTH(MEM_DEPTH),
+    .MEM_INIT(MEM_INIT),
+    .RAM_STYLE(RAM_STYLE)
+)
+ram
+(
+    .clk(aclk),
+
+    .ena(config_ce),
+    .wea(config_wr),
+    .addra(config_address[BLOCKADRWIDTH-1:0]),
+    .wdataa(config_d0),
+
+    .enb(strm0_incr_en),
+    .enqb(strm0_incr_en),
+    .addrb(strm0_addr),
+    .rdqb(m_axis_0_tdata)
+);
+
+
+end else begin: tdp
+//two streams: port A is shared by stream 0 reads and config writes, port B serves stream 1
+reg [BLOCKADRWIDTH-1:0] strm1_addr = STRM1_OFFSET;
+wire strm1_rst;
+assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));//wrap after the last word
+
+always @(posedge aclk) begin
+    if(strm1_rst | rst)
+        strm1_addr <= STRM1_OFFSET;
+    else if(strm1_incr_en)
+        strm1_addr <= strm1_addr + 1;
+end
+
+ramb18_wf_dualport
+#(
+    .ID(0),
+    .DWIDTH(MEM_WIDTH),
+    .AWIDTH(BLOCKADRWIDTH),
+    .DEPTH(MEM_DEPTH),
+    .MEM_INIT(MEM_INIT),
+    .RAM_STYLE(RAM_STYLE)
+)
+ram
+(
+    .clk(aclk),
+
+    .wea(config_wr),
+    .ena(strm0_incr_en | config_ce),
+    .enqa(strm0_incr_en | config_ce),
+    .addra(config_wr ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr),//a config write takes over the port A address
+    .wdataa(config_d0),
+    .rdqa(m_axis_0_tdata),
+
+    .web(1'b0),
+    .enb(strm1_incr_en),
+    .enqb(strm1_incr_en),
+    .addrb(strm1_addr),
+    .wdatab('d0),
+    .rdqb(m_axis_1_tdata)
+);
+
+end
+
+end else begin: bypass
+//MEM_DEPTH <= 1: no RAM, the single word loaded from memblock_0.dat drives both outputs
+reg [MEM_WIDTH-1:0] singleval[0:0];
+initial begin
+    $readmemh({MEM_INIT,"memblock_0.dat"}, singleval, 0, 0);
+end
+
+assign m_axis_0_tdata = singleval[0];
+assign m_axis_1_tdata = singleval[0];
+
+end
+endgenerate
+
+//signal valid after 2 tready cycles after initialization
+//then stay valid
+reg [1:0] tvalid_pipe0 = 2'd0;
+reg [1:0] tvalid_pipe1 = 2'd0;
+
+assign m_axis_0_tvalid = tvalid_pipe0[1];
+assign m_axis_1_tvalid = tvalid_pipe1[1];
+
+always @(posedge aclk) begin
+    if(rst) begin
+        tvalid_pipe0 <= 0;
+    end else if(strm0_incr_en) begin
+        tvalid_pipe0[0] <= 1;//shift a 1 in; it reaches bit 1 on the second enabled cycle
+        tvalid_pipe0[1] <= tvalid_pipe0[0];
+    end
+end
+
+always @(posedge aclk) begin
+    if(rst) begin
+        tvalid_pipe1 <= 0;
+    end else if(strm1_incr_en) begin
+        tvalid_pipe1[0] <= 1;//shift a 1 in; it reaches bit 1 on the second enabled cycle
+        tvalid_pipe1[1] <= tvalid_pipe1[0];
+    end
+end
+
+assign config_q0 = m_axis_0_tdata;//config read data simply mirrors the stream 0 output
+
+endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_sdp.v b/finn-rtllib/memstream/hdl/ramb18_sdp.v
new file mode 100644
index 0000000000000000000000000000000000000000..63a349f7d56197a9b5a66c837a2f003a6e8475e6
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/ramb18_sdp.v
@@ -0,0 +1,96 @@
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module ramb18_sdp //simple dual-port RAM: port A only writes, port B only reads through two registers
+#(
+    parameter ID = 0,               //block index, selects the init file memblock_<ID>.dat
+    parameter DWIDTH = 18,          //data width in bits
+    parameter AWIDTH = 10,          //address width in bits
+    parameter DEPTH = 2**AWIDTH,    //number of words in the memory array
+    parameter MEM_INIT = "",        //directory holding the init file, must end in /
+    parameter RAM_STYLE = "auto"    //forwarded to the ram_style attribute on the array
+)
+(
+    input clk,
+
+    input ena,                      //port A enable, must be set for a write to happen
+    input wea,                      //port A write strobe
+    input [AWIDTH-1:0] addra,
+    input [DWIDTH-1:0] wdataa,
+
+    input enb,                      //port B enable for the memory read stage
+    input enqb,                     //port B enable for the output register
+    input [AWIDTH-1:0] addrb,
+    output reg [DWIDTH-1:0] rdqb
+);
+
+(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1];
+reg [DWIDTH-1:0] rdatab;
+
+`ifdef SYNTHESIS
+reg [7:0] idx = ID; //numeric ID, turned into ASCII digits below by adding 48
+`else
+reg [15:0] idx;     //filled by $sformat with the decimal text of ID
+`endif
+
+//initialize memory
+initial begin
+  //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
+  //ID can go up to 99
+  if (ID < 0 || ID > 99) begin
+    $display("ID out of range [0-99]");
+    $finish();
+  end
+  //MEM_INIT path must be terminated by /
+  `ifdef SYNTHESIS
+  if (ID < 10)
+    $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, DEPTH-1);
+  else
+    $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1);
+  `else
+  $sformat(idx,"%0d",ID);
+  if (ID < 10)
+    $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, DEPTH-1); //single digit sits in the low byte
+  else
+    $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, DEPTH-1);
+  `endif
+end
+
+//memory ports, with output pipeline register
+always @(posedge clk) begin
+    if(ena & wea)
+        mem[addra] <= wdataa;
+    if(enb)
+        rdatab <= mem[addrb];
+    if(enqb)
+        rdqb <= rdatab;
+end
+
+endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
index 4219d0f1c74bddff690b0d0cb21ce6a448c01c97..c7850106ae4cad21f1230477ee86062411e531c8 100644
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
@@ -31,26 +31,31 @@
 module ramb18_wf_dualport
 #(
     parameter ID = 0,
-	parameter DWIDTH = 18,
-	parameter AWIDTH = 10,
-	parameter MEM_INIT = "",
-  parameter RAM_STYLE = "auto"
+    parameter DWIDTH = 18,
+    parameter AWIDTH = 10,
+    parameter DEPTH = 2**AWIDTH,
+    parameter MEM_INIT = "",
+    parameter RAM_STYLE = "auto"
 )
 (
 	input clk,
 
 	input wea,
+    input ena,
+    input enqa,
 	input [AWIDTH-1:0] addra,
 	input [DWIDTH-1:0] wdataa,
 	output reg [DWIDTH-1:0] rdqa,
 
 	input web,
+    input enb,
+    input enqb,
 	input [AWIDTH-1:0] addrb,
 	input [DWIDTH-1:0] wdatab,
 	output reg [DWIDTH-1:0] rdqb
 );
 
-(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
+(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1];
 reg [DWIDTH-1:0] rdataa;
 reg [DWIDTH-1:0] rdatab;
 
@@ -71,30 +76,36 @@ initial begin
 	//MEM_INIT path must be terminated by /
   `ifdef SYNTHESIS
   if (ID < 10)
-    $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, 1023);
+    $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, DEPTH-1);
   else
-    $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, 1023);
+    $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1);
   `else
   $sformat(idx,"%0d",ID);
   if (ID < 10)
-    $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, 1023);
+    $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, DEPTH-1);
   else
-    $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, 1023);
+    $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, DEPTH-1);
   `endif
 end
 
 //memory ports, with output pipeline register
 always @(posedge clk) begin
-    if(wea)
-        mem[addra] <= wdataa;
-    rdataa <= mem[addra];
-    rdqa <= rdataa;
+    if(ena) begin
+        if(wea)
+            mem[addra] <= wdataa;
+        rdataa <= mem[addra];
+    end
+    if(enqa)
+        rdqa <= rdataa;
 end
 always @(posedge clk) begin
-    if(web)
-        mem[addrb] <= wdatab;
-    rdatab <= mem[addrb];
-    rdqb <= rdatab;
+    if(enb) begin
+        if(web)
+            mem[addrb] <= wdatab;
+        rdatab <= mem[addrb];
+    end
+    if(enqb)
+        rdqb <= rdatab;
 end
 
 endmodule
diff --git a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
index e5cbb670da94612e8de73f48cfa4562f89e124d1..7ce84b44a7cd6e20b59fd1b21a467d137ff0288f 100644
--- a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
+++ b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
@@ -1,383 +1,370 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# Definitional proc to organize widgets for parameters.
-proc init_gui { IPINST } {
-  ipgui::add_param $IPINST -name "Component_Name"
-  #Adding Page
-  set Page_0 [ipgui::add_page $IPINST -name "Page 0"]
-  ipgui::add_param $IPINST -name "CONFIG_EN" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "MEM_DEPTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "MEM_INIT" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "MEM_WIDTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "NSTREAMS" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM0_DEPTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM0_OFFSET" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM0_WIDTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM1_DEPTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM1_OFFSET" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM1_WIDTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM2_DEPTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM2_OFFSET" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM2_WIDTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM3_DEPTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM3_OFFSET" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM3_WIDTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM4_DEPTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM4_OFFSET" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM4_WIDTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM5_DEPTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM5_OFFSET" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "STRM5_WIDTH" -parent ${Page_0}
-
-
-}
-
-proc update_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } {
-	# Procedure called to update CONFIG_EN when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } {
-	# Procedure called to validate CONFIG_EN
-	return true
-}
-
-proc update_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } {
-	# Procedure called to update MEM_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } {
-	# Procedure called to validate MEM_DEPTH
-	return true
-}
-
-proc update_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } {
-	# Procedure called to update MEM_INIT when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } {
-	# Procedure called to validate MEM_INIT
-	return true
-}
-
-proc update_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } {
-	# Procedure called to update MEM_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } {
-	# Procedure called to validate MEM_WIDTH
-	return true
-}
-
-proc update_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } {
-	# Procedure called to update NSTREAMS when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } {
-	# Procedure called to validate NSTREAMS
-	return true
-}
-
-proc update_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } {
-	# Procedure called to update STRM0_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } {
-	# Procedure called to validate STRM0_DEPTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } {
-	# Procedure called to update STRM0_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } {
-	# Procedure called to validate STRM0_OFFSET
-	return true
-}
-
-proc update_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } {
-	# Procedure called to update STRM0_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } {
-	# Procedure called to validate STRM0_WIDTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } {
-	# Procedure called to update STRM1_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } {
-	# Procedure called to validate STRM1_DEPTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } {
-	# Procedure called to update STRM1_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } {
-	# Procedure called to validate STRM1_OFFSET
-	return true
-}
-
-proc update_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } {
-	# Procedure called to update STRM1_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } {
-	# Procedure called to validate STRM1_WIDTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } {
-	# Procedure called to update STRM2_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } {
-	# Procedure called to validate STRM2_DEPTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } {
-	# Procedure called to update STRM2_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } {
-	# Procedure called to validate STRM2_OFFSET
-	return true
-}
-
-proc update_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } {
-	# Procedure called to update STRM2_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } {
-	# Procedure called to validate STRM2_WIDTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } {
-	# Procedure called to update STRM3_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } {
-	# Procedure called to validate STRM3_DEPTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } {
-	# Procedure called to update STRM3_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } {
-	# Procedure called to validate STRM3_OFFSET
-	return true
-}
-
-proc update_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } {
-	# Procedure called to update STRM3_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } {
-	# Procedure called to validate STRM3_WIDTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } {
-	# Procedure called to update STRM4_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } {
-	# Procedure called to validate STRM4_DEPTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } {
-	# Procedure called to update STRM4_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } {
-	# Procedure called to validate STRM4_OFFSET
-	return true
-}
-
-proc update_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } {
-	# Procedure called to update STRM4_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } {
-	# Procedure called to validate STRM4_WIDTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } {
-	# Procedure called to update STRM5_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } {
-	# Procedure called to validate STRM5_DEPTH
-	return true
-}
-
-proc update_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } {
-	# Procedure called to update STRM5_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } {
-	# Procedure called to validate STRM5_OFFSET
-	return true
-}
-
-proc update_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } {
-	# Procedure called to update STRM5_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } {
-	# Procedure called to validate STRM5_WIDTH
-	return true
-}
-
-
-proc update_MODELPARAM_VALUE.CONFIG_EN { MODELPARAM_VALUE.CONFIG_EN PARAM_VALUE.CONFIG_EN } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.CONFIG_EN}] ${MODELPARAM_VALUE.CONFIG_EN}
-}
-
-proc update_MODELPARAM_VALUE.NSTREAMS { MODELPARAM_VALUE.NSTREAMS PARAM_VALUE.NSTREAMS } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.NSTREAMS}] ${MODELPARAM_VALUE.NSTREAMS}
-}
-
-proc update_MODELPARAM_VALUE.MEM_DEPTH { MODELPARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_DEPTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.MEM_DEPTH}] ${MODELPARAM_VALUE.MEM_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.MEM_WIDTH { MODELPARAM_VALUE.MEM_WIDTH PARAM_VALUE.MEM_WIDTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.MEM_WIDTH}] ${MODELPARAM_VALUE.MEM_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.MEM_INIT { MODELPARAM_VALUE.MEM_INIT PARAM_VALUE.MEM_INIT } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.MEM_INIT}] ${MODELPARAM_VALUE.MEM_INIT}
-}
-
-proc update_MODELPARAM_VALUE.STRM0_WIDTH { MODELPARAM_VALUE.STRM0_WIDTH PARAM_VALUE.STRM0_WIDTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM0_WIDTH}] ${MODELPARAM_VALUE.STRM0_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM1_WIDTH { MODELPARAM_VALUE.STRM1_WIDTH PARAM_VALUE.STRM1_WIDTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM1_WIDTH}] ${MODELPARAM_VALUE.STRM1_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM2_WIDTH { MODELPARAM_VALUE.STRM2_WIDTH PARAM_VALUE.STRM2_WIDTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM2_WIDTH}] ${MODELPARAM_VALUE.STRM2_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM3_WIDTH { MODELPARAM_VALUE.STRM3_WIDTH PARAM_VALUE.STRM3_WIDTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM3_WIDTH}] ${MODELPARAM_VALUE.STRM3_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM4_WIDTH { MODELPARAM_VALUE.STRM4_WIDTH PARAM_VALUE.STRM4_WIDTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM4_WIDTH}] ${MODELPARAM_VALUE.STRM4_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM5_WIDTH { MODELPARAM_VALUE.STRM5_WIDTH PARAM_VALUE.STRM5_WIDTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM5_WIDTH}] ${MODELPARAM_VALUE.STRM5_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM0_DEPTH { MODELPARAM_VALUE.STRM0_DEPTH PARAM_VALUE.STRM0_DEPTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM0_DEPTH}] ${MODELPARAM_VALUE.STRM0_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM1_DEPTH { MODELPARAM_VALUE.STRM1_DEPTH PARAM_VALUE.STRM1_DEPTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM1_DEPTH}] ${MODELPARAM_VALUE.STRM1_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM2_DEPTH { MODELPARAM_VALUE.STRM2_DEPTH PARAM_VALUE.STRM2_DEPTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM2_DEPTH}] ${MODELPARAM_VALUE.STRM2_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM3_DEPTH { MODELPARAM_VALUE.STRM3_DEPTH PARAM_VALUE.STRM3_DEPTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM3_DEPTH}] ${MODELPARAM_VALUE.STRM3_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM4_DEPTH { MODELPARAM_VALUE.STRM4_DEPTH PARAM_VALUE.STRM4_DEPTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM4_DEPTH}] ${MODELPARAM_VALUE.STRM4_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM5_DEPTH { MODELPARAM_VALUE.STRM5_DEPTH PARAM_VALUE.STRM5_DEPTH } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM5_DEPTH}] ${MODELPARAM_VALUE.STRM5_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM0_OFFSET { MODELPARAM_VALUE.STRM0_OFFSET PARAM_VALUE.STRM0_OFFSET } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM0_OFFSET}] ${MODELPARAM_VALUE.STRM0_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM1_OFFSET { MODELPARAM_VALUE.STRM1_OFFSET PARAM_VALUE.STRM1_OFFSET } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM1_OFFSET}] ${MODELPARAM_VALUE.STRM1_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM2_OFFSET { MODELPARAM_VALUE.STRM2_OFFSET PARAM_VALUE.STRM2_OFFSET } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM2_OFFSET}] ${MODELPARAM_VALUE.STRM2_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM3_OFFSET { MODELPARAM_VALUE.STRM3_OFFSET PARAM_VALUE.STRM3_OFFSET } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM3_OFFSET}] ${MODELPARAM_VALUE.STRM3_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM4_OFFSET { MODELPARAM_VALUE.STRM4_OFFSET PARAM_VALUE.STRM4_OFFSET } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM4_OFFSET}] ${MODELPARAM_VALUE.STRM4_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM5_OFFSET { MODELPARAM_VALUE.STRM5_OFFSET PARAM_VALUE.STRM5_OFFSET } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.STRM5_OFFSET}] ${MODELPARAM_VALUE.STRM5_OFFSET}
-}
-
+# Definitional proc to organize widgets for parameters.
+proc init_gui { IPINST } {
+  ipgui::add_param $IPINST -name "Component_Name"
+  #Adding Page
+  set Page_0 [ipgui::add_page $IPINST -name "Page 0"]
+  ipgui::add_param $IPINST -name "CONFIG_EN" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "MEM_DEPTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "MEM_INIT" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "MEM_WIDTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} -widget comboBox
+  ipgui::add_param $IPINST -name "NSTREAMS" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM0_DEPTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM0_OFFSET" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM0_WIDTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM1_DEPTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM1_OFFSET" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM1_WIDTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM2_DEPTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM2_OFFSET" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM2_WIDTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM3_DEPTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM3_OFFSET" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM3_WIDTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM4_DEPTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM4_OFFSET" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM4_WIDTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM5_DEPTH" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM5_OFFSET" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "STRM5_WIDTH" -parent ${Page_0}
+
+
+}
+
+proc update_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } {
+	# Procedure called to update CONFIG_EN when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } {
+	# Procedure called to validate CONFIG_EN
+	return true
+}
+
+proc update_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } {
+	# Procedure called to update MEM_DEPTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } {
+	# Procedure called to validate MEM_DEPTH
+	return true
+}
+
+proc update_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } {
+	# Procedure called to update MEM_INIT when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } {
+	# Procedure called to validate MEM_INIT
+	return true
+}
+
+proc update_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } {
+	# Procedure called to update MEM_WIDTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } {
+	# Procedure called to validate MEM_WIDTH
+	return true
+}
+
+proc update_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } {
+	# Procedure called to update NSTREAMS when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } {
+	# Procedure called to validate NSTREAMS
+	return true
+}
+
+proc update_PARAM_VALUE.RAM_STYLE { PARAM_VALUE.RAM_STYLE } {
+	# Procedure called to update RAM_STYLE when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.RAM_STYLE { PARAM_VALUE.RAM_STYLE } {
+	# Procedure called to validate RAM_STYLE
+	return true
+}
+
+proc update_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } {
+	# Procedure called to update STRM0_DEPTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } {
+	# Procedure called to validate STRM0_DEPTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } {
+	# Procedure called to update STRM0_OFFSET when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } {
+	# Procedure called to validate STRM0_OFFSET
+	return true
+}
+
+proc update_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } {
+	# Procedure called to update STRM0_WIDTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } {
+	# Procedure called to validate STRM0_WIDTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } {
+	# Procedure called to update STRM1_DEPTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } {
+	# Procedure called to validate STRM1_DEPTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } {
+	# Procedure called to update STRM1_OFFSET when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } {
+	# Procedure called to validate STRM1_OFFSET
+	return true
+}
+
+proc update_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } {
+	# Procedure called to update STRM1_WIDTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } {
+	# Procedure called to validate STRM1_WIDTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } {
+	# Procedure called to update STRM2_DEPTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } {
+	# Procedure called to validate STRM2_DEPTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } {
+	# Procedure called to update STRM2_OFFSET when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } {
+	# Procedure called to validate STRM2_OFFSET
+	return true
+}
+
+proc update_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } {
+	# Procedure called to update STRM2_WIDTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } {
+	# Procedure called to validate STRM2_WIDTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } {
+	# Procedure called to update STRM3_DEPTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } {
+	# Procedure called to validate STRM3_DEPTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } {
+	# Procedure called to update STRM3_OFFSET when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } {
+	# Procedure called to validate STRM3_OFFSET
+	return true
+}
+
+proc update_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } {
+	# Procedure called to update STRM3_WIDTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } {
+	# Procedure called to validate STRM3_WIDTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } {
+	# Procedure called to update STRM4_DEPTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } {
+	# Procedure called to validate STRM4_DEPTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } {
+	# Procedure called to update STRM4_OFFSET when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } {
+	# Procedure called to validate STRM4_OFFSET
+	return true
+}
+
+proc update_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } {
+	# Procedure called to update STRM4_WIDTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } {
+	# Procedure called to validate STRM4_WIDTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } {
+	# Procedure called to update STRM5_DEPTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } {
+	# Procedure called to validate STRM5_DEPTH
+	return true
+}
+
+proc update_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } {
+	# Procedure called to update STRM5_OFFSET when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } {
+	# Procedure called to validate STRM5_OFFSET
+	return true
+}
+
+proc update_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } {
+	# Procedure called to update STRM5_WIDTH when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } {
+	# Procedure called to validate STRM5_WIDTH
+	return true
+}
+
+
+proc update_MODELPARAM_VALUE.CONFIG_EN { MODELPARAM_VALUE.CONFIG_EN PARAM_VALUE.CONFIG_EN } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.CONFIG_EN}] ${MODELPARAM_VALUE.CONFIG_EN}
+}
+
+proc update_MODELPARAM_VALUE.NSTREAMS { MODELPARAM_VALUE.NSTREAMS PARAM_VALUE.NSTREAMS } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.NSTREAMS}] ${MODELPARAM_VALUE.NSTREAMS}
+}
+
+proc update_MODELPARAM_VALUE.MEM_DEPTH { MODELPARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_DEPTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.MEM_DEPTH}] ${MODELPARAM_VALUE.MEM_DEPTH}
+}
+
+proc update_MODELPARAM_VALUE.MEM_WIDTH { MODELPARAM_VALUE.MEM_WIDTH PARAM_VALUE.MEM_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.MEM_WIDTH}] ${MODELPARAM_VALUE.MEM_WIDTH}
+}
+
+proc update_MODELPARAM_VALUE.MEM_INIT { MODELPARAM_VALUE.MEM_INIT PARAM_VALUE.MEM_INIT } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.MEM_INIT}] ${MODELPARAM_VALUE.MEM_INIT}
+}
+
+proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE.RAM_STYLE } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.RAM_STYLE}] ${MODELPARAM_VALUE.RAM_STYLE}
+}
+
+proc update_MODELPARAM_VALUE.STRM0_WIDTH { MODELPARAM_VALUE.STRM0_WIDTH PARAM_VALUE.STRM0_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM0_WIDTH}] ${MODELPARAM_VALUE.STRM0_WIDTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM1_WIDTH { MODELPARAM_VALUE.STRM1_WIDTH PARAM_VALUE.STRM1_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM1_WIDTH}] ${MODELPARAM_VALUE.STRM1_WIDTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM2_WIDTH { MODELPARAM_VALUE.STRM2_WIDTH PARAM_VALUE.STRM2_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM2_WIDTH}] ${MODELPARAM_VALUE.STRM2_WIDTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM3_WIDTH { MODELPARAM_VALUE.STRM3_WIDTH PARAM_VALUE.STRM3_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM3_WIDTH}] ${MODELPARAM_VALUE.STRM3_WIDTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM4_WIDTH { MODELPARAM_VALUE.STRM4_WIDTH PARAM_VALUE.STRM4_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM4_WIDTH}] ${MODELPARAM_VALUE.STRM4_WIDTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM5_WIDTH { MODELPARAM_VALUE.STRM5_WIDTH PARAM_VALUE.STRM5_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM5_WIDTH}] ${MODELPARAM_VALUE.STRM5_WIDTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM0_DEPTH { MODELPARAM_VALUE.STRM0_DEPTH PARAM_VALUE.STRM0_DEPTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM0_DEPTH}] ${MODELPARAM_VALUE.STRM0_DEPTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM1_DEPTH { MODELPARAM_VALUE.STRM1_DEPTH PARAM_VALUE.STRM1_DEPTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM1_DEPTH}] ${MODELPARAM_VALUE.STRM1_DEPTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM2_DEPTH { MODELPARAM_VALUE.STRM2_DEPTH PARAM_VALUE.STRM2_DEPTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM2_DEPTH}] ${MODELPARAM_VALUE.STRM2_DEPTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM3_DEPTH { MODELPARAM_VALUE.STRM3_DEPTH PARAM_VALUE.STRM3_DEPTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM3_DEPTH}] ${MODELPARAM_VALUE.STRM3_DEPTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM4_DEPTH { MODELPARAM_VALUE.STRM4_DEPTH PARAM_VALUE.STRM4_DEPTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM4_DEPTH}] ${MODELPARAM_VALUE.STRM4_DEPTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM5_DEPTH { MODELPARAM_VALUE.STRM5_DEPTH PARAM_VALUE.STRM5_DEPTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM5_DEPTH}] ${MODELPARAM_VALUE.STRM5_DEPTH}
+}
+
+proc update_MODELPARAM_VALUE.STRM0_OFFSET { MODELPARAM_VALUE.STRM0_OFFSET PARAM_VALUE.STRM0_OFFSET } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM0_OFFSET}] ${MODELPARAM_VALUE.STRM0_OFFSET}
+}
+
+proc update_MODELPARAM_VALUE.STRM1_OFFSET { MODELPARAM_VALUE.STRM1_OFFSET PARAM_VALUE.STRM1_OFFSET } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM1_OFFSET}] ${MODELPARAM_VALUE.STRM1_OFFSET}
+}
+
+proc update_MODELPARAM_VALUE.STRM2_OFFSET { MODELPARAM_VALUE.STRM2_OFFSET PARAM_VALUE.STRM2_OFFSET } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM2_OFFSET}] ${MODELPARAM_VALUE.STRM2_OFFSET}
+}
+
+proc update_MODELPARAM_VALUE.STRM3_OFFSET { MODELPARAM_VALUE.STRM3_OFFSET PARAM_VALUE.STRM3_OFFSET } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM3_OFFSET}] ${MODELPARAM_VALUE.STRM3_OFFSET}
+}
+
+proc update_MODELPARAM_VALUE.STRM4_OFFSET { MODELPARAM_VALUE.STRM4_OFFSET PARAM_VALUE.STRM4_OFFSET } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM4_OFFSET}] ${MODELPARAM_VALUE.STRM4_OFFSET}
+}
+
+proc update_MODELPARAM_VALUE.STRM5_OFFSET { MODELPARAM_VALUE.STRM5_OFFSET PARAM_VALUE.STRM5_OFFSET } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.STRM5_OFFSET}] ${MODELPARAM_VALUE.STRM5_OFFSET}
+}
+
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 65c898a8c453420ed96ca22715ef2595c5840288..7de6cce936ee54d58d9a526e926ff79dcd35b90d 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -308,6 +308,12 @@ class HLSCustomOp(CustomOp):
         f.close()
         self.code_gen_dict.clear()
 
+    def code_generation_ipi(self):
+        """Build the TCL command list that instantiates this node in Vivado IPI."""
+        ip_vlnv = self.get_nodeattr("ip_vlnv")
+        cell_name = self.onnx_node.name
+        return ["create_bd_cell -type ip -vlnv %s %s" % (ip_vlnv, cell_name)]
+
     def compile_singlenode_code(self):
         """Builds the bash script for compilation using the CppBuilder from
         finn.util.basic and executes the script to produce the executable."""
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index f666becdbcceca6ca202907610595f8c0069c5a0..748880400d53f9bb6e90585234e2cfc21a366ba6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -28,6 +28,7 @@
 
 import os
 import numpy as np
+import math
 
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
@@ -51,6 +52,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             "outWidth": ("i", True, 0),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
+            # Toggle between hls or IPI implementation
+            # hls - use the hls generated IP during stitching
+            # vivado - use the AXI Infrastructure DWC
+            "impl_style": ("s", False, "hls"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -381,3 +386,96 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             exp_shape
         ), """Output
         shape doesn't match expected shape, should be same as input shape"""
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI.
+
+        The result is a list of TCL command strings and depends on impl_style:
+        - "hls": defer to the parent class implementation
+        - "vivado": create a hierarchy cell named after the node that exposes
+          the node's clock, reset and AXI stream port names and wraps an
+          axis_dwidth_converter IP cell called "dwc"
+        Any other impl_style raises an Exception.
+        """
+        impl_style = self.get_nodeattr("impl_style")
+        if impl_style == "hls":
+            return super().code_generation_ipi()
+        if impl_style != "vivado":
+            raise Exception(
+                "DWC implementation style %s not supported, please use hls or vivado"
+                % impl_style
+            )
+
+        node_name = self.onnx_node.name
+        # query the interface names once instead of once per port
+        intf_names = self.get_verilog_top_module_intf_names()
+        clk_name = intf_names["clk"][0]
+        rst_name = intf_names["rst"][0]
+        dout_name = intf_names["m_axis"][0]
+        din_name = intf_names["s_axis"][0]
+
+        # M_TDATA_NUM_BYTES is set to the output stream width / 8, rounded up
+        out_bytes = np.ceil(self.get_outstream_width() / 8)
+        return [
+            # create a hierarchy for this layer, with the same port names
+            "create_bd_cell -type hier %s" % node_name,
+            "create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name),
+            "create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name),
+            "create_bd_intf_pin -mode Master "
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+            % (node_name, dout_name),
+            "create_bd_intf_pin -mode Slave "
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+            % (node_name, din_name),
+
+            # instantiate and configure DWC
+            "create_bd_cell -type ip "
+            "-vlnv xilinx.com:ip:axis_dwidth_converter:1.1 /%s/dwc" % node_name,
+            "set_property -dict "
+            "[list CONFIG.S_TDATA_NUM_BYTES.VALUE_SRC PROPAGATED] "
+            "[get_bd_cells /%s/dwc]" % node_name,
+            "set_property -dict "
+            "[list CONFIG.M_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]"
+            % (out_bytes, node_name),
+
+            # wire the DWC streams, reset and clock to the hierarchy pins
+            "connect_bd_intf_net [get_bd_intf_pins %s/dwc/M_AXIS] "
+            "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name),
+            "connect_bd_intf_net [get_bd_intf_pins %s/dwc/S_AXIS] "
+            "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name),
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aresetn]"
+            % (node_name, rst_name, node_name),
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aclk]"
+            % (node_name, clk_name, node_name),
+        ]
+
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs.
+
+        Returns an int: the sum of cnt_luts (log2 of the width ratios) and
+        cset_luts (LUTs counted as tied into the DWC control sets)."""
+        inw = self.get_instream_width()
+        outw = self.get_outstream_width()
+
+        # sometimes widths aren't directly divisible
+        # this requires going up from input width to least common multiple
+        # then down to output width
+        intw = (inw * outw) // math.gcd(inw, outw)
+
+        # we assume a shift-based implementation
+        # even if we don't use LUTs explicitly, we make some unavailable
+        # to other logic because they're tied into the DWC control sets
+        cnt_luts = 0
+        cset_luts = 0
+
+        # math.log2 is exact for power-of-two ratios, whereas math.log(x, 2)
+        # can land just above an integer and push ceil() one too high
+        if inw != intw:
+            cnt_luts += abs(math.ceil(math.log2(inw / intw)))
+            cset_luts += intw
+        if intw != outw:
+            cnt_luts += abs(math.ceil(math.log2(intw / outw)))
+            cset_luts += outw
+
+        return int(cnt_luts + cset_luts)
+
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 181e04f7142053708cc5b2338a8078f6c9fc8303..75be2bec869959cdd8fc227677f80a5e40c2a56e 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -28,9 +28,6 @@
 
 import math
 import os
-import subprocess
-from shutil import copy
-
 import numpy as np
 
 from onnx import TensorProto, helper
@@ -103,23 +100,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_verilog_top_module_name(self):
-        "Return the Verilog top module name for this node."
-
-        node = self.onnx_node
-        # set top name depending on mem_mode
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const" or mem_mode == "external":
-            prefixed_top_name = "%s_%s" % (node.name, node.name)
-        elif mem_mode == "decoupled":
-            prefixed_top_name = "%s_memstream" % (node.name)
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-        return prefixed_top_name
-
     def calc_wmem(self):
         """Calculates and returns WMEM."""
         mw = self.get_nodeattr("MW")
@@ -624,24 +604,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
                     weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
                 )
-                weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
-                factor = math.ceil(weight_stream_len / 1024)
-                # add zeroes to pad out file to 1024 entries
+                # write the full weight stream to a single memblock file
                 weight_stream = weight_tensor_pe_flipped.flatten()
-                pad_amt = (factor * 1024) - weight_stream_len
-                weight_stream = np.pad(
-                    weight_stream, (0, pad_amt), mode="constant", constant_values="0"
-                )
                 weight_stream = weight_stream.copy()
-                i = 0
-                j = 0
-                for val in weight_stream:
-                    if i == 1024:
-                        i = 0
-                        j += 1
-                    with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
+                with open("{}/memblock_0.dat".format(code_gen_dir), "a+") as f:
+                    for val in weight_stream:
                         f.write(val + "\n")
-                    i += 1
         else:
             raise Exception(
                 """Please set mem_mode to "const", "decoupled", or "external",
@@ -765,7 +733,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            if mem_mode == "external":
+            if mem_mode == "external" or mem_mode == "decoupled":
                 wnbits = self.get_weightstream_width()
                 export_wdt = self.get_weight_datatype()
                 # we have converted bipolar weights to binary for export,
@@ -775,8 +743,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 wei = npy_to_rtlsim_input(
                     "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
                 )
+                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
                 io_dict = {
-                    "inputs": {"in0": inp, "weights": wei},
+                    "inputs": {"in0": inp, "weights": wei*num_w_reps},
                     "outputs": {"out": []},
                 }
                 self.rtlsim_multi_io(sim, io_dict)
@@ -1068,114 +1037,96 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 )
             )
 
-    def code_generation_ipgen(self, model, fpgapart, clk):
-        # generate code for all mem_mode of MVAU/FCLayer unit
-        super().code_generation_ipgen(model, fpgapart, clk)
-
-        # if mem_mode = "decoupled" generate code for verilog wrapper
+    def code_generation_ipi(self):
+        cmd = []
+        # add streamer if needed
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "decoupled":
-            # empty code gen dictionary for new entries
-            self.code_gen_dict.clear()
-            self.code_gen_dict["$TOPNAME$"] = [
-                "{}_memstream".format(self.onnx_node.name)
-            ]
-            self.code_gen_dict["$LAYER_NAME$"] = [
-                "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
-            ]
-            # make instream width a multiple of 8 for AXI stream interface
-            in_width = self.get_instream_width_padded()
-            self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
-            self.code_gen_dict["$OUT_RANGE$"] = [
-                "[{}:0]".format(self.get_outstream_width_padded() - 1)
-            ]
-            # make weight stream width a multiple of 8 for AXI stream interface
-            weight_width = self.get_weightstream_width_padded()
-            self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
-            self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
-            self.code_gen_dict["$WSTREAM_DEPTH$"] = [str(self.calc_wmem())]
-            self.code_gen_dict["$MEM_DEPTH$"] = [
-                str(roundup_to_integer_multiple(self.calc_wmem(), 1024))
-            ]
-            self.code_gen_dict["$RAM_STYLE$"] = [self.get_nodeattr("ram_style")]
-
-            template = self.decoupled_wrapper
-
-            for key in self.code_gen_dict:
-                # transform list into long string separated by '\n'
-                code_gen_line = "\n".join(self.code_gen_dict[key])
-                template = template.replace(key, code_gen_line)
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            f = open(
-                os.path.join(
-                    code_gen_dir, "{}_memstream.v".format(self.onnx_node.name)
-                ),
-                "w",
+            node_name = self.onnx_node.name
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
             )
-            f.write(template)
-            f.close()
-            self.code_gen_dict.clear()
-
-    def ipgen_singlenode_code(self):
-        # generate ip block of MVAU/FCLayer unit for all mem modes
-        super().ipgen_singlenode_code()
-
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
-            # copy necessary verilog and .dat files
-            # into verilog folder in code generation folder
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            verilog_folder = "{}/project_{}/sol1/impl/verilog/".format(
-                code_gen_dir, self.onnx_node.name
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
-            # copy memstream components from finn-rtllib
-            memstream_dir = "/workspace/finn/finn-rtllib/memstream/hdl/"
-            for file in os.listdir(memstream_dir):
-                if file.endswith(".v"):
-                    verilog_file = os.path.join(memstream_dir, file)
-                    copy(verilog_file, verilog_folder)
-            # copy .dat files of weights
-            for file in os.listdir(code_gen_dir):
-                if file.endswith(".dat"):
-                    dat_file = os.path.join(code_gen_dir, file)
-                    copy(dat_file, verilog_folder)
-            # copy verilog wrapper
-            verilog_wrapper = "{}/{}_memstream.v".format(
-                code_gen_dir, self.onnx_node.name
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
             )
-            copy(verilog_wrapper, verilog_folder)
-            # prepare the IP packaging tcl template
-            template = templates.ip_package_tcl
-            self.code_gen_dict["$TOPNAME$"] = [
-                "{}_memstream".format(self.onnx_node.name)
-            ]
-            self.code_gen_dict["$VERILOG_DIR$"] = [verilog_folder]
-            for key in self.code_gen_dict:
-                # transform list into long string separated by '\n'
-                code_gen_line = "\n".join(self.code_gen_dict[key])
-                template = template.replace(key, code_gen_line)
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            f = open(os.path.join(verilog_folder, "package_ip.tcl"), "w")
-            f.write(template)
-            f.close()
-            # create a shell script and call Vivado to invoke the IP pkg script
-            make_project_sh = verilog_folder + "/make_ip.sh"
-            working_dir = os.environ["PWD"]
-            with open(make_project_sh, "w") as f:
-                f.write("#!/bin/bash \n")
-                f.write("cd {}\n".format(verilog_folder))
-                f.write("vivado -mode batch -source package_ip.tcl\n")
-                f.write("cd {}\n".format(working_dir))
-            bash_command = ["bash", make_project_sh]
-            process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-            process_compile.communicate()
-            # re-set ip_path to point to the new packaged IP
-            self.set_nodeattr("ip_path", verilog_folder)
-            vlnv = "xilinx.com:hls:%s:1.0" % (
-                "{}_memstream".format(self.onnx_node.name)
+            # instantiate a streamer and connect it to the HLS IP
+            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_inst = node_name + "_wstrm"
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
+            )
+            cmd.append(
+                "set_property -dict [list "
+                "CONFIG.NSTREAMS {1} "
+                "CONFIG.MEM_DEPTH {%d} "
+                "CONFIG.MEM_WIDTH {%d} "
+                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.RAM_STYLE {%s} "
+                "CONFIG.STRM0_DEPTH {%d} "
+                "CONFIG.STRM0_WIDTH {%d} "
+                "CONFIG.STRM0_OFFSET {0} "
+                "] [get_bd_cells /%s/%s]"
+                % (
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("ram_style"),
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    node_name,
+                    strm_inst,
+                )
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+                "[get_bd_intf_pins %s/%s/weights_V_V]"
+                % (node_name, strm_inst, node_name, node_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                % (node_name, rst_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                % (node_name, clk_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, rst_name, node_name, node_name, rst_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk_name, node_name, node_name, clk_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, din_name, node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, dout_name, node_name, node_name, dout_name)
             )
-            self.set_nodeattr("ip_vlnv", vlnv)
-            self.code_gen_dict.clear()
+            cmd.append("save_bd_design")
+        return cmd
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 1f734b548f923341687843c538d1887fcc069bee..e2f96395ad74255ad67549255608cd52737e97d9 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -29,6 +29,7 @@ import os
 import numpy as np
 from shutil import copy
 import subprocess
+import math
 
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
@@ -51,6 +52,16 @@ class StreamingFIFO(HLSCustomOp):
             "folded_shape": ("ints", True, []),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
+            # Toggle between hls or IPI implementation
+            # rtl - use the hls generated IP during stitching
+            # vivado - use the AXI Infrastructure FIFO
+            "impl_style": ("s", False, "rtl"),
+            # FPGA resource type for FIFOs when impl_style is vivado
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM (on UltraScale+)
+            "ram_style": ("s", False, "auto"),
         }
         my_attrs.update(super().get_nodeattr_types())
 
@@ -306,3 +317,137 @@ class StreamingFIFO(HLSCustomOp):
 
     def pragmas(self):
         pass
+
+    def code_generation_ipi(self):
+        impl_style = self.get_nodeattr("impl_style")
+        if impl_style == "rtl":
+            return super().code_generation_ipi()
+        elif impl_style == "vivado":
+            cmd = []
+            node_name = self.onnx_node.name
+            depth = self.get_nodeattr("depth")
+            ram_style = self.get_nodeattr("ram_style")
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate and configure the AXI stream data FIFO
+            cmd.append(
+                "create_bd_cell -type ip "
+                "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name
+            )
+            cmd.append(
+                "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] "
+                "[get_bd_cells /%s/fifo]" % (depth, node_name)
+            )
+            cmd.append(
+                "set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] "
+                "[get_bd_cells /%s/fifo]" % (ram_style, node_name)
+            )
+            cmd.append(
+                "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] "
+                "[get_bd_cells /%s/fifo]"
+                % (np.ceil(self.get_outstream_width() / 8), node_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] "
+                "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] "
+                "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] "
+                "[get_bd_pins %s/fifo/s_axis_aresetn]"
+                % (node_name, rst_name, node_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] "
+                "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name)
+            )
+            return cmd
+        else:
+            raise Exception(
+                "FIFO implementation style %s not supported, please use rtl or vivado"
+                % impl_style
+            )
+
+    def bram_estimation(self):
+        """Calculates resource estimation for BRAM"""
+        impl = self.get_nodeattr("impl_style")
+        ram_type = self.get_nodeattr("ram_style")
+        depth = self.get_nodeattr("depth")
+        W = self.get_instream_width()
+
+        if impl == "rtl" or (impl == "vivado" and ram_type != "block"):
+            # Non-BRAM based implementation
+            return 0
+
+        if W == 1:
+            return math.ceil(depth / 16384)
+        elif W == 2:
+            return math.ceil(depth / 8192)
+        elif W <= 4:
+            return (math.ceil(depth / 4096)) * (math.ceil(W / 4))
+        elif W <= 9:
+            return (math.ceil(depth / 2048)) * (math.ceil(W / 9))
+        elif W <= 18 or depth > 512:
+            return (math.ceil(depth / 1024)) * (math.ceil(W / 18))
+        else:
+            return (math.ceil(depth / 512)) * (math.ceil(W / 36))
+
+    def uram_estimation(self):
+        """Calculates resource estimation for URAM"""
+
+        impl = self.get_nodeattr("impl_style")
+        ram_type = self.get_nodeattr("ram_style")
+        depth = self.get_nodeattr("depth")
+        W = self.get_instream_width()
+
+        if impl == "rtl" or (impl == "vivado" and ram_type != "ultra"):
+            # Non-URAM based implementation
+            return 0
+        else:
+            return (math.ceil(depth / 4096)) * (math.ceil(W / 72))
+
+
+    def bram_efficiency_estimation(self):
+        depth = self.get_nodeattr("depth")
+        W = self.get_instream_width()
+        bram16_est = self.bram_estimation()
+        if bram16_est == 0:
+            return 1
+        wbits = W * depth
+        bram16_est_capacity = bram16_est * 36 * 512
+        return wbits / bram16_est_capacity
+
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs"""
+        impl = self.get_nodeattr("impl_style")
+        ram_type = self.get_nodeattr("ram_style")
+        depth = self.get_nodeattr("depth")
+        W = self.get_instream_width()
+
+        address_luts = 2 * math.ceil(math.log(depth, 2))
+
+        if impl == "rtl" or (impl == "vivado" and ram_type == "distributed"):
+            ram_luts = (math.ceil(depth / 32)) * (math.ceil(W / 2))
+        else:
+            ram_luts = 0
+
+        return int(address_luts + ram_luts)
+
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 319731df70d5bd1cb80d42932f08acdcec80c074..67cce8675681be47036ffaf3a3428b8c43284215 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -146,39 +146,6 @@ wire m_axis_0_tready;
 wire m_axis_0_tvalid;
 wire $WEIGHT_RANGE$ m_axis_0_tdata;
 
-wire m_axis_0_tready_q;
-wire m_axis_0_tvalid_q;
-wire $WEIGHT_RANGE$ m_axis_0_tdata_q;
-
-wire m_axis_0_tready_q2;
-wire m_axis_0_tvalid_q2;
-wire $WEIGHT_RANGE$ m_axis_0_tdata_q2;
-
-reg m_axis_1_afull = 0;
-reg m_axis_1_tready = 1;
-wire m_axis_1_tvalid;
-wire $WEIGHT_RANGE$ m_axis_1_tdata;
-
-reg m_axis_2_afull = 0;
-reg m_axis_2_tready = 1;
-wire m_axis_2_tvalid;
-wire $WEIGHT_RANGE$ m_axis_2_tdata;
-
-reg m_axis_3_afull = 0;
-reg m_axis_3_tready = 1;
-wire m_axis_3_tvalid;
-wire $WEIGHT_RANGE$ m_axis_3_tdata;
-
-reg m_axis_4_afull = 0;
-reg m_axis_4_tready = 1;
-wire m_axis_4_tvalid;
-wire $WEIGHT_RANGE$ m_axis_4_tdata;
-
-reg m_axis_5_afull = 0;
-reg m_axis_5_tready = 1;
-wire m_axis_5_tvalid;
-wire $WEIGHT_RANGE$ m_axis_5_tdata;
-
 //memstream component
 
 memstream
@@ -194,27 +161,12 @@ memstream
 
 //widths per stream
 .STRM0_WIDTH($WEIGHT_WIDTH$),
-.STRM1_WIDTH($WEIGHT_WIDTH$),
-.STRM2_WIDTH($WEIGHT_WIDTH$),
-.STRM3_WIDTH($WEIGHT_WIDTH$),
-.STRM4_WIDTH($WEIGHT_WIDTH$),
-.STRM5_WIDTH($WEIGHT_WIDTH$),
 
 //depths per stream
 .STRM0_DEPTH($WSTREAM_DEPTH$),
-.STRM1_DEPTH(1),
-.STRM2_DEPTH(1),
-.STRM3_DEPTH(1),
-.STRM4_DEPTH(1),
-.STRM5_DEPTH(1),
 
 //offsets for each stream
-.STRM0_OFFSET(0),
-.STRM1_OFFSET(0),
-.STRM2_OFFSET(0),
-.STRM3_OFFSET(0),
-.STRM4_OFFSET(0),
-.STRM5_OFFSET(0)
+.STRM0_OFFSET(0)
 )
 mem
 (
@@ -232,55 +184,12 @@ mem
 .m_axis_0_afull(m_axis_0_afull),
 .m_axis_0_tready(m_axis_0_tready),
 .m_axis_0_tvalid(m_axis_0_tvalid),
-.m_axis_0_tdata(m_axis_0_tdata),
-
-.m_axis_1_afull(m_axis_1_afull),
-.m_axis_1_tready(m_axis_1_tready),
-.m_axis_1_tvalid(m_axis_1_tvalid),
-.m_axis_1_tdata(m_axis_1_tdata),
-
-.m_axis_2_afull(m_axis_2_afull),
-.m_axis_2_tready(m_axis_2_tready),
-.m_axis_2_tvalid(m_axis_2_tvalid),
-.m_axis_2_tdata(m_axis_2_tdata),
-
-.m_axis_3_afull(m_axis_3_afull),
-.m_axis_3_tready(m_axis_3_tready),
-.m_axis_3_tvalid(m_axis_3_tvalid),
-.m_axis_3_tdata(m_axis_3_tdata),
-
-.m_axis_4_afull(m_axis_4_afull),
-.m_axis_4_tready(m_axis_4_tready),
-.m_axis_4_tvalid(m_axis_4_tvalid),
-.m_axis_4_tdata(m_axis_4_tdata),
-
-.m_axis_5_afull(m_axis_5_afull),
-.m_axis_5_tready(m_axis_5_tready),
-.m_axis_5_tvalid(m_axis_5_tvalid),
-.m_axis_5_tdata(m_axis_5_tdata)
+.m_axis_0_tdata(m_axis_0_tdata)
 
 
 );
 
 
-Q_srl #(
-.depth(32),
-.width($WEIGHT_WIDTH$)
-)
-$LAYER_NAME$_w_fifo_1
-(
- .clock(ap_clk),
- .reset(!ap_rst_n),
- .i_d(m_axis_0_tdata),
- .i_v(m_axis_0_tvalid),
- .i_r(m_axis_0_tready),
- .o_d(m_axis_0_tdata_q),
- .o_v(m_axis_0_tvalid_q),
- .o_r(m_axis_0_tready_q),
- .count(fifo_0_count)
-);
-
-
 //MVA_Stream_Unit
 
 $LAYER_NAME$
@@ -291,17 +200,14 @@ MVA_Stream_U
 .in0_V_V_TDATA(in0_V_V_TDATA),		//$IN_RANGE$ input
 .in0_V_V_TVALID(in0_V_V_TVALID),  	//input
 .in0_V_V_TREADY(in0_V_V_TREADY),	//output
-.weights_V_V_TDATA(m_axis_0_tdata_q),	//$WEIGHT_RANGE$ input
-.weights_V_V_TVALID(m_axis_0_tvalid_q),	//input
-.weights_V_V_TREADY(m_axis_0_tready_q),	//output
+.weights_V_V_TDATA(m_axis_0_tdata),	//$WEIGHT_RANGE$ input
+.weights_V_V_TVALID(m_axis_0_tvalid),	//input
+.weights_V_V_TREADY(m_axis_0_tready),	//output
 .out_V_V_TDATA(out_V_V_TDATA),		//$OUT_RANGE$ output
 .out_V_V_TVALID(out_V_V_TVALID),	//output
 .out_V_V_TREADY(out_V_V_TREADY)		//input
 );
 
-// programmable full threshold at 16 elements
-assign m_axis_0_afull = (fifo_0_count > 16);
-
 endmodule
 """
 
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 90b4b6c47e6e353c1b606d6918eb271e9c0619c5..aa5de589a75d81ddfa7924a123630d721368fec0 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -182,6 +182,8 @@ class CreateStitchedIP(Transformation):
 
     def apply(self, model):
         ip_dirs = ["list"]
+        # add RTL streamer IP
+        ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
             assert node.domain == "finn", 'Node domain is not set to "finn"'
@@ -196,10 +198,7 @@ class CreateStitchedIP(Transformation):
             ip_dir_value = node_inst.get_nodeattr("ip_path")
             assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
             ip_dirs += [ip_dir_value]
-            vlnv = node_inst.get_nodeattr("ip_vlnv")
-            inst_name = node.name
-            create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name)
-            self.create_cmds += [create_cmd]
+            self.create_cmds += node_inst.code_generation_ipi()
             my_producer = model.find_producer(node.input[0])
             self.connect_clk_rst(node)
             self.connect_axi(node)
@@ -223,6 +222,7 @@ class CreateStitchedIP(Transformation):
                 #     find index of producer output connected to our target input
                 #     get names of hdl interfaces for input and producer output
                 #     issue a TCL directive to connect input to output
+                #     if FC layer with mode "decoupled", add a streamer on input 1
                 for i in range(len(node.input)):
                     producer = model.find_producer(node.input[i])
                     if producer is None:
@@ -408,7 +408,7 @@ class CreateStitchedIP(Transformation):
         tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv)
         # export list of used Verilog files (for rtlsim later on)
-        tcl.append("set all_v_files [get_files -filter {FILE_TYPE == Verilog}]")
+        tcl.append("set all_v_files [get_files -filter {FILE_TYPE == Verilog && USED_IN_SYNTHESIS == 1} ]")
         v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir
         tcl.append("set fp [open %s w]" % v_file_list)
         # write each verilog filename to all_verilog_srcs.txt