diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index b9c71df58b1869cec7a49d2e5f4dd19d09949097..3145d8776025f71aa32da71eb5d31ef472b885a7 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -48,7 +48,7 @@ RUN rm xrtdeps.sh
 
 # cloning dependency repos
 # finn-base
-RUN git clone https://github.com/maltanar/finn-base.git /workspace/finn-base
+RUN git clone https://github.com/Xilinx/finn-base.git /workspace/finn-base
 # Brevitas
 RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index fe7f45b461f517ae736b11ab7871b56ec8f9061a..c3017f172c6f357a4b59a3c4129c21b6a801fd49 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -72,7 +72,7 @@ USER $UNAME
 
 # cloning dependency repos (as user)
 # finn-base
-RUN git clone https://github.com/maltanar/finn-base.git /workspace/finn-base
+RUN git clone https://github.com/Xilinx/finn-base.git /workspace/finn-base
 # Brevitas
 RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index e13f0d0211ce4c140c8ccba1a4d4832cf1fc2a17..0e8988d5392810b08ed647fc0466699425430e12 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,7 +12,7 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=c4d8885e38a55f9bb7424bde76d35a3e000c5a7e
+FINN_BASE_COMMIT=951d5e9dd25b7f38731fa539959667a86e7091b2
 BREVITAS_COMMIT=6ffefa8dbf37fdb0f44c994f34604c29fadb16b0
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 84b8ed23b1f232267c1bc02291aa990ccc56917d..418b665e1317c23c22daf2e3fd0ebcd40a1e2151 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -105,6 +105,7 @@ These are summarized below:
 * ``PYNQ_IP`` and ``PYNQ_PORT`` (or ``ALVEO_IP`` and ``ALVEO_PORT``) specify ip address and port number to access the PYNQ board / Alveo target
 * ``PYNQ_USERNAME`` and ``PYNQ_PASSWORD`` (or ``ALVEO_USERNAME`` and ``ALVEO_PASSWORD``) specify the PYNQ board / Alveo host access credentials for the test suite. For PYNQ, password is always needed to run as sudo. For Alveo, you can leave the password empty and place your ssh private key in the ``finn/ssh_keys`` folder to use keypair authentication.
 * ``PYNQ_TARGET_DIR`` (or ``ALVEO_TARGET_DIR``) specifies the target dir on the PYNQ board / Alveo host for the test suite
+* (optional) ``FINN_HOST_BUILD_DIR`` specifies which directory on the host will be used as the build directory. If unset, it defaults to ``/tmp/finn_dev_<username>``.
 
 Supported Hardware
 ===================
diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml
index 14af1610385a735ad987fd1055ff9f90f76e4a23..3d6767abfc11eb114ddb084f1f7275f7a93d0607 100644
--- a/finn-rtllib/memstream/component.xml
+++ b/finn-rtllib/memstream/component.xml
@@ -68,6 +68,13 @@
           </spirit:physicalPort>
         </spirit:portMap>
       </spirit:portMaps>
+      <spirit:vendorExtensions>
+        <xilinx:busInterfaceInfo>
+          <xilinx:enablement>
+            <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="BUSIF_ENABLEMENT.m_axis_1" xilinx:dependency="$NSTREAMS>=2">true</xilinx:isEnabled>
+          </xilinx:enablement>
+        </xilinx:busInterfaceInfo>
+      </spirit:vendorExtensions>
     </spirit:busInterface>
     <spirit:busInterface>
       <spirit:name>m_axis_2</spirit:name>
@@ -100,6 +107,13 @@
           </spirit:physicalPort>
         </spirit:portMap>
       </spirit:portMaps>
+      <spirit:vendorExtensions>
+        <xilinx:busInterfaceInfo>
+          <xilinx:enablement>
+            <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="BUSIF_ENABLEMENT.m_axis_2" xilinx:dependency="$NSTREAMS>=3">true</xilinx:isEnabled>
+          </xilinx:enablement>
+        </xilinx:busInterfaceInfo>
+      </spirit:vendorExtensions>
     </spirit:busInterface>
     <spirit:busInterface>
       <spirit:name>m_axis_3</spirit:name>
@@ -132,6 +146,13 @@
           </spirit:physicalPort>
         </spirit:portMap>
       </spirit:portMaps>
+      <spirit:vendorExtensions>
+        <xilinx:busInterfaceInfo>
+          <xilinx:enablement>
+            <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="BUSIF_ENABLEMENT.m_axis_3" xilinx:dependency="$NSTREAMS>=4">true</xilinx:isEnabled>
+          </xilinx:enablement>
+        </xilinx:busInterfaceInfo>
+      </spirit:vendorExtensions>
     </spirit:busInterface>
     <spirit:busInterface>
       <spirit:name>m_axis_4</spirit:name>
@@ -164,6 +185,13 @@
           </spirit:physicalPort>
         </spirit:portMap>
       </spirit:portMaps>
+      <spirit:vendorExtensions>
+        <xilinx:busInterfaceInfo>
+          <xilinx:enablement>
+            <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="BUSIF_ENABLEMENT.m_axis_4" xilinx:dependency="$NSTREAMS>=5">true</xilinx:isEnabled>
+          </xilinx:enablement>
+        </xilinx:busInterfaceInfo>
+      </spirit:vendorExtensions>
     </spirit:busInterface>
     <spirit:busInterface>
       <spirit:name>m_axis_5</spirit:name>
@@ -196,6 +224,182 @@
           </spirit:physicalPort>
         </spirit:portMap>
       </spirit:portMaps>
+      <spirit:vendorExtensions>
+        <xilinx:busInterfaceInfo>
+          <xilinx:enablement>
+            <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="BUSIF_ENABLEMENT.m_axis_5" xilinx:dependency="$NSTREAMS = 6">true</xilinx:isEnabled>
+          </xilinx:enablement>
+        </xilinx:busInterfaceInfo>
+      </spirit:vendorExtensions>
+    </spirit:busInterface>
+    <spirit:busInterface>
+      <spirit:name>s_axilite</spirit:name>
+      <spirit:busType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="aximm" spirit:version="1.0"/>
+      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="aximm_rtl" spirit:version="1.0"/>
+      <spirit:slave>
+        <spirit:memoryMapRef spirit:memoryMapRef="interface_aximm"/>
+      </spirit:slave>
+      <spirit:portMaps>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>AWADDR</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>awaddr</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>AWPROT</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>awprot</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>AWVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>awvalid</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>AWREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>awready</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WDATA</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>wdata</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WSTRB</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>wstrb</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>wvalid</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>wready</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>BRESP</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>bresp</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>BVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>bvalid</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>BREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>bready</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>ARADDR</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>araddr</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>ARPROT</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>arprot</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>ARVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>arvalid</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>ARREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>arready</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RDATA</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>rdata</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RRESP</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>rresp</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>rvalid</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>rready</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+      </spirit:portMaps>
+      <spirit:vendorExtensions>
+        <xilinx:busInterfaceInfo>
+          <xilinx:enablement>
+            <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="BUSIF_ENABLEMENT.s_axilite" xilinx:dependency="$CONFIG_EN = 1">true</xilinx:isEnabled>
+          </xilinx:enablement>
+        </xilinx:busInterfaceInfo>
+      </spirit:vendorExtensions>
     </spirit:busInterface>
     <spirit:busInterface>
       <spirit:name>aresetn</spirit:name>
@@ -237,7 +441,7 @@
       <spirit:parameters>
         <spirit:parameter>
           <spirit:name>ASSOCIATED_BUSIF</spirit:name>
-          <spirit:value spirit:id="BUSIFPARAM_VALUE.ACLK.ASSOCIATED_BUSIF">m_axis_0:m_axis_1:m_axis_2:m_axis_3:m_axis_4:m_axis_5</spirit:value>
+          <spirit:value spirit:id="BUSIFPARAM_VALUE.ACLK.ASSOCIATED_BUSIF">m_axis_0:m_axis_1:m_axis_2:m_axis_3:m_axis_4:m_axis_5:s_axilite</spirit:value>
         </spirit:parameter>
         <spirit:parameter>
           <spirit:name>ASSOCIATED_RESET</spirit:name>
@@ -246,6 +450,18 @@
       </spirit:parameters>
     </spirit:busInterface>
   </spirit:busInterfaces>
+  <spirit:memoryMaps>
+    <spirit:memoryMap>
+      <spirit:name>interface_aximm</spirit:name>
+      <spirit:addressBlock>
+        <spirit:name>reg0</spirit:name>
+        <spirit:baseAddress spirit:format="bitString" spirit:resolve="user" spirit:bitStringLength="32">0</spirit:baseAddress>
+        <spirit:range spirit:format="long" spirit:resolve="dependent" spirit:dependency="pow(2,(spirit:decode(id(&apos;MODELPARAM_VALUE.AXILITE_ADDR_WIDTH&apos;)) - 1) + 1)" spirit:minimum="4096" spirit:rangeType="long">65536</spirit:range>
+        <spirit:width spirit:format="long" spirit:resolve="user">32</spirit:width>
+        <spirit:usage>register</spirit:usage>
+      </spirit:addressBlock>
+    </spirit:memoryMap>
+  </spirit:memoryMaps>
   <spirit:model>
     <spirit:views>
       <spirit:view>
@@ -260,7 +476,7 @@
         <spirit:parameters>
           <spirit:parameter>
             <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>3c30c4ac</spirit:value>
+            <spirit:value>1fc5a310</spirit:value>
           </spirit:parameter>
         </spirit:parameters>
       </spirit:view>
@@ -276,7 +492,7 @@
         <spirit:parameters>
           <spirit:parameter>
             <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>00b5320e</spirit:value>
+            <spirit:value>d02d9990</spirit:value>
           </spirit:parameter>
         </spirit:parameters>
       </spirit:view>
@@ -290,7 +506,21 @@
         <spirit:parameters>
           <spirit:parameter>
             <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>d714c73b</spirit:value>
+            <spirit:value>f960907f</spirit:value>
+          </spirit:parameter>
+        </spirit:parameters>
+      </spirit:view>
+      <spirit:view>
+        <spirit:name>xilinx_utilityxitfiles</spirit:name>
+        <spirit:displayName>Utility XIT/TTCL</spirit:displayName>
+        <spirit:envIdentifier>:vivado.xilinx.com:xit.util</spirit:envIdentifier>
+        <spirit:fileSetRef>
+          <spirit:localName>xilinx_utilityxitfiles_view_fileset</spirit:localName>
+        </spirit:fileSetRef>
+        <spirit:parameters>
+          <spirit:parameter>
+            <spirit:name>viewChecksum</spirit:name>
+            <spirit:value>d2aad2c5</spirit:value>
           </spirit:parameter>
         </spirit:parameters>
       </spirit:view>
@@ -323,11 +553,40 @@
         </spirit:wire>
       </spirit:port>
       <spirit:port>
-        <spirit:name>config_address</spirit:name>
+        <spirit:name>awready</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>awvalid</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>awaddr</spirit:name>
         <spirit:wire>
           <spirit:direction>in</spirit:direction>
           <spirit:vector>
-            <spirit:left spirit:format="long">31</spirit:left>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.AXILITE_ADDR_WIDTH&apos;)) - 1)">15</spirit:left>
             <spirit:right spirit:format="long">0</spirit:right>
           </spirit:vector>
           <spirit:wireTypeDefs>
@@ -337,12 +596,35 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
       </spirit:port>
       <spirit:port>
-        <spirit:name>config_ce</spirit:name>
+        <spirit:name>awprot</spirit:name>
         <spirit:wire>
           <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">2</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>wready</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
           <spirit:wireTypeDefs>
             <spirit:wireTypeDef>
               <spirit:typeName>std_logic</spirit:typeName>
@@ -353,7 +635,7 @@
         </spirit:wire>
       </spirit:port>
       <spirit:port>
-        <spirit:name>config_we</spirit:name>
+        <spirit:name>wvalid</spirit:name>
         <spirit:wire>
           <spirit:direction>in</spirit:direction>
           <spirit:wireTypeDefs>
@@ -363,10 +645,13 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
       </spirit:port>
       <spirit:port>
-        <spirit:name>config_d0</spirit:name>
+        <spirit:name>wdata</spirit:name>
         <spirit:wire>
           <spirit:direction>in</spirit:direction>
           <spirit:vector>
@@ -380,10 +665,194 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>wstrb</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">3</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">1</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>bready</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>bvalid</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>bresp</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">1</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>arready</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>arvalid</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>araddr</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.AXILITE_ADDR_WIDTH&apos;)) - 1)">15</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>arprot</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">2</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>rready</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>rvalid</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>rresp</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">1</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+              <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
         </spirit:wire>
       </spirit:port>
       <spirit:port>
-        <spirit:name>config_q0</spirit:name>
+        <spirit:name>rdata</spirit:name>
         <spirit:wire>
           <spirit:direction>out</spirit:direction>
           <spirit:vector>
@@ -410,7 +879,17 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
+        <spirit:vendorExtensions>
+          <xilinx:portInfo>
+            <xilinx:enablement>
+              <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="PORT_ENABLEMENT.m_axis_0_afull" xilinx:dependency="$NSTREAMS>2">true</xilinx:isEnabled>
+            </xilinx:enablement>
+          </xilinx:portInfo>
+        </spirit:vendorExtensions>
       </spirit:port>
       <spirit:port>
         <spirit:name>m_axis_0_tready</spirit:name>
@@ -446,7 +925,7 @@
         <spirit:wire>
           <spirit:direction>out</spirit:direction>
           <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.STRM0_WIDTH&apos;)) - 1)">31</spirit:left>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.STRM0_WIDTH&apos;)) + 7) / 8) * 8) - 1)">31</spirit:left>
             <spirit:right spirit:format="long">0</spirit:right>
           </spirit:vector>
           <spirit:wireTypeDefs>
@@ -469,7 +948,17 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
+        <spirit:vendorExtensions>
+          <xilinx:portInfo>
+            <xilinx:enablement>
+              <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="PORT_ENABLEMENT.m_axis_1_afull" xilinx:dependency="$NSTREAMS>2">true</xilinx:isEnabled>
+            </xilinx:enablement>
+          </xilinx:portInfo>
+        </spirit:vendorExtensions>
       </spirit:port>
       <spirit:port>
         <spirit:name>m_axis_1_tready</spirit:name>
@@ -505,7 +994,7 @@
         <spirit:wire>
           <spirit:direction>out</spirit:direction>
           <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.STRM1_WIDTH&apos;)) - 1)">31</spirit:left>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.STRM1_WIDTH&apos;)) + 7) / 8) * 8) - 1)">31</spirit:left>
             <spirit:right spirit:format="long">0</spirit:right>
           </spirit:vector>
           <spirit:wireTypeDefs>
@@ -528,7 +1017,17 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
+        <spirit:vendorExtensions>
+          <xilinx:portInfo>
+            <xilinx:enablement>
+              <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="PORT_ENABLEMENT.m_axis_2_afull" xilinx:dependency="$NSTREAMS>=3">true</xilinx:isEnabled>
+            </xilinx:enablement>
+          </xilinx:portInfo>
+        </spirit:vendorExtensions>
       </spirit:port>
       <spirit:port>
         <spirit:name>m_axis_2_tready</spirit:name>
@@ -564,7 +1063,7 @@
         <spirit:wire>
           <spirit:direction>out</spirit:direction>
           <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.STRM2_WIDTH&apos;)) - 1)">31</spirit:left>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.STRM2_WIDTH&apos;)) + 7) / 8) * 8) - 1)">31</spirit:left>
             <spirit:right spirit:format="long">0</spirit:right>
           </spirit:vector>
           <spirit:wireTypeDefs>
@@ -587,7 +1086,17 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
+        <spirit:vendorExtensions>
+          <xilinx:portInfo>
+            <xilinx:enablement>
+              <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="PORT_ENABLEMENT.m_axis_3_afull" xilinx:dependency="$NSTREAMS>=4">true</xilinx:isEnabled>
+            </xilinx:enablement>
+          </xilinx:portInfo>
+        </spirit:vendorExtensions>
       </spirit:port>
       <spirit:port>
         <spirit:name>m_axis_3_tready</spirit:name>
@@ -623,7 +1132,7 @@
         <spirit:wire>
           <spirit:direction>out</spirit:direction>
           <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.STRM3_WIDTH&apos;)) - 1)">31</spirit:left>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.STRM3_WIDTH&apos;)) + 7) / 8) * 8) - 1)">31</spirit:left>
             <spirit:right spirit:format="long">0</spirit:right>
           </spirit:vector>
           <spirit:wireTypeDefs>
@@ -646,7 +1155,17 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
+        <spirit:vendorExtensions>
+          <xilinx:portInfo>
+            <xilinx:enablement>
+              <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="PORT_ENABLEMENT.m_axis_4_afull" xilinx:dependency="$NSTREAMS>=5">true</xilinx:isEnabled>
+            </xilinx:enablement>
+          </xilinx:portInfo>
+        </spirit:vendorExtensions>
       </spirit:port>
       <spirit:port>
         <spirit:name>m_axis_4_tready</spirit:name>
@@ -682,7 +1201,7 @@
         <spirit:wire>
           <spirit:direction>out</spirit:direction>
           <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.STRM4_WIDTH&apos;)) - 1)">31</spirit:left>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.STRM4_WIDTH&apos;)) + 7) / 8) * 8) - 1)">31</spirit:left>
             <spirit:right spirit:format="long">0</spirit:right>
           </spirit:vector>
           <spirit:wireTypeDefs>
@@ -705,7 +1224,17 @@
               <spirit:viewNameRef>xilinx_anylanguagebehavioralsimulation</spirit:viewNameRef>
             </spirit:wireTypeDef>
           </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
         </spirit:wire>
+        <spirit:vendorExtensions>
+          <xilinx:portInfo>
+            <xilinx:enablement>
+              <xilinx:isEnabled xilinx:resolve="dependent" xilinx:id="PORT_ENABLEMENT.m_axis_5_afull" xilinx:dependency="$NSTREAMS = 6">true</xilinx:isEnabled>
+            </xilinx:enablement>
+          </xilinx:portInfo>
+        </spirit:vendorExtensions>
       </spirit:port>
       <spirit:port>
         <spirit:name>m_axis_5_tready</spirit:name>
@@ -741,7 +1270,7 @@
         <spirit:wire>
           <spirit:direction>out</spirit:direction>
           <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="(spirit:decode(id(&apos;MODELPARAM_VALUE.STRM5_WIDTH&apos;)) - 1)">31</spirit:left>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.STRM5_WIDTH&apos;)) + 7) / 8) * 8) - 1)">31</spirit:left>
             <spirit:right spirit:format="long">0</spirit:right>
           </spirit:vector>
           <spirit:wireTypeDefs>
@@ -758,7 +1287,7 @@
       <spirit:modelParameter xsi:type="spirit:nameValueTypeType" spirit:dataType="integer">
         <spirit:name>CONFIG_EN</spirit:name>
         <spirit:displayName>Config En</spirit:displayName>
-        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.CONFIG_EN">1</spirit:value>
+        <spirit:value spirit:format="bool" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.CONFIG_EN">true</spirit:value>
       </spirit:modelParameter>
       <spirit:modelParameter spirit:dataType="integer">
         <spirit:name>NSTREAMS</spirit:name>
@@ -875,90 +1404,103 @@
         <spirit:displayName>Strm5 Offset</spirit:displayName>
         <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.STRM5_OFFSET">11520</spirit:value>
       </spirit:modelParameter>
+      <spirit:modelParameter spirit:dataType="integer">
+        <spirit:name>AXILITE_ADDR_WIDTH</spirit:name>
+        <spirit:displayName>Axilite Addr Width</spirit:displayName>
+        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.AXILITE_ADDR_WIDTH" spirit:dependency="(2 + spirit:ceil(spirit:log(2,(spirit:decode(id(&apos;MODELPARAM_VALUE.MEM_DEPTH&apos;)) * (1 &lt;&lt; spirit:ceil(spirit:log(2,((spirit:decode(id(&apos;MODELPARAM_VALUE.MEM_WIDTH&apos;)) + 31) / 32))))))))">16</spirit:value>
+      </spirit:modelParameter>
     </spirit:modelParameters>
   </spirit:model>
   <spirit:choices>
-    <spirit:choice>
-      <spirit:name>choice_list_44c459b8</spirit:name>
-      <spirit:enumeration>auto</spirit:enumeration>
-      <spirit:enumeration>block</spirit:enumeration>
-      <spirit:enumeration>distributed</spirit:enumeration>
-    </spirit:choice>
     <spirit:choice>
       <spirit:name>choice_list_9d8b0d81</spirit:name>
       <spirit:enumeration>ACTIVE_HIGH</spirit:enumeration>
       <spirit:enumeration>ACTIVE_LOW</spirit:enumeration>
     </spirit:choice>
+    <spirit:choice>
+      <spirit:name>choice_list_e2bd1cd0</spirit:name>
+      <spirit:enumeration>auto</spirit:enumeration>
+      <spirit:enumeration>distributed</spirit:enumeration>
+      <spirit:enumeration>block</spirit:enumeration>
+      <spirit:enumeration>ultra</spirit:enumeration>
+    </spirit:choice>
   </spirit:choices>
   <spirit:fileSets>
     <spirit:fileSet>
       <spirit:name>xilinx_anylanguagesynthesis_view_fileset</spirit:name>
+      <spirit:file>
+        <spirit:name>hdl/axilite_if.v</spirit:name>
+        <spirit:fileType>verilogSource</spirit:fileType>
+      </spirit:file>
       <spirit:file>
         <spirit:name>hdl/memstream.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
       </spirit:file>
       <spirit:file>
-        <spirit:name>hdl/mux.v</spirit:name>
+        <spirit:name>hdl/memstream_multiblock.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
       </spirit:file>
       <spirit:file>
         <spirit:name>hdl/memstream_singleblock.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
-        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
-        <spirit:name>hdl/memstream_multiblock.v</spirit:name>
+        <spirit:name>hdl/mux.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
-        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
-        <spirit:name>hdl/ramb18_wf_dualport.v</spirit:name>
+        <spirit:name>hdl/ramb18_sdp.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
-        <spirit:userFileType>CHECKSUM_9425c051</spirit:userFileType>
-        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
-        <spirit:name>hdl/ramb18_sdp.v</spirit:name>
+        <spirit:name>hdl/ramb18_wf_dualport.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
-        <spirit:userFileType>CHECKSUM_9f7c64bf</spirit:userFileType>
-        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
+        <spirit:userFileType>CHECKSUM_9425c051</spirit:userFileType>
       </spirit:file>
     </spirit:fileSet>
     <spirit:fileSet>
       <spirit:name>xilinx_anylanguagebehavioralsimulation_view_fileset</spirit:name>
-      <spirit:file>
-        <spirit:name>sim/tb_memstream.v</spirit:name>
-        <spirit:fileType>verilogSource</spirit:fileType>
-      </spirit:file>
       <spirit:file>
         <spirit:name>hdl/memstream.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
         <spirit:userFileType>USED_IN_ipstatic</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
-        <spirit:name>hdl/mux.v</spirit:name>
+        <spirit:name>hdl/axilite_if.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
         <spirit:userFileType>USED_IN_ipstatic</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
         <spirit:name>hdl/memstream_singleblock.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
         <spirit:userFileType>USED_IN_ipstatic</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
-        <spirit:name>hdl/memstream_multiblock.v</spirit:name>
+        <spirit:name>hdl/mux.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
         <spirit:userFileType>USED_IN_ipstatic</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
         <spirit:name>hdl/ramb18_wf_dualport.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
         <spirit:userFileType>USED_IN_ipstatic</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
+      </spirit:file>
+      <spirit:file>
+        <spirit:name>hdl/memstream_multiblock.v</spirit:name>
+        <spirit:fileType>verilogSource</spirit:fileType>
+        <spirit:userFileType>USED_IN_ipstatic</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
       <spirit:file>
         <spirit:name>hdl/ramb18_sdp.v</spirit:name>
         <spirit:fileType>verilogSource</spirit:fileType>
         <spirit:userFileType>USED_IN_ipstatic</spirit:userFileType>
+        <spirit:logicalName>xil_defaultlib</spirit:logicalName>
       </spirit:file>
     </spirit:fileSet>
     <spirit:fileSet>
@@ -966,22 +1508,29 @@
       <spirit:file>
         <spirit:name>xgui/memstream_v1_0.tcl</spirit:name>
         <spirit:fileType>tclSource</spirit:fileType>
-        <spirit:userFileType>CHECKSUM_d714c73b</spirit:userFileType>
+        <spirit:userFileType>CHECKSUM_f960907f</spirit:userFileType>
         <spirit:userFileType>XGUI_VERSION_2</spirit:userFileType>
       </spirit:file>
     </spirit:fileSet>
+    <spirit:fileSet>
+      <spirit:name>xilinx_utilityxitfiles_view_fileset</spirit:name>
+      <spirit:file>
+        <spirit:name>gui/memstream_v1_0.gtcl</spirit:name>
+        <spirit:userFileType>GTCL</spirit:userFileType>
+      </spirit:file>
+    </spirit:fileSet>
   </spirit:fileSets>
   <spirit:description>memstream_v1_0</spirit:description>
   <spirit:parameters>
     <spirit:parameter>
       <spirit:name>CONFIG_EN</spirit:name>
       <spirit:displayName>Config En</spirit:displayName>
-      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.CONFIG_EN">1</spirit:value>
+      <spirit:value spirit:format="bool" spirit:resolve="user" spirit:id="PARAM_VALUE.CONFIG_EN">true</spirit:value>
     </spirit:parameter>
     <spirit:parameter>
       <spirit:name>NSTREAMS</spirit:name>
       <spirit:displayName>Nstreams</spirit:displayName>
-      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.NSTREAMS">6</spirit:value>
+      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.NSTREAMS" spirit:minimum="1" spirit:maximum="6" spirit:rangeType="long">6</spirit:value>
     </spirit:parameter>
     <spirit:parameter>
       <spirit:name>MEM_DEPTH</spirit:name>
@@ -998,6 +1547,11 @@
       <spirit:displayName>Mem Init</spirit:displayName>
       <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.MEM_INIT">./</spirit:value>
     </spirit:parameter>
+    <spirit:parameter>
+      <spirit:name>RAM_STYLE</spirit:name>
+      <spirit:displayName>Ram Style</spirit:displayName>
+      <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.RAM_STYLE" spirit:choiceRef="choice_list_e2bd1cd0">auto</spirit:value>
+    </spirit:parameter>
     <spirit:parameter>
       <spirit:name>STRM0_WIDTH</spirit:name>
       <spirit:displayName>Strm0 Width</spirit:displayName>
@@ -1089,39 +1643,46 @@
       <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.STRM5_OFFSET">11520</spirit:value>
     </spirit:parameter>
     <spirit:parameter>
-      <spirit:name>Component_Name</spirit:name>
-      <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.Component_Name" spirit:order="1">memstream_v1_0</spirit:value>
+      <spirit:name>AXILITE_ADDR_WIDTH</spirit:name>
+      <spirit:displayName>Axilite Addr Width</spirit:displayName>
+      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.AXILITE_ADDR_WIDTH">16</spirit:value>
+      <spirit:vendorExtensions>
+        <xilinx:parameterInfo>
+          <xilinx:enablement>
+            <xilinx:isEnabled xilinx:id="PARAM_ENABLEMENT.AXILITE_ADDR_WIDTH">false</xilinx:isEnabled>
+          </xilinx:enablement>
+        </xilinx:parameterInfo>
+      </spirit:vendorExtensions>
     </spirit:parameter>
     <spirit:parameter>
-      <spirit:name>RAM_STYLE</spirit:name>
-      <spirit:displayName>Ram Style</spirit:displayName>
-      <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.RAM_STYLE" spirit:choiceRef="choice_list_44c459b8">auto</spirit:value>
+      <spirit:name>Component_Name</spirit:name>
+      <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.Component_Name" spirit:order="1">memstream_v1_0</spirit:value>
     </spirit:parameter>
   </spirit:parameters>
   <spirit:vendorExtensions>
     <xilinx:coreExtensions>
       <xilinx:supportedFamilies>
         <xilinx:family xilinx:lifeCycle="Production">zynq</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">zynquplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family>
         <xilinx:family xilinx:lifeCycle="Production">virtexuplus</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Beta">virtexuplusHBM</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">zynquplus</xilinx:family>
       </xilinx:supportedFamilies>
       <xilinx:taxonomies>
         <xilinx:taxonomy>/UserIP</xilinx:taxonomy>
       </xilinx:taxonomies>
       <xilinx:displayName>memstream_v1_0</xilinx:displayName>
-      <xilinx:autoFamilySupportLevel>level_0</xilinx:autoFamilySupportLevel>
       <xilinx:definitionSource>package_project</xilinx:definitionSource>
-      <xilinx:coreRevision>10</xilinx:coreRevision>
-      <xilinx:coreCreationDateTime>2020-09-17T18:04:10Z</xilinx:coreCreationDateTime>
+      <xilinx:coreRevision>5</xilinx:coreRevision>
+      <xilinx:coreCreationDateTime>2020-10-09T15:31:57Z</xilinx:coreCreationDateTime>
     </xilinx:coreExtensions>
     <xilinx:packagingInfo>
       <xilinx:xilinxVersion>2020.1</xilinx:xilinxVersion>
-      <xilinx:checksum xilinx:scope="busInterfaces" xilinx:value="6d8b2551"/>
-      <xilinx:checksum xilinx:scope="fileGroups" xilinx:value="629ffc9d"/>
-      <xilinx:checksum xilinx:scope="ports" xilinx:value="cabd7433"/>
-      <xilinx:checksum xilinx:scope="hdlParameters" xilinx:value="29c70cc4"/>
-      <xilinx:checksum xilinx:scope="parameters" xilinx:value="858b58f8"/>
+      <xilinx:checksum xilinx:scope="busInterfaces" xilinx:value="8f86a494"/>
+      <xilinx:checksum xilinx:scope="memoryMaps" xilinx:value="5a080bee"/>
+      <xilinx:checksum xilinx:scope="fileGroups" xilinx:value="d633e93f"/>
+      <xilinx:checksum xilinx:scope="ports" xilinx:value="2e562330"/>
+      <xilinx:checksum xilinx:scope="hdlParameters" xilinx:value="134f154d"/>
+      <xilinx:checksum xilinx:scope="parameters" xilinx:value="83e5a517"/>
     </xilinx:packagingInfo>
   </spirit:vendorExtensions>
 </spirit:component>
diff --git a/finn-rtllib/memstream/doc/README b/finn-rtllib/memstream/doc/README
new file mode 100644
index 0000000000000000000000000000000000000000..f4f99542cf6daed25736cd22e9d4a2e2d1a2607c
--- /dev/null
+++ b/finn-rtllib/memstream/doc/README
@@ -0,0 +1,45 @@
+IMPORTANT: After using AXI lite to either read or write the weights,
+always "flush" the accelerator by first passing a dummy input
+vector through the accelerator. This will get rid of any old
+weight data from the weight FIFOs.
+
+Memory Streamer Address Map
+
+The memory streamer implements an internal storage array of parameters:
+Memory Depth: D
+Memory Width: W
+
+When W is greater than 32, the bits of each word are assigned to N
+32-bit words on the AXI interface, where:
+N = pow(2,ceil(log2(W/32)))
+
+To the AXI master, this memory appears to have the following parameters:
+Apparent Memory Depth: D*N
+Apparent Memory Width: 32b
+
+To perform a write, the AXI master must write to the N 32-bit segments
+corresponding to a streamer memory word. The writes are committed to
+the internal memory when the last segment of a word is written.
+The order of writes to the other segments does not matter.
+To perform a read, the AXI master simply addresses the one 32-bit segment to read.
+
+Example: D=2, W=70
+
+Here N=4 (ceil(log2(70/32)) = 2), so we allocate four 32-bit words in the global
+memory map for each physical memory word:
+
+AXI Addr | Internal position of data
+------------------------------------
+0        | mem[0][31: 0]
+4        | mem[0][63:32]
+8        | mem[0][69:64]
+C        | N/A
+10       | mem[1][31: 0]
+14       | mem[1][63:32]
+18       | mem[1][69:64]
+1C       | N/A
+
+To perform a write to mem[0], the AXI master writes to AXI Addr 0,4,8 in any
+order, then writes to AXI Addr 0xC to commit the write. To read mem[1][63:32],
+the AXI master reads from AXI Addr 0x14. The words of mem can be written to or
+read from in any order.
diff --git a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
new file mode 100644
index 0000000000000000000000000000000000000000..a68b85e1f5bdb3dc102f14eb4e3b8d82a86fa556
--- /dev/null
+++ b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
@@ -0,0 +1,2 @@
+# This file is automatically written.  Do not modify.
+proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {MEM_DEPTH MEM_WIDTH } {expr 2+ceil(log($MEM_DEPTH*pow(2,ceil(log(($MEM_WIDTH+31)/32)/log(2))))/log(2))}
diff --git a/finn-rtllib/memstream/hdl/axilite_if.v b/finn-rtllib/memstream/hdl/axilite_if.v
new file mode 100644
index 0000000000000000000000000000000000000000..93b2227de1b51d4fca145e8b61e6ed6dc2ed3121
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/axilite_if.v
@@ -0,0 +1,211 @@
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module axi4lite_if //AXI4-Lite slave front end: handles one read or write at a time and forwards it to an IP-side memory port
+#(
+    parameter ADDR_WIDTH = 32,//width of awaddr/araddr (byte addresses) and of ip_addr
+    parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64
+    parameter IP_DATA_WIDTH = 64//can be any power-of-2 multiple of DATA_WIDTH; other widths are rounded up via NFOLDS_LOG and truncated at ip_wdata
+)
+(
+//system signals
+input aclk,
+input aresetn,//active low, asynchronous assertion and synchronous deassertion
+
+//Write channels
+//write address
+output reg                  awready,
+input                       awvalid,
+input [ADDR_WIDTH-1:0]      awaddr,
+input [2:0]                 awprot,//not referenced inside this module
+//write data
+output reg                  wready,
+input                       wvalid,
+input [DATA_WIDTH-1:0]      wdata,
+input [(DATA_WIDTH/8)-1:0]  wstrb,//not referenced inside this module (byte strobes are ignored)
+//write response (B channel)
+input                       bready,
+output reg                  bvalid,
+output reg [1:0]            bresp,//NOTE: 00 = OKAY, 10 = SLVERR (write error)
+
+//Read channels
+//read address
+output reg                  arready,
+input                       arvalid,
+input [ADDR_WIDTH-1:0]      araddr,
+input [2:0]                 arprot,//not referenced inside this module
+//read data
+input                       rready,
+output reg                  rvalid,
+output reg [1:0]            rresp,//NOTE: 00 = OKAY, 10 = SLVERR (read error)
+output reg [DATA_WIDTH-1:0] rdata,
+
+//IP-side interface
+output reg                  ip_en,//registered: high for a read request or a write to the last fold
+output reg                  ip_wen,//registered: high only for a write to the last fold
+output reg [ADDR_WIDTH-1:0] ip_addr,//IP word address (AXI word address with the fold bits shifted out)
+output [IP_DATA_WIDTH-1:0]  ip_wdata,//assembled write word, low IP_DATA_WIDTH bits of ip_wdata_wide
+input                       ip_rack,//read acknowledge: internal_rdata is captured into rdata when this is high
+input [IP_DATA_WIDTH-1:0]      ip_rdata
+);
+
+localparam RESP_OKAY = 2'b00;
+localparam RESP_SLVERR = 2'b10;
+//get ceil(log2(ceil(IP_DATA_WIDTH/DATA_WIDTH)))
+localparam NFOLDS_LOG = $clog2((IP_DATA_WIDTH + DATA_WIDTH - 1) / DATA_WIDTH);//each IP word occupies 2**NFOLDS_LOG AXI words
+
+reg                      internal_ren;//combinational: read accepted this cycle
+reg                      internal_wen;//combinational: write accepted this cycle
+reg                      internal_wack;//internal_wen delayed by one clock, triggers the write response
+reg [ADDR_WIDTH-1:0]     internal_raddr;//araddr converted from byte address to AXI word address
+reg [ADDR_WIDTH-1:0]     internal_waddr;//awaddr converted from byte address to AXI word address
+reg [DATA_WIDTH-1:0]     internal_wdata;
+wire [DATA_WIDTH-1:0]    internal_rdata;//fold of ip_rdata selected for the current read
+reg                      internal_error = 0;//never assigned elsewhere in this module, so bresp/rresp are always OKAY
+
+//check DATA_WIDTH (parameter sanity check in an initial block; prints a message and calls $finish)
+initial begin
+    if(DATA_WIDTH != 32 & DATA_WIDTH != 64) begin
+        $display("AXI4Lite DATA_WIDTH must be 32 or 64");
+        $finish;
+    end
+end
+
+//transaction state machine: a single read or write is in flight at any time
+localparam  STATE_IDLE  = 0,
+            STATE_READ  = 1,
+            STATE_WRITE = 2;
+
+reg [1:0] state;
+
+always @(posedge aclk or negedge aresetn)
+    if(~aresetn)
+        state <= STATE_IDLE;
+    else case(state)
+        STATE_IDLE:
+            if(awvalid & wvalid)//a write needs address and data valid together, and wins over a pending read
+                state <= STATE_WRITE;
+            else if(arvalid)
+                state <= STATE_READ;
+        STATE_READ:
+            if(rvalid & rready)//read response handshake completes the transaction
+                state <= STATE_IDLE;
+        STATE_WRITE:
+            if(bvalid & bready)//write response handshake completes the transaction
+                state <= STATE_IDLE;
+        default: state <= STATE_IDLE;
+    endcase
+
+//write-related internal signals
+always @(*) begin
+    internal_waddr = awaddr >> $clog2(DATA_WIDTH/8);//drop the byte-offset bits
+    internal_wdata = wdata;
+    internal_wen = (state == STATE_IDLE) & awvalid & wvalid; //true only in the IDLE cycle in which the write is accepted
+end
+
+always @(posedge aclk) begin//awready/wready are asserted the clock after the write is accepted
+    awready <= internal_wen;
+    wready <= internal_wen;
+end
+
+//read-related internal signals
+always @(*) begin
+    internal_raddr = araddr >> $clog2(DATA_WIDTH/8);//drop the byte-offset bits
+    internal_ren = (state == STATE_IDLE) & ~internal_wen & arvalid;//reads are accepted only when no write is accepted in the same cycle
+end
+
+always @(posedge aclk)//arready is asserted the clock after the read is accepted
+    arready <= internal_ren;
+
+wire write_to_last_fold;//write accepted to the highest-numbered fold of an IP word (any write when NFOLDS_LOG == 0)
+
+always @(posedge aclk) begin
+    ip_wen <= write_to_last_fold;
+    ip_en <= internal_ren | write_to_last_fold;
+    if(internal_ren | write_to_last_fold)//ip_addr holds its previous value otherwise
+        ip_addr <= internal_ren ? (internal_raddr >> NFOLDS_LOG) : (internal_waddr >> NFOLDS_LOG);
+    internal_wack <= internal_wen;
+end
+
+genvar i;
+reg [(1<<NFOLDS_LOG)*DATA_WIDTH-1:0] ip_wdata_wide;//staging register holding all folds of one IP word
+generate
+if(NFOLDS_LOG == 0) begin: no_fold//IP word fits in a single AXI word
+    assign write_to_last_fold = internal_wen;
+    assign internal_rdata = ip_rdata;
+    always @(posedge aclk)
+        ip_wdata_wide <= internal_wdata;
+end else begin: fold//IP word is spread over 2**NFOLDS_LOG AXI words
+    reg [NFOLDS_LOG-1:0] internal_rfold;//fold index of the read in flight
+    assign write_to_last_fold = internal_wen & (internal_waddr[NFOLDS_LOG-1:0] == {(NFOLDS_LOG){1'b1}});
+    assign internal_rdata = ip_rdata >> (internal_rfold*DATA_WIDTH);//select the requested fold; result is truncated to DATA_WIDTH bits
+    always @(posedge aclk)
+        if(internal_ren)
+            internal_rfold <= internal_raddr[NFOLDS_LOG-1:0];
+    for(i=0; i<(1<<NFOLDS_LOG); i = i+1) begin: gen_wdata//one staging slice per fold
+        always @(posedge aclk)
+            if(internal_waddr[NFOLDS_LOG-1:0] == i)//NOTE(review): not qualified by internal_wen, so a fold is overwritten whenever awaddr selects it, even with awvalid low - confirm intended
+                ip_wdata_wide[(i+1)*DATA_WIDTH-1:i*DATA_WIDTH] <= internal_wdata;
+    end
+end
+endgenerate
+assign ip_wdata = ip_wdata_wide[IP_DATA_WIDTH-1:0];//bits above IP_DATA_WIDTH in the staging register are discarded
+
+//write response on AXI4L bus
+always @(posedge aclk or negedge aresetn)
+    if(~aresetn) begin
+        bvalid <= 0;//AXI4 spec requires BVALID pulled LOW during reset
+        bresp <= RESP_OKAY;
+    end else if(internal_wack) begin//one clock after the write was accepted
+        bvalid <= 1;
+        bresp <= internal_error ? RESP_SLVERR : RESP_OKAY;
+    end else if(bready) begin//response taken by the master (or nothing pending): clear
+        bvalid <= 0;
+        bresp <= RESP_OKAY;
+    end
+
+//read response on AXI4L bus
+always @(posedge aclk or negedge aresetn)
+    if(~aresetn) begin
+        rvalid <= 0;//AXI4 spec requires RVALID pulled LOW during reset
+        rdata <= 0;
+        rresp <= RESP_OKAY;
+    end else if(ip_rack) begin//IP side signals that read data is available
+        rvalid <= 1;
+        rdata <= internal_rdata;
+        rresp <= internal_error ? RESP_SLVERR : RESP_OKAY;
+    end else if(rready) begin//response taken by the master (or nothing pending): clear
+        rvalid <= 0;
+        rdata <= 0;
+        rresp <= RESP_OKAY;
+    end
+
+endmodule
+
diff --git a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v
index 961103e4ca1261ab0109ad9db291a1a66f9c0915..2cd955f8d1ff0dd5dfed8b004df7053573a43488 100644
--- a/finn-rtllib/memstream/hdl/memstream.v
+++ b/finn-rtllib/memstream/hdl/memstream.v
@@ -61,19 +61,40 @@ module memstream
 	parameter STRM2_OFFSET = 4608,
 	parameter STRM3_OFFSET = 6912,
 	parameter STRM4_OFFSET = 9216,
-	parameter STRM5_OFFSET = 11520
+	parameter STRM5_OFFSET = 11520,
+
+    parameter AXILITE_ADDR_WIDTH = 2+$clog2(MEM_DEPTH*(1<<$clog2((MEM_WIDTH+31)/32)))
 )
 
 (
     input aclk,
     input aresetn,
 
-    //optional configuration interface compatible with ap_memory
-	input [31:0] config_address,
-	input config_ce,
-	input config_we,
-	input [31:0] config_d0,
-	output [31:0] config_q0,
+    output awready,
+    input                       awvalid,
+    input [AXILITE_ADDR_WIDTH-1:0]      awaddr,
+    input [2:0]                 awprot,
+    //write data
+    output                  wready,
+    input                       wvalid,
+    input [31:0]      wdata,
+    input [3:0]  wstrb,
+    //burst response
+    input                       bready,
+    output                  bvalid,
+    output [1:0]            bresp,
+
+    //Read channels
+    //read address
+    output                  arready,
+    input                       arvalid,
+    input [AXILITE_ADDR_WIDTH-1:0]      araddr,
+    input [2:0]                 arprot,
+    //read data
+    input                       rready,
+    output                  rvalid,
+    output [1:0]            rresp,
+    output [31:0] rdata,
 
     //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
     input m_axis_0_afull,
@@ -109,6 +130,13 @@ module memstream
 
 );
 
+wire [31:0] config_address;
+wire config_ce;
+wire config_we;
+wire config_rack;
+wire [MEM_WIDTH-1:0] config_d0;
+wire [MEM_WIDTH-1:0] config_q0;
+
 generate
 if(NSTREAMS <= 2) begin: singleblock
 
@@ -144,6 +172,7 @@ mem
     .config_we(config_we),
     .config_d0(config_d0),
     .config_q0(config_q0),
+    .config_rack(config_rack),
 
     .m_axis_0_tready(m_axis_0_tready),
     .m_axis_0_tvalid(m_axis_0_tvalid),
@@ -246,4 +275,53 @@ mem
 end
 endgenerate
 
+axi4lite_if
+#(
+    .ADDR_WIDTH(AXILITE_ADDR_WIDTH),
+    .DATA_WIDTH(32),
+    .IP_DATA_WIDTH(MEM_WIDTH)
+)
+config_if
+(
+    //system signals
+    .aclk(aclk),
+    .aresetn(aresetn),
+
+    //Write channels
+    //write address
+    .awready(awready),
+    .awvalid(awvalid),
+    .awaddr(awaddr),
+    .awprot(awprot),
+    //write data
+    .wready(wready),
+    .wvalid(wvalid),
+    .wdata(wdata),
+    .wstrb(wstrb),
+    //burst response
+    .bready(bready),
+    .bvalid(bvalid),
+    .bresp(bresp),
+
+    //Read channels
+    //read address
+    .arready(arready),
+    .arvalid(arvalid),
+    .araddr(araddr),
+    .arprot(arprot),
+    //read data
+    .rready(rready),
+    .rvalid(rvalid),
+    .rresp(rresp),
+    .rdata(rdata),
+
+    //IP-side interface
+    .ip_en(config_ce),
+    .ip_wen(config_we),
+    .ip_addr(config_address),
+    .ip_wdata(config_d0),
+    .ip_rack(config_rack),
+    .ip_rdata(config_q0)
+);
+
 endmodule
diff --git a/finn-rtllib/memstream/hdl/memstream_multiblock.v b/finn-rtllib/memstream/hdl/memstream_multiblock.v
index 017088b8c1572bb3baa2a5a46336509187a762ab..4e6167132da119d9e19a5f4b6a378311e74311c3 100644
--- a/finn-rtllib/memstream/hdl/memstream_multiblock.v
+++ b/finn-rtllib/memstream/hdl/memstream_multiblock.v
@@ -74,6 +74,7 @@ module memstream_multiblock
 	input config_we,
 	input [31:0] config_d0,
 	output [31:0] config_q0,
+    output config_rack,
 
     //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
     input m_axis_0_afull,
@@ -466,6 +467,8 @@ always @(posedge aclk) begin
     end
 end
 
+//dummy read, for now
 assign config_q0 = 0;
+assign config_rack = config_ce & ~config_we;
 
 endmodule
diff --git a/finn-rtllib/memstream/hdl/memstream_singleblock.v b/finn-rtllib/memstream/hdl/memstream_singleblock.v
index 53a71a91bc0561e275791ebcf55e2c4653331b1d..54ee56764e187520997e03bdcb291b4183e6ecf0 100644
--- a/finn-rtllib/memstream/hdl/memstream_singleblock.v
+++ b/finn-rtllib/memstream/hdl/memstream_singleblock.v
@@ -65,6 +65,7 @@ module memstream_singleblock
 	input config_we,
 	input [MEM_WIDTH-1:0] config_d0,
 	output [MEM_WIDTH-1:0] config_q0,
+    output config_rack,
 
     //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
     input m_axis_0_tready,
@@ -97,6 +98,8 @@ wire strm1_incr_en;
 assign strm0_incr_en = m_axis_0_tready | ~m_axis_0_tvalid;
 assign strm1_incr_en = m_axis_1_tready | ~m_axis_1_tvalid;
 
+reg rack_shift[1:0]; 
+
 generate
 if(MEM_DEPTH > 1) begin: use_ram
 
@@ -135,9 +138,9 @@ ram
 	.addra(config_address[BLOCKADRWIDTH-1:0]),
     .wdataa(config_d0),
 
-    .enb(strm0_incr_en),
-    .enqb(strm0_incr_en),
-	.addrb(strm0_addr),
+    .enb(strm0_incr_en | config_ce),
+    .enqb(strm0_incr_en | rack_shift[0]),
+	.addrb(config_ce ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr),
 	.rdqb(m_axis_0_tdata)
 );
 
@@ -170,7 +173,7 @@ ram
 
 	.wea(config_we),
     .ena(strm0_incr_en | config_ce),
-    .enqa(strm0_incr_en | config_ce),
+    .enqa(strm0_incr_en | config_ce_r),
 	.addra(config_we ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr),
 	.wdataa(config_d0),
 	.rdqa(m_axis_0_tdata),
@@ -192,6 +195,10 @@ initial begin
     $readmemh({MEM_INIT,"memblock_0.dat"}, singleval, 0, 0);
 end
 
+always @(posedge aclk)
+    if(config_ce & config_we)
+        singleval[0] <= config_d0;
+
 assign m_axis_0_tdata = singleval[0];
 assign m_axis_1_tdata = singleval[0];
 
@@ -224,6 +231,12 @@ always @(posedge aclk) begin
     end
 end
 
+always @(posedge aclk) begin
+    rack_shift[0] <= config_ce & ~config_we;
+    rack_shift[1] <= rack_shift[0];
+end
+
+assign config_rack = rack_shift[1];
 assign config_q0 = m_axis_0_tdata;
 
 endmodule
diff --git a/finn-rtllib/memstream/sim/tb_memstream_writes.v b/finn-rtllib/memstream/sim/tb_memstream_writes.v
new file mode 100644
index 0000000000000000000000000000000000000000..867acfe813280cc3c9a473fb2a7e6bc9d7c05b23
--- /dev/null
+++ b/finn-rtllib/memstream/sim/tb_memstream_writes.v
@@ -0,0 +1,497 @@
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+`timescale 1ns/10ps
+
+module tb_memstream_writes;
+
+//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
+parameter CONFIG_EN = 1;
+parameter NSTREAMS = 2;//1 up to 6
+
+parameter MEM_DEPTH = 40;
+parameter MEM_WIDTH = 70;
+
+//widths per stream
+parameter STRM0_WIDTH = 70;
+parameter STRM1_WIDTH = 32;
+parameter STRM2_WIDTH = 32;
+parameter STRM3_WIDTH = 32;
+parameter STRM4_WIDTH = 1;
+parameter STRM5_WIDTH = 1;
+
+//depths per stream
+parameter STRM0_DEPTH = 20;
+parameter STRM1_DEPTH = 20;
+parameter STRM2_DEPTH = 2304;
+parameter STRM3_DEPTH = 2304;
+parameter STRM4_DEPTH = 1;
+parameter STRM5_DEPTH = 1;
+
+//offsets for each stream
+parameter STRM0_OFFSET = 0;
+parameter STRM1_OFFSET = 20;
+parameter STRM2_OFFSET = 4608;
+parameter STRM3_OFFSET = 6912;
+parameter STRM4_OFFSET = 0;
+parameter STRM5_OFFSET = 0;
+
+
+reg clk;
+reg rst;
+
+wire        awready;
+reg         awvalid;
+reg [31:0]  awaddr;
+reg [2:0]   awprot;
+//write data
+wire        wready;
+reg         wvalid;
+reg [31:0]  wdata;
+reg [3:0]   wstrb;
+//burst response
+reg         bready;
+wire        bvalid;
+wire [1:0]  bresp;
+
+//Read channels
+//read address
+wire        arready;
+reg         arvalid;
+reg [31:0]  araddr;
+reg [2:0]   arprot;
+//read data
+reg         rready;
+wire        rvalid;
+wire [1:0]  rresp;
+wire [31:0] rdata;
+
+//multiple wire AXI Streams
+reg m_axis_0_afull;
+reg m_axis_0_tready;
+wire m_axis_0_tvalid;
+wire [STRM0_WIDTH-1:0] m_axis_0_tdata;
+
+reg m_axis_1_afull;
+reg m_axis_1_tready;
+wire m_axis_1_tvalid;
+wire [STRM1_WIDTH-1:0] m_axis_1_tdata;
+
+reg m_axis_2_afull;
+reg m_axis_2_tready;
+wire m_axis_2_tvalid;
+wire [STRM2_WIDTH-1:0] m_axis_2_tdata;
+
+reg m_axis_3_afull;
+reg m_axis_3_tready;
+wire m_axis_3_tvalid;
+wire [STRM3_WIDTH-1:0] m_axis_3_tdata;
+
+reg m_axis_4_afull;
+reg m_axis_4_tready;
+wire m_axis_4_tvalid;
+wire [STRM4_WIDTH-1:0] m_axis_4_tdata;
+
+reg m_axis_5_afull;
+reg m_axis_5_tready;
+wire m_axis_5_tvalid;
+wire [STRM5_WIDTH-1:0] m_axis_5_tdata;
+
+reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0];
+reg [MEM_WIDTH-1:0] gword;
+integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5;
+integer done = 0;
+integer i, j;
+reg [5:0] rng;
+
+parameter NFOLDS_PER_WORD = (MEM_WIDTH+31)/32;
+
+task axi_write;
+    input [MEM_WIDTH-1:0] data;
+    input [31:0] adr;
+    begin
+        for(j=0; j<(1<<$clog2(NFOLDS_PER_WORD)); j=j+1) begin
+            @(negedge clk);
+            awvalid = 1;
+            wvalid = 1;
+            wdata = data>>(j*32);
+            awaddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4;
+            fork
+                begin
+                    @(posedge awready);
+                    @(posedge clk) awvalid = 0;
+                end
+                begin
+                    @(posedge wready);
+                    @(posedge clk) wvalid = 0;
+                end
+            join
+            @(posedge clk);
+        end
+    end
+endtask
+
+task axi_read;
+    input [31:0] adr;
+    output [MEM_WIDTH-1:0] data;
+    begin
+        data = 0;
+        for(j=0; j<NFOLDS_PER_WORD; j=j+1) begin
+            @(negedge clk);
+            arvalid = 1;
+            araddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4;
+            rready = 1;
+            fork
+                begin
+                    @(posedge arready);
+                    @(posedge clk) arvalid = 0;
+                end
+                begin
+                    @(posedge rvalid);
+                    @(posedge clk) rready = 0;
+                    data = data | (rdata<<(32*j));
+                end
+            join
+            
+            @(posedge clk);
+        end
+    end
+endtask
+
+//clock
+initial begin
+    clk = 0;
+    forever #5 clk = ~clk;
+end
+
+initial begin
+    rst = 1; //assert testbench reset (the DUT reset port is driven with ~rst)
+    awvalid = 0;
+    arvalid = 0;
+    wvalid = 0;
+    rready = 1;
+    bready = 1;
+    m_axis_0_afull = 1;
+    m_axis_1_afull = 1;
+    m_axis_2_afull = 1;
+    m_axis_3_afull = 1;
+    m_axis_4_afull = 1;
+    m_axis_5_afull = 1;
+    m_axis_0_tready = 0; //no stream data is accepted while the memory is being programmed
+    m_axis_1_tready = 0;
+    m_axis_2_tready = 0;
+    m_axis_3_tready = 0;
+    m_axis_4_tready = 0;
+    m_axis_5_tready = 0;
+    repeat(100) @(negedge clk);
+    rst = 0;
+    #100
+    //fill golden[] with random words and program each word into the DUT over AXI-Lite
+    for(i=0; i<MEM_DEPTH; i=i+1) begin
+        gword = 0;
+        repeat(NFOLDS_PER_WORD)
+            gword = (gword << 32) | $random; //assemble MEM_WIDTH bits from 32-bit random chunks
+        golden[i] = gword;
+        axi_write(golden[i],i);
+        axi_read(i,gword); //exercises the read path only; the value read back is not compared
+    end
+    //re-reset so the stream checkers start from the stream offsets
+    repeat(100) @(negedge clk);
+    rst = 1;
+    #100
+    repeat(100) @(negedge clk);
+    rst = 0;
+    #100
+    @(negedge clk);
+    //start reads
+    m_axis_0_afull = 0;
+    m_axis_1_afull = 0;
+    m_axis_2_afull = 0;
+    m_axis_3_afull = 0;
+    m_axis_4_afull = 0;
+    m_axis_5_afull = 0;
+    m_axis_0_tready = 1;
+    m_axis_1_tready = 1;
+    m_axis_2_tready = 1;
+    m_axis_3_tready = 1;
+    m_axis_4_tready = 1;
+    m_axis_5_tready = 1;
+    fork
+        begin
+            $display("Starting to generate random AFULL");
+            while(!done) begin //logical NOT: bitwise ~done is non-zero for done=0 and done=1, so it never terminated
+                rng = $random;
+                m_axis_0_afull = rng[0];
+                m_axis_1_afull = rng[1];
+                m_axis_2_afull = rng[2];
+                m_axis_3_afull = rng[3];
+                m_axis_4_afull = rng[4];
+                m_axis_5_afull = rng[5];
+                @(negedge clk); //new random AFULL pattern every clock cycle
+            end
+        end
+    join
+end
+
+
+//DUT
+memstream
+#(
+    CONFIG_EN,
+    NSTREAMS,
+    MEM_DEPTH,
+    MEM_WIDTH,
+    ".",
+    "auto",
+    
+    //widths per stream
+    STRM0_WIDTH,
+    STRM1_WIDTH,
+    STRM2_WIDTH,
+    STRM3_WIDTH,
+    STRM4_WIDTH,
+    STRM5_WIDTH,
+    
+    //depths per stream
+    STRM0_DEPTH,
+    STRM1_DEPTH,
+    STRM2_DEPTH,
+    STRM3_DEPTH,
+    STRM4_DEPTH,
+    STRM5_DEPTH,
+    
+    //offsets for each stream
+    STRM0_OFFSET,
+    STRM1_OFFSET,
+    STRM2_OFFSET,
+    STRM3_OFFSET,
+    STRM4_OFFSET,
+    STRM5_OFFSET
+)
+dut
+(
+    clk,
+    ~rst,
+
+    //optional AXI-Lite interface
+    awready,
+    awvalid,
+    awaddr,
+    awprot,
+    //write data
+    wready,
+    wvalid,
+    wdata,
+    wstrb,
+    //burst response
+    bready,
+    bvalid,
+    bresp,
+
+    //Read channels
+    //read address
+    arready,
+    arvalid,
+    araddr,
+    arprot,
+    //read data
+    rready,
+    rvalid,
+    rresp,
+    rdata,
+
+    //multiple output AXI Streams
+    m_axis_0_afull,
+    m_axis_0_tready,
+    m_axis_0_tvalid,
+    m_axis_0_tdata,
+    
+    m_axis_1_afull,
+    m_axis_1_tready,
+    m_axis_1_tvalid,
+    m_axis_1_tdata,
+    
+    m_axis_2_afull,
+    m_axis_2_tready,
+    m_axis_2_tvalid,
+    m_axis_2_tdata,
+    
+    m_axis_3_afull,
+    m_axis_3_tready,
+    m_axis_3_tvalid,
+    m_axis_3_tdata,
+    
+    m_axis_4_afull,
+    m_axis_4_tready,
+    m_axis_4_tvalid,
+    m_axis_4_tdata,
+    
+    m_axis_5_afull,
+    m_axis_5_tready,
+    m_axis_5_tvalid,
+    m_axis_5_tdata
+    
+
+);
+
+//stream checkers
+initial begin
+    ptr0 = STRM0_OFFSET;
+	ptr1 = STRM1_OFFSET;
+	ptr2 = STRM2_OFFSET;
+	ptr3 = STRM3_OFFSET;
+	ptr4 = STRM4_OFFSET;
+	ptr5 = STRM5_OFFSET;
+    fork
+		//check stream 0
+	    begin
+		    $display("Starting stream 0 checker");
+		    while(~done & (NSTREAMS > 0)) begin
+				@(negedge clk);
+				if(m_axis_0_tvalid & m_axis_0_tready) begin
+					if(m_axis_0_tdata != golden[ptr0]) begin
+						$display("Mismatch on stream 0");
+						$stop();
+					end
+					//increment pointer
+					ptr0 = ptr0 + 1;
+					//rewind pointer if it's reached end
+					if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH))
+				        ptr0 = STRM0_OFFSET;
+				end
+			end
+		end
+		//check stream 1
+	    begin
+		    $display("Starting stream 1 checker");
+		    while(~done & (NSTREAMS > 1)) begin
+				@(negedge clk);
+				if(m_axis_1_tvalid & m_axis_1_tready) begin
+					if(m_axis_1_tdata != golden[ptr1]) begin
+						$display("Mismatch on stream 1");
+						$stop();
+					end
+					//increment pointer
+					ptr1 = ptr1 + 1;
+					//rewind pointer if it's reached end
+					if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH))
+						ptr1 = STRM1_OFFSET;
+				end
+			end
+		end
+		
+		//check stream 2
+	    begin
+		    $display("Starting stream 2 checker");
+		    while(~done & (NSTREAMS > 2)) begin
+				@(negedge clk);
+				if(m_axis_2_tvalid & m_axis_2_tready) begin
+					if(m_axis_2_tdata != golden[ptr2]) begin
+						$display("Mismatch on stream 2");
+						$stop();
+					end
+					//increment pointer
+					ptr2 = ptr2 + 1;
+					//rewind pointer if it's reached end
+					if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH))
+						ptr2 = STRM2_OFFSET;
+				end
+			end
+		end
+		//check stream 3
+	    begin
+		    $display("Starting stream 3 checker");
+		    while(~done & (NSTREAMS > 3)) begin
+				@(negedge clk);
+				if(m_axis_3_tvalid & m_axis_3_tready) begin
+					if(m_axis_3_tdata != golden[ptr3]) begin
+						$display("Mismatch on stream 3");
+						$stop();
+					end
+					//increment pointer
+					ptr3 = ptr3 + 1;
+					//rewind pointer if it's reached end
+					if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH))
+						ptr3 = STRM3_OFFSET;
+				end
+			end
+		end
+		//check stream 4
+	    begin
+		    $display("Starting stream 4 checker");
+		    while(~done & (NSTREAMS > 4)) begin
+				@(negedge clk);
+				if(m_axis_4_tvalid & m_axis_4_tready) begin
+					if(m_axis_4_tdata != golden[ptr4]) begin
+						$display("Mismatch on stream 4");
+						$stop();
+					end
+					//increment pointer
+					ptr4 = ptr4 + 1;
+					//rewind pointer if it's reached end
+					if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH))
+						ptr4 = STRM4_OFFSET;
+				end
+			end
+		end
+		//check stream 5
+	    begin
+		    $display("Starting stream 5 checker");
+		    while(~done & (NSTREAMS > 5)) begin
+				@(negedge clk);
+				if(m_axis_5_tvalid & m_axis_5_tready) begin
+					if(m_axis_5_tdata != golden[ptr5]) begin
+						$display("Mismatch on stream 5");
+						$stop();
+					end
+					//increment pointer
+					ptr5 = ptr5 + 1;
+					//rewind pointer if it's reached end
+					if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH))
+						ptr5 = STRM5_OFFSET;
+				end
+			end
+		end
+	join
+end
+
+initial begin
+    done = 0;
+    @(negedge rst);
+    $dumpfile("wave.vcd");
+    $dumpvars(0,tb_memstream_writes);
+    #50000
+	$display("Test done!");
+	done = 1;
+	#1000
+    $finish();
+end
+
+endmodule
diff --git a/finn-rtllib/memstream/sim/test.sh b/finn-rtllib/memstream/sim/test.sh
old mode 100644
new mode 100755
index 24767edf4ade1c6867e0c7d906e7a45bbcb987a2..3348e64b715ccbba17a38ac3bdf2c2c4173c3956
--- a/finn-rtllib/memstream/sim/test.sh
+++ b/finn-rtllib/memstream/sim/test.sh
@@ -28,7 +28,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-./gen_memblocks.sh golden.dat
-iverilog ../hdl/*.v *v -o sim
+iverilog ../hdl/*.v tb_memstream_writes.v -o sim
 ./sim
 
diff --git a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
index 7ce84b44a7cd6e20b59fd1b21a467d137ff0288f..b8be5e0a2f5c960cc5cb47ff9b348efffad98762 100644
--- a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
+++ b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
@@ -1,14 +1,19 @@
+
+# Loading additional proc with user specified bodies to compute parameter values.
+source [file join [file dirname [file dirname [info script]]] gui/memstream_v1_0.gtcl]
+
 # Definitional proc to organize widgets for parameters.
 proc init_gui { IPINST } {
   ipgui::add_param $IPINST -name "Component_Name"
   #Adding Page
   set Page_0 [ipgui::add_page $IPINST -name "Page 0"]
+  ipgui::add_param $IPINST -name "AXILITE_ADDR_WIDTH" -parent ${Page_0}
   ipgui::add_param $IPINST -name "CONFIG_EN" -parent ${Page_0}
   ipgui::add_param $IPINST -name "MEM_DEPTH" -parent ${Page_0}
   ipgui::add_param $IPINST -name "MEM_INIT" -parent ${Page_0}
   ipgui::add_param $IPINST -name "MEM_WIDTH" -parent ${Page_0}
-  ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} -widget comboBox
   ipgui::add_param $IPINST -name "NSTREAMS" -parent ${Page_0}
+  ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} -widget comboBox
   ipgui::add_param $IPINST -name "STRM0_DEPTH" -parent ${Page_0}
   ipgui::add_param $IPINST -name "STRM0_OFFSET" -parent ${Page_0}
   ipgui::add_param $IPINST -name "STRM0_WIDTH" -parent ${Page_0}
@@ -31,6 +36,22 @@ proc init_gui { IPINST } {
 
 }
 
+proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_WIDTH } {
+	# Procedure called to update AXILITE_ADDR_WIDTH when any of the dependent parameters in the arguments change
+	
+	set AXILITE_ADDR_WIDTH ${PARAM_VALUE.AXILITE_ADDR_WIDTH}
+	set MEM_DEPTH ${PARAM_VALUE.MEM_DEPTH}
+	set MEM_WIDTH ${PARAM_VALUE.MEM_WIDTH}
+	set values(MEM_DEPTH) [get_property value $MEM_DEPTH]
+	set values(MEM_WIDTH) [get_property value $MEM_WIDTH]
+	set_property value [gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE $values(MEM_DEPTH) $values(MEM_WIDTH)] $AXILITE_ADDR_WIDTH
+}
+
+proc validate_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH } {
+	# Procedure called to validate AXILITE_ADDR_WIDTH
+	return true
+}
+
 proc update_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } {
 	# Procedure called to update CONFIG_EN when any of the dependent parameters in the arguments change
 }
@@ -368,3 +389,8 @@ proc update_MODELPARAM_VALUE.STRM5_OFFSET { MODELPARAM_VALUE.STRM5_OFFSET PARAM_
 	set_property value [get_property value ${PARAM_VALUE.STRM5_OFFSET}] ${MODELPARAM_VALUE.STRM5_OFFSET}
 }
 
+proc update_MODELPARAM_VALUE.AXILITE_ADDR_WIDTH { MODELPARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.AXILITE_ADDR_WIDTH } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.AXILITE_ADDR_WIDTH}] ${MODELPARAM_VALUE.AXILITE_ADDR_WIDTH}
+}
+
diff --git a/run-docker.sh b/run-docker.sh
index 219e5c258f2e4d8b4c95d1c0a84cd1a636510e24..8eab28508359bc512357cc2bf2654167c04ef370 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -95,18 +95,18 @@ SCRIPTPATH=$(dirname "$SCRIPT")
 : ${ALVEO_TARGET_DIR="/tmp"}
 : ${XILINX_XRT="/opt/xilinx/xrt"}
 : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"}
+: ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"}
 
-BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
+FINN_CONTAINER_BUILD_DIR=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
-VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
+VIVADO_IP_CACHE=$FINN_CONTAINER_BUILD_DIR/vivado_ip_cache
 
 # ensure build dir exists locally
-mkdir -p $BUILD_LOCAL
-mkdir -p $VIVADO_IP_CACHE
+mkdir -p $FINN_HOST_BUILD_DIR
 mkdir -p $FINN_SSH_KEY_DIR
 
 gecho "Instance is named as $DOCKER_INST_NAME"
-gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
+gecho "Mounting $FINN_HOST_BUILD_DIR into $FINN_CONTAINER_BUILD_DIR"
 gecho "Mounting $VIVADO_PATH into $VIVADO_PATH"
 gecho "Mounting $VITIS_PATH into $VITIS_PATH"
 gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
@@ -148,7 +148,7 @@ DOCKER_EXEC="docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --i
 DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
 DOCKER_EXEC+="-e SHELL=/bin/bash "
 DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn "
-DOCKER_EXEC+="-v $BUILD_LOCAL:$BUILD_LOCAL "
+DOCKER_EXEC+="-v $FINN_HOST_BUILD_DIR:$FINN_CONTAINER_BUILD_DIR "
 DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh "
 DOCKER_EXEC+="-e FINN_INST_NAME=$DOCKER_INST_NAME "
 DOCKER_EXEC+="-e FINN_ROOT="/workspace/finn" "
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 8d7dfcf63ff063c21aab9476bab8795694abea63..e80920551120e0e74aae217d9fe4e287e6cabd3d 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -214,11 +214,9 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
 
     def defines(self, var):
         numReps = 1
-        numInWords = 1
+        numInWords = int(np.prod(self.get_folded_input_shape()[:-1]))
         inWidth = self.get_nodeattr("inWidth")
         outWidth = self.get_nodeattr("outWidth")
-        if outWidth > inWidth:
-            numInWords = int(outWidth // inWidth)
         self.code_gen_dict["$DEFINES$"] = [
             "#define InWidth %d " % inWidth,
             "#define OutWidth %d " % outWidth,
@@ -451,7 +449,6 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
 
     def lut_estimation(self):
         """Calculates resource estimations for LUTs"""
-        impl = self.get_nodeattr("impl_style")
         inw = self.get_instream_width()
         outw = self.get_outstream_width()
 
@@ -461,7 +458,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         # sometimes withs aren't directly divisible
         # this requires going up from input width to least common multiple
         # then down to output width
-        intw = abs(maxw*minw) // math.gcd(maxw, minw)
+        intw = abs(maxw * minw) // math.gcd(maxw, minw)
 
         # we assume a shift-based implementation
         # even if we don't use LUTs explicitly, we make some unavailable
@@ -471,11 +468,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         cset_luts = 0
 
         if inw != intw:
-            cnt_luts += abs(math.ceil(math.log(inw/intw, 2)))
+            cnt_luts += abs(math.ceil(math.log(inw / intw, 2)))
             cset_luts += intw
         if intw != outw:
             cnt_luts += abs(math.ceil(math.log(intw / outw, 2)))
             cset_luts += outw
 
-        return int(cnt_luts+cset_luts)
-
+        return int(cnt_luts + cset_luts)
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 3b557d084797432e7551a1e6c83d5f772bf7ccd0..9d63a6866269ddf6c5c7cf54de00b6dfd11505e6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -45,6 +45,7 @@ from finn.util.data_packing import (
     pack_innermost_dim_as_hex_string,
 )
 from . import templates
+import textwrap
 
 # ONNX i/o tensor shape assumptions for StreamingFCLayer:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -96,6 +97,16 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # distributed -- use LUTRAM
             # see also https://www.xilinx.com/support/answers/38070.html
             "ram_style": ("s", False, "auto"),
+            # (mem_mode = decoupled only) whether weights will be writable through
+            # an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -537,11 +548,17 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
-    def generate_params(self, model, path):
-        mem_mode = self.get_nodeattr("mem_mode")
-        code_gen_dir = path
-        # weights, if not external
-        weights = model.get_initializer(self.onnx_node.input[1])
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_npy,
+          decoupled_verilog_dat, decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+        """
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
         export_wdt = self.get_weight_datatype()
@@ -549,15 +566,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # so use it as such for weight generation
         if self.get_weight_datatype() == DataType.BIPOLAR:
             export_wdt = DataType.BINARY
-
-        if mem_mode == "const":
-            """Saves weights into params.h"""
+        if weight_file_mode == "hls_header":
             weight_hls_code = numpy_to_hls_code(
                 weight_tensor, export_wdt, "weights", True, True
             )
-            # write weights into params.h
-            f_weights = open("{}/params.h".format(code_gen_dir), "w")
-
+            # write weights into C++ header file as dictated by finn-hlslib
+            f_weights = open(weight_file_name, "w")
             if export_wdt.bitwidth() != 1:
                 f_weights.write(
                     "const FixedPointWeights<{},{},{},{}> weights = ".format(
@@ -577,17 +591,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 )
             f_weights.write(weight_hls_code)
             f_weights.close()
-
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            """Saves weights in corresponding file format for cppsim or rtlsim"""
+        elif "decoupled" in weight_file_mode:
+            # create a weight stream for various flavors of decoupled mode:
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
-
             # reverse SIMD flip for saving weights in .npy
             weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
             # PE flip for saving weights in .dat
             weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
-
             # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
@@ -601,14 +612,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 1, -1, pe * simd
             )
             weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
-
-            """Saves weights into .npy file"""
-            np.save(
-                os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped
-            )
-
-            if mem_mode == "decoupled":
-                """Saves weights into .dat file"""
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
                 # convert weight values into hexstring
                 weight_width = self.get_weightstream_width()
                 # pad to nearest 4 bits to get hex strings
@@ -619,9 +626,55 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 # add zeroes to pad out file to 1024 entries
                 weight_stream = weight_tensor_pe_flipped.flatten()
                 weight_stream = weight_stream.copy()
-                with open("{}/memblock_0.dat".format(code_gen_dir), "a+") as f:
+                with open(weight_file_name, "w") as f:
                     for val in weight_stream:
                         f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown weight_file_mode")
+
+        else:
+            raise Exception("Unknown weight_file_mode")
+
+    def generate_params(self, model, path):
+        mem_mode = self.get_nodeattr("mem_mode")
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode == "const":
+            # save hlslib-compatible weights in params.h
+            weight_filename = "{}/params.h".format(code_gen_dir)
+            self.make_weight_file(weights, "hls_header", weight_filename)
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
+            # save decoupled weights for cppsim
+            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                weight_filename_rtl = "{}/memblock_0.dat".format(code_gen_dir)
+                self.make_weight_file(
+                    weights, "decoupled_verilog_dat", weight_filename_rtl
+                )
         else:
             raise Exception(
                 """Please set mem_mode to "const", "decoupled", or "external",
@@ -1055,6 +1108,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "decoupled":
             node_name = self.onnx_node.name
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
@@ -1137,6 +1191,21 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 "[get_bd_intf_pins %s/%s/%s]"
                 % (node_name, dout_name, node_name, node_name, dout_name)
             )
+            if runtime_writable:
+                # expose axi lite interface for writeable weights
+                axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
+                cmd.append(
+                    "create_bd_intf_pin -mode Slave "
+                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                    % (node_name, axilite_name)
+                )
+                cmd.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                    "[get_bd_intf_pins %s/%s/%s]"
+                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
+                )
+                # TODO calculate and pass in segment size here
+                cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
         elif mem_mode == "const" or mem_mode == "external":
             # base class impl sufficient for const/external modes
@@ -1152,4 +1221,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             intf_names["s_axis"].append(
                 ("weights_V_V", self.get_weightstream_width_padded())
             )
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if runtime_writable:
+                intf_names["axilite"] = ["s_axilite"]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 67cce8675681be47036ffaf3a3428b8c43284215..a0ca34ed0a6838dfe9c680cc6c16961ac7f897ed 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -354,3 +354,56 @@ $LAYER_NAME$
 
 endmodule
 """
+
+decoupled_thresholding_template = """
+template <
+    unsigned ImgDim, unsigned NumChannels, unsigned PE,
+    typename TSrcI = Identity, typename TDstI = Identity,
+    int ActVal=0, typename TT, unsigned int NumSteps,
+    typename TI, typename TO>
+void Thresholding_Stream_Batch(hls::stream<TI> &in,
+                        hls::stream<TO> &out,
+                        hls::stream<ap_uint<PE*NumSteps*TT::width>> &weight,
+                        int const reps)
+{
+
+  // how many different rows each neuron will compute
+  // alternatively: number of vertical matrix chunks
+  unsigned const NF = NumChannels / PE;
+
+  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, std::less_equal<TT>> internal_thr;
+  #pragma HLS ARRAY_PARTITION variable=internal_thr.m_thresholds complete dim=0
+
+  // everything merged into a common iteration space (one "big" loop instead
+  // of smaller nested loops) to get the pipelinening the way we want
+  for (unsigned i = 0; i < reps * ImgDim * ImgDim * NF; i++)
+  {
+    #pragma HLS PIPELINE II=1
+
+    ap_uint<PE*NumSteps*TT::width> packed_thr;
+    packed_thr = weight.read();
+    // slicer to get 1 PE's worth of thresholds
+    auto const pe_slicer = Slice<ap_uint<NumSteps*TT::width>>()(packed_thr);
+
+    TI inElem;
+    inElem = in.read();
+    auto outElem = TDstI().template operator()<TO>();
+
+    for (unsigned pe = 0; pe < PE; pe++)
+    {
+#pragma HLS UNROLL
+      // slicer to get individual thresholds
+      auto const thr_slicer = Slice<TT>()(pe_slicer(pe, 0));
+      for (unsigned nt = 0; nt < NumSteps; nt++)
+      {
+      #pragma HLS UNROLL
+        internal_thr.m_thresholds[pe][0][nt] = thr_slicer(nt, 0);
+      }
+
+      auto const act = TSrcI()(inElem);
+      outElem(pe,0,1) = internal_thr.activate(0, pe, act(pe,0));
+    }
+    out.write(outElem);
+  }
+}
+"""
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 2429bf6190f822fb4a6c988fcbb34152d5a338e0..ccb065f62a8340b916bfa5f6cf96c23c65d19d12 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -26,7 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from math import ceil
+from math import ceil, log2
+import textwrap
 import os
 
 import numpy as np
@@ -34,11 +35,15 @@ import numpy as np
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
     rtlsim_output_to_npy,
+    pack_innermost_dim_as_hex_string,
 )
 from . import templates
 
@@ -58,12 +63,17 @@ class Thresholding_Batch(HLSCustomOp):
 
     def get_nodeattr_types(self):
         my_attrs = {
+            # parallelization; channels thresholded per cycle
             "PE": ("i", True, 0),
+            # number of channels (each may have different thresholds)
             "NumChannels": ("i", True, 0),
+            # number of steps in thresholding function
+            "numSteps": ("i", True, 1),
             # string defining memory type
             "ram_style": ("s", False, "distributed"),
-            # FINN DataTypes for inputs, weights, outputs
+            # FINN DataTypes for inputs, weights (thresholds), outputs
             "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
             # input and output FIFO depths
             "inFIFODepth": ("i", False, 0),
@@ -75,6 +85,20 @@ class Thresholding_Batch(HLSCustomOp):
             "numInputVectors": ("ints", False, [1]),
             # initialization value for the thresholding accumulator
             "ActVal": ("i", False, 0),
+            # memory mode for the thresholds
+            # const -- embedded thresholds, default
+            # decoupled -- streaming thresholds with streamer packaged inside IP
+            "mem_mode": ("s", False, "const"),
+            # (mem_mode = decoupled only) whether weights (thresholds) will be
+            # writable through an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -183,6 +207,34 @@ class Thresholding_Batch(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
+    def get_weight_datatype(self):
+        """Returns FINN DataType of thresholds, here called weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def minimize_accumulator_width(self, model):
+        "Minimize threshold width ('accumulator width' here due to convention)"
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+        min_threshold = thresholds.min()
+        max_threshold = thresholds.max()
+        min_input = self.get_input_datatype().min()
+        max_input = self.get_input_datatype().max()
+        # get range required by threshold values
+        tdt_min = min(min_input, min_threshold)
+        tdt_max = max(max_input, max_threshold)
+        if tdt_min < 0:
+            if abs(tdt_min) > tdt_max:
+                tdt = DataType.get_smallest_possible(tdt_min)
+            else:
+                tdt = DataType.get_smallest_possible(0 - tdt_max - 1)
+        else:
+            tdt = DataType.get_smallest_possible(tdt_max)
+        assert np.vectorize(tdt.allowed)(
+            threshold_tensor
+        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+        self.set_nodeattr("weightDataType", tdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
+
     def get_instream_width(self):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
@@ -191,6 +243,28 @@ class Thresholding_Batch(HLSCustomOp):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
+    def get_weightstream_width(self):
+        """Returns weight stream width. Used only in decoupled mode."""
+        if self.get_nodeattr("mem_mode") == "decoupled":
+            pe = self.get_nodeattr("PE")
+            wp = self.get_weight_datatype().bitwidth()
+            n_thres_steps = self.get_nodeattr("numSteps")
+            w_width = pe * wp * n_thres_steps
+            return w_width
+        else:
+            return 0
+
+    def get_weightstream_width_padded(self):
+        """Returns weight stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec. Used in decoupled mode."""
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
+
+    def get_ap_int_max_w(self):
+        temp_value = super().get_ap_int_max_w()
+        weightstream = self.get_weightstream_width()
+        return max([weightstream, temp_value])
+
     def get_folded_input_shape(self):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
@@ -251,6 +325,9 @@ class Thresholding_Batch(HLSCustomOp):
         ), """Threshold matrix dimension is
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
+        assert n_thres_steps == self.get_nodeattr(
+            "numSteps"
+        ), "Mismatch in threshold steps"
         if not self.get_input_datatype().signed():
             # ensure all thresholds are nonnegative
             assert (orig_thres_matrix >= 0).all()
@@ -279,56 +356,126 @@ class Thresholding_Batch(HLSCustomOp):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
-    def generate_params(self, model, path):
-        code_gen_dir = path
-        # save thresholds in thresh.h
-        thresholds = model.get_initializer(self.onnx_node.input[1])
-
-        threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights (thresholds) in appropriate
+        format for this layer. This file can be used for either synthesis or
+        run-time reconfig of weights.
 
-        min_threshold = thresholds.min()
-        max_threshold = thresholds.max()
-        min_input = self.get_input_datatype().min()
-        max_input = self.get_input_datatype().max()
-        # get range required by threshold values
-        tdt_min = min(min_input, min_threshold)
-        tdt_max = max(max_input, max_threshold)
-        if tdt_min < 0:
-            if abs(tdt_min) > tdt_max:
-                tdt = DataType.get_smallest_possible(tdt_min)
-            else:
-                tdt = DataType.get_smallest_possible(0 - tdt_max - 1)
-        else:
-            tdt = DataType.get_smallest_possible(tdt_max)
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_npy,
+          decoupled_verilog_dat, decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+        """
+        threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
+        tdt = self.get_weight_datatype()
         assert np.vectorize(tdt.allowed)(
             threshold_tensor
         ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+        if weight_file_mode == "hls_header":
+            # save thresholds in thresh.h
+            thresholds_hls_code = numpy_to_hls_code(
+                threshold_tensor, tdt, "thresholds", False, True
+            )
+            # write thresholds into thresh.h
+            f_thresh = open(weight_file_name, "w")
+            tdt_hls = tdt.get_hls_datatype_str()
+            # use binary to export bipolar activations
+            export_odt = self.get_output_datatype()
+            if self.get_output_datatype() == DataType.BIPOLAR:
+                export_odt = DataType.BINARY
+            odt_hls = export_odt.get_hls_datatype_str()
+            f_thresh.write(
+                "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
+                = ".format(
+                    self.calc_tmem(),
+                    self.get_nodeattr("PE"),
+                    threshold_tensor.shape[-1],
+                    tdt_hls,
+                    odt_hls,
+                    self.get_nodeattr("ActVal"),
+                    "std::less_equal<%s>" % tdt_hls,
+                )
+            )
+            f_thresh.write(thresholds_hls_code)
+            f_thresh.close()
+        elif "decoupled" in weight_file_mode:
+            # streaming thresholds need to be organized differently
+            # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps)
+            decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3))
+            # TODO add flips/reversals as needed here
+            # (1, tmem, pe, n_thres_steps) -> (1, tmem, pe * n_thres_steps)
+            pe = self.get_nodeattr("PE")
+            n_thres_steps = self.get_nodeattr("numSteps")
+            decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2)
+            decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps)
+            decoupled_thres = decoupled_thres.copy()
+            decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape(
+                1, -1, pe * n_thres_steps
+            )
+            decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy()
+
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, decoupled_thres)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** ceil(log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Decoupled weight export not yet implemented")
+        else:
+            raise Exception("Unknown weight_file_mode")
 
-        thresholds_hls_code = numpy_to_hls_code(
-            threshold_tensor, tdt, "thresholds", False, True
-        )
-        # write thresholds into thresh.h
-        f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
-        tdt_hls = tdt.get_hls_datatype_str()
-        # use binary to export bipolar activations
-        export_odt = self.get_output_datatype()
-        if self.get_output_datatype() == DataType.BIPOLAR:
-            export_odt = DataType.BINARY
-        odt_hls = export_odt.get_hls_datatype_str()
-        f_thresh.write(
-            "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
-            = ".format(
-                self.calc_tmem(),
-                self.get_nodeattr("PE"),
-                threshold_tensor.shape[-1],
-                tdt_hls,
-                odt_hls,
-                self.get_nodeattr("ActVal"),
-                "std::less_equal<%s>" % tdt_hls,
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            # save thresholds in thresh.h
+            weight_filename = "{}/thresh.h".format(code_gen_dir)
+            self.make_weight_file(thresholds, "hls_header", weight_filename)
+        elif mem_mode == "decoupled":
+            # save decoupled weights for cppsim
+            weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir)
+            self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim)
+            # also save weights as Verilog .dat file
+            weight_filename_rtl = "{}/memblock_0.dat".format(code_gen_dir)
+            self.make_weight_file(
+                thresholds, "decoupled_verilog_dat", weight_filename_rtl
             )
-        )
-        f_thresh.write(thresholds_hls_code)
-        f_thresh.close()
+        else:
+            raise Exception("Unrecognized mem_mode")
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
@@ -373,7 +520,7 @@ class Thresholding_Batch(HLSCustomOp):
                     reshaped_input,
                 )
             elif in_ind > 2:
-                raise Exception("Unexpected input found for StreamingFCLayer")
+                raise Exception("Unexpected input found for Thresholding_Batch")
             in_ind += 1
 
         if mode == "cppsim":
@@ -400,7 +547,23 @@ class Thresholding_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+            if self.get_nodeattr("mem_mode") == "decoupled":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                wei = npy_to_rtlsim_input(
+                    "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            elif self.get_nodeattr("mem_mode") == "const":
+                output = self.rtlsim(sim, inp)
+            else:
+                raise Exception("Unrecognized mem_mode")
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
@@ -425,7 +588,8 @@ class Thresholding_Batch(HLSCustomOp):
 
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
-        self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+        if self.get_nodeattr("mem_mode") == "const":
+            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
     # TODO check and add whatever missing
     def defines(self, var):
@@ -436,6 +600,21 @@ class Thresholding_Batch(HLSCustomOp):
                 self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps,
             )
         ]
+        if self.get_nodeattr("mem_mode") == "decoupled":
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define ActVal1 %d" % self.get_nodeattr("ActVal")
+            )
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define ThresType1 %s"
+                % self.get_weight_datatype().get_hls_datatype_str()
+            )
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define NumSteps1 %d" % self.get_nodeattr("numSteps")
+            )
+            # TODO remove once Thresholding_Stream_Batch is in hlslib:
+            self.code_gen_dict["$DEFINES$"].append(
+                templates.decoupled_thresholding_template
+            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -452,6 +631,20 @@ class Thresholding_Batch(HLSCustomOp):
             'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            tdt = self.get_weight_datatype()
+            elem_bits = tdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = tdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/thresholds.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            )
 
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
@@ -461,6 +654,13 @@ class Thresholding_Batch(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights ("weights");'.format(
+                    self.get_weightstream_width()
+                )
+            )
 
     def docompute(self):
         tmpl_args = self.get_template_param_values()
@@ -474,12 +674,26 @@ class Thresholding_Batch(HLSCustomOp):
             imgdim = ishape[1]
         else:
             raise Exception("""Unexpeted input shape""")
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<{}, NumChannels1, PE1, {}, {}>
-            (in0, out, threshs, numReps);""".format(
-                node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
-            )
-        ]
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<{}, NumChannels1, PE1, {}, {}>
+                (in0, out, threshs, numReps);""".format(
+                    node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
+                )
+            ]
+        elif mem_mode == "decoupled":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
+                (in0, out, weights, numReps);""".format(
+                    "Thresholding_Stream_Batch",
+                    imgdim,
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                )
+            ]
+        else:
+            raise Exception("Unrecognized mem_mode")
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -513,15 +727,30 @@ class Thresholding_Batch(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-                hls::stream<ap_uint<{}>> &out
-                )""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
-            )
-        ]
+        if self.get_nodeattr("mem_mode") == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        elif self.get_nodeattr("mem_mode") == "decoupled":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &weights,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_weightstream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        else:
+            raise Exception("Unrecognized mem_mode")
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
@@ -530,46 +759,173 @@ class Thresholding_Batch(HLSCustomOp):
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
 
-        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
-        # partition for parallel access along PE and N_THRES
-        # dimensions (dims 1 and 3)
-        self.code_gen_dict["$PRAGMAS$"].append(
-            (
-                "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                "complete dim=1"
+        if self.get_nodeattr("mem_mode") == "const":
+            # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+            # partition for parallel access along PE and N_THRES
+            # dimensions (dims 1 and 3)
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=1"
+                )
             )
-        )
-        self.code_gen_dict["$PRAGMAS$"].append(
-            (
-                "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                "complete dim=3"
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=3"
+                )
             )
-        )
-        # set resource type
-        ram_style = self.get_nodeattr("ram_style")
-        pe = self.get_nodeattr("PE")
-        ich = self.get_nodeattr("NumChannels")
-        # if PE less than NumChannels, assign cores according to ram_style;
-        # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs
-        if pe < ich:
-            if ram_style == "distributed":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    (
-                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
-                        "core=ROM_2P_LUTRAM"
+            # set resource type
+            ram_style = self.get_nodeattr("ram_style")
+            pe = self.get_nodeattr("PE")
+            ich = self.get_nodeattr("NumChannels")
+            # if PE less than NumChannels, assign cores according to ram_style;
+            # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs
+            if pe < ich:
+                if ram_style == "distributed":
+                    self.code_gen_dict["$PRAGMAS$"].append(
+                        (
+                            "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                            "core=ROM_2P_LUTRAM"
+                        )
                     )
-                )
-            elif ram_style == "block":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    (
-                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
-                        "core=ROM_2P_BRAM"
+                elif ram_style == "block":
+                    self.code_gen_dict["$PRAGMAS$"].append(
+                        (
+                            "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                            "core=ROM_2P_BRAM"
+                        )
                     )
-                )
-            else:
-                raise Exception(
-                    """Invalid value for attribute ram_style! Is currently set to: {}
-                has to be set to one of ("block", "distributed")""".format(
-                        ram_style
+                else:
+                    raise Exception(
+                        """Invalid value for attribute ram_style! Is currently set to: {}
+                    has to be set to one of ("block", "distributed")""".format(
+                            ram_style
+                        )
                     )
+        elif self.get_nodeattr("mem_mode") == "decoupled":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=weights"
+            )
+
+    def code_generation_ipi(self):
+        cmd = []
+        # add streamer if needed
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            node_name = self.onnx_node.name
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+            )
+            # instantiate a streamer and connect it to the HLS IP
+            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_inst = node_name + "_wstrm"
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
+            )
+            cmd.append(
+                "set_property -dict [list "
+                "CONFIG.NSTREAMS {1} "
+                "CONFIG.MEM_DEPTH {%d} "
+                "CONFIG.MEM_WIDTH {%d} "
+                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.RAM_STYLE {%s} "
+                "CONFIG.STRM0_DEPTH {%d} "
+                "CONFIG.STRM0_WIDTH {%d} "
+                "CONFIG.STRM0_OFFSET {0} "
+                "] [get_bd_cells /%s/%s]"
+                % (
+                    self.calc_tmem(),
+                    self.get_weightstream_width_padded(),
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("ram_style"),
+                    self.calc_tmem(),
+                    self.get_weightstream_width_padded(),
+                    node_name,
+                    strm_inst,
                 )
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+                "[get_bd_intf_pins %s/%s/weights_V_V]"
+                % (node_name, strm_inst, node_name, node_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                % (node_name, rst_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                % (node_name, clk_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, rst_name, node_name, node_name, rst_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk_name, node_name, node_name, clk_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, din_name, node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, dout_name, node_name, node_name, dout_name)
+            )
+            if runtime_writable:
+                # expose axi lite interface for writeable weights
+                axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
+                cmd.append(
+                    "create_bd_intf_pin -mode Slave "
+                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                    % (node_name, axilite_name)
+                )
+                cmd.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                    "[get_bd_intf_pins %s/%s/%s]"
+                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
+                )
+                # TODO calculate and pass in segment size here
+                cmd.append("assign_bd_address")
+            cmd.append("save_bd_design")
+        elif mem_mode == "const":
+            # base class impl sufficient for const mode
+            return super().code_generation_ipi()
+        else:
+            raise Exception("Unrecognized mem_mode for Thresholding_Batch")
+        return cmd
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if runtime_writable:
+                intf_names["axilite"] = ["s_axilite"]
+        return intf_names
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index d6f2e04f762a6b67ace989fa802829fd3e5a6fb5..f27ebc645dbee20ff97b64aa942e375250f60cbd 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -780,6 +780,10 @@ class InferVVAU(Transformation):
 class InferThresholdingLayer(Transformation):
     """Convert any MultiThreshold into a standalone thresholding HLS layer."""
 
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -791,6 +795,7 @@ class InferThresholdingLayer(Transformation):
                 thl_threshold = node.input[1]
                 thl_output = node.output[0]
                 thl_in_shape = model.get_tensor_shape(thl_input)
+                thl_thres_shape = model.get_tensor_shape(thl_threshold)
                 idt = model.get_tensor_datatype(thl_input)
 
                 # skip conversion for layers with float input
@@ -841,10 +846,13 @@ class InferThresholdingLayer(Transformation):
                     backend="fpgadataflow",
                     NumChannels=ifc,
                     PE=pe,
+                    numSteps=thl_thres_shape[1],
                     inputDataType=idt.name,
+                    weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
                     outputDataType=odt.name,
                     numInputVectors=list(thl_in_shape[:-1]),
                     ActVal=actval,
+                    mem_mode=self.mem_mode,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -852,6 +860,7 @@ class InferThresholdingLayer(Transformation):
                 graph_modified = True
 
         if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 2bcb4a89a4610c64c53947fdb7e8093a2d050821..f7643673a0ba326ab77e4379d524fc831fbbc9ca 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -91,7 +91,6 @@ class CreateStitchedIP(Transformation):
                 """The chosen frequency may lead to failure due to clock divider
                 constraints."""
             )
-        self.has_axilite = False
         self.has_aximm = False
         self.has_m_axis = False
         self.m_axis_idx = 0
@@ -153,14 +152,11 @@ class CreateStitchedIP(Transformation):
                 "make_bd_intf_pins_external "
                 "[get_bd_intf_pins %s/%s]" % (inst_name, axilite_intf_name[0])
             )
-            self.connect_cmds.append(
-                "set_property name s_axi_control " "[get_bd_intf_ports s_axi_control_0]"
+            ext_if_name = "%s_%d" % (
+                axilite_intf_name[0],
+                len(self.intf_names["axilite"]),
             )
-            assert (
-                self.has_axilite is False
-            ), "Currently limited to one slave AXI-Stream"
-            self.intf_names["axilite"] = ["s_axi_control"]
-            self.has_axilite = True
+            self.intf_names["axilite"].append(ext_if_name)
         if len(aximm_intf_name) != 0:
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 1a4b67d1e7adb86b5a7515eb0ff14b780eea0585..7baa27757abd91c2602f15f739555014b24f559d 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -136,6 +136,15 @@ class MakeZYNQProject(Transformation):
             if clk_ns > global_clk_ns:
                 global_clk_ns = clk_ns
 
+            ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames"))
+            assert (
+                len(ifnames["axilite"]) <= 1
+            ), "MakeZYNQProject supports max 1 AXI lite interface"
+            if len(ifnames["axilite"]) == 1:
+                axilite_intf_name = ifnames["axilite"][0]
+            else:
+                axilite_intf_name = None
+
             # gather info on connectivity
             # assume each node connected to outputs/inputs is DMA:
             # has axis, aximm and axilite
@@ -162,10 +171,11 @@ class MakeZYNQProject(Transformation):
                     "[get_bd_intf_pins smartconnect_0/S%02d_AXI]"
                     % (instance_names[node.name], aximm_idx)
                 )
+                assert axilite_intf_name is not None
                 config.append(
-                    "connect_bd_intf_net [get_bd_intf_pins %s/s_axi_control] "
+                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                     "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]"
-                    % (instance_names[node.name], axilite_idx)
+                    % (instance_names[node.name], axilite_intf_name, axilite_idx)
                 )
                 idma_idx += 1
                 aximm_idx += 1
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index e4da0d631b8f8bb1cc21799bba00c454eba528ae..5afb48637b8ad3e7198cdf5e7ac2f30afd3866b4 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -99,7 +99,12 @@ class CreateVitisXO(Transformation):
         # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface
         # developed from instructions in UG1393 (v2019.2) and package_xo documentation
         # package_xo is responsible for generating the kernel xml
-        if len(interfaces["axilite"]) > 0:
+        assert (
+            len(interfaces["axilite"]) <= 1
+        ), "CreateVitisXO supports max 1 AXI lite interface"
+        axilite_intf_name = None
+        if len(interfaces["axilite"]) == 1:
+            axilite_intf_name = interfaces["axilite"][0]
             if len(interfaces["aximm"]) > 0:
                 args_string.append(
                     "{addr:1:%s:%s:0x8:0x10:ap_uint&lt;%s>*:0}"
@@ -111,12 +116,14 @@ class CreateVitisXO(Transformation):
                 )
                 arg_id += 1
                 args_string.append(
-                    "{numReps:0:%s:s_axi_control:0x4:0x1C:uint:0}" % str(arg_id)
+                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
             else:
                 args_string.append(
-                    "{numReps:0:%s:s_axi_control:0x4:0x10:uint:0}" % str(arg_id)
+                    "{numReps:0:%s:%s:0x4:0x10:uint:0}"
+                    % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
         for intf in interfaces["s_axis"] + interfaces["m_axis"]:
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
deleted file mode 100644
index 41b72db9504ebeb8d10c5d838977da93b27f94ed..0000000000000000000000000000000000000000
--- a/src/finn/util/pyverilator.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-def reset_rtlsim(sim, rst_name="ap_rst_n", active_low=True):
-    """Sets reset input in pyverilator to zero, toggles the clock and set it
-    back to one"""
-    sim.io[rst_name] = 0 if active_low else 1
-    toggle_clk(sim)
-    toggle_clk(sim)
-    sim.io[rst_name] = 1 if active_low else 0
-    toggle_clk(sim)
-    toggle_clk(sim)
-
-
-def toggle_clk(sim, clk_name="ap_clk"):
-    """Toggles the clock input in pyverilator once."""
-    sim.io[clk_name] = 0
-    sim.eval()
-    sim.io[clk_name] = 1
-    sim.eval()
-
-
-def wait_for_handshake(sim, ifname, basename="s_axi_control_", dataname="DATA"):
-    """Wait for handshake (READY and VALID high at the same time) on given
-    interface on PyVerilator sim object.
-
-    Arguments:
-    - sim : PyVerilator sim object
-    - ifname : name for decoupled interface to wait for handshake on
-    - basename : prefix for decoupled interface name
-    - dataname : interface data sig name, will be return value if it exists
-
-    Returns: value of interface data signal during handshake (if given by dataname),
-    None otherwise (e.g. if there is no data signal associated with interface)
-    """
-    ret = None
-    while 1:
-        hs = (
-            sim.io[basename + ifname + "READY"] == 1
-            and sim.io[basename + ifname + "VALID"] == 1
-        )
-        if basename + ifname + dataname in sim.io:
-            ret = sim.io[basename + ifname + dataname]
-        toggle_clk(sim)
-        if hs:
-            break
-    return ret
-
-
-def axilite_write(sim, addr, val, basename="s_axi_control_", wstrb=0xF):
-    """Write val to addr on AXI lite interface given by basename.
-
-    Arguments:
-    - sim : PyVerilator sim object
-    - addr : address for write
-    - val : value to be written at addr
-    - basename : prefix for AXI lite interface name
-    - wstrb : write strobe value to do partial writes, see AXI protocol reference
-    """
-    sim.io[basename + "WSTRB"] = wstrb
-    sim.io[basename + "AWADDR"] = addr
-    sim.io[basename + "AWVALID"] = 1
-    wait_for_handshake(sim, "AW", basename=basename)
-    # write request done
-    sim.io[basename + "AWVALID"] = 0
-    # write data
-    sim.io[basename + "WDATA"] = val
-    sim.io[basename + "WVALID"] = 1
-    wait_for_handshake(sim, "W", basename=basename)
-    # write data OK
-    sim.io[basename + "WVALID"] = 0
-    # wait for write response
-    sim.io[basename + "BREADY"] = 1
-    wait_for_handshake(sim, "B", basename=basename)
-    # write response OK
-    sim.io[basename + "BREADY"] = 0
-
-
-def axilite_read(sim, addr, basename="s_axi_control_"):
-    """Read val from addr on AXI lite interface given by basename.
-
-    Arguments:
-    - sim : PyVerilator sim object
-    - addr : address for read
-    - basename : prefix for AXI lite interface name
-
-    Returns: read value from AXI lite interface at given addr
-    """
-    sim.io[basename + "ARADDR"] = addr
-    sim.io[basename + "ARVALID"] = 1
-    wait_for_handshake(sim, "AR", basename=basename)
-    # read request OK
-    sim.io[basename + "ARVALID"] = 0
-    # wait for read response
-    sim.io[basename + "RREADY"] = 1
-    ret_data = wait_for_handshake(sim, "R", basename=basename)
-    sim.io[basename + "RREADY"] = 0
-    return ret_data
diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
index 9ff2bedb96613b94310fc743d39fc2f4d9618677..40f0a620c6cd5db873a731c038a737b35c1cce9d 100644
--- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
@@ -1,3 +1,31 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import pytest
 
 from onnx import TensorProto, helper
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index f4817d70ce3080738e7b7321bfc686b73ad55fe1..aaffa3f7ed28116a9c1de9dd3b9dacba19954ee1 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -1,3 +1,31 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 from onnx import TensorProto, helper
 import numpy as np
 import pytest
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index 1d83f7a23cd3bad757e772055d242799cf22b0da..90b3145805f0c1ba59c7225b121b14b124ffe878 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -1,3 +1,31 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import pytest
 
 from onnx import TensorProto, helper
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index 53de417eac175d8b700e84aecb304895a5942c16..77c518966c15ae002b6e88c244c1ee9e853c29aa 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -1,3 +1,31 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import pytest
 import os
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index ef4f17998dbb09d31cdc9b3c89afafd10653fd28..8b38b2520c2239be822093da70fb29f6545c0b43 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -1,3 +1,31 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import pytest
 import os
 import numpy as np
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 75fa625ff00ad6d367e2d6c94d98705f391fb9be..8461efd15576fc04906b7f48b2629ad83835de38 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -46,9 +46,16 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import gen_finn_dt_tensor
 from finn.custom_op.registry import getCustomOp
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+import os
+from finn.util.pyverilator import axilite_read, axilite_write
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.core.rtlsim_exec import rtlsim_exec
 
+test_fpga_part = "xc7z020clg400-1"
+target_clk_ns = 5
 
-def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
+
+def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode):
     NumChannels = T.shape[0]
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels])
@@ -64,9 +71,12 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
         backend="fpgadataflow",
         NumChannels=NumChannels,
         PE=pe,
+        numSteps=T.shape[1],
         inputDataType=idt.name,
+        weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
         outputDataType=odt.name,
         ActVal=actval,
+        mem_mode=mem_mode,
     )
     graph = helper.make_graph(
         nodes=[Thresholding_node],
@@ -96,9 +106,11 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
 @pytest.mark.parametrize("ich", [16])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+# memory mode
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 @pytest.mark.vivado
 @pytest.mark.slow
-def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
+def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     if nf == -1:
         nf = ich
     pe = ich // nf
@@ -118,7 +130,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     else:
         actval = odt.min()
 
-    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval)
+    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
 
     if exec_mode == "cppsim":
         model = model.transform(PrepareCppSim())
@@ -127,7 +139,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     elif exec_mode == "rtlsim":
         model = model.transform(SetExecMode("rtlsim"))
         model = model.transform(GiveUniqueNodeNames())
-        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
     else:
@@ -164,3 +176,105 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
         assert exp_cycles != 0
+
+
+@pytest.mark.vivado
+def test_runtime_thresholds_single_layer():
+    """Read back and then overwrite the thresholds of a decoupled-mode
+    thresholding layer via AXI lite during rtlsim of the stitched IP,
+    checking the layer output against multithreshold each time."""
+    mem_mode = "decoupled"
+    act = DataType.INT4
+    idt = DataType.INT16
+    nf = 8
+    ich = 16
+    pe = ich // nf
+    assert ich % pe == 0
+
+    # generate input data
+    in_tensor = gen_finn_dt_tensor(idt, (1, ich))
+
+    odt = act
+    n_steps = act.get_num_possible_values() - 1
+    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
+    # provide non-decreasing thresholds
+    T = np.sort(T, axis=1)
+
+    if odt == DataType.BIPOLAR:
+        actval = 0
+    else:
+        actval = odt.min()
+
+    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
+    op_inst = getCustomOp(model.graph.node[0])
+    op_inst.set_nodeattr("runtime_writeable_weights", 1)
+    op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat")
+    with open("old_weights.dat", "r") as f:
+        old_weight_stream = f.read().strip()
+    os.remove("old_weights.dat")
+    old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
+    old_weight_stream = list(old_weight_stream)
+    # need to create stitched IP for runtime weight testing
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model = model.transform(PrepareRTLSim())
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    # add two copies of the input tensor as the first one is just used to
+    # "flush out" the pipeline (the layer already starts receiving old thresholds
+    # while we read/write new ones and reads seem to cause a disturbance too)
+    in_tensor = np.tile(in_tensor, (2, 1))
+    exec_ctx = {"inp": in_tensor}
+    extracted_weight_stream = []
+
+    def read_weights(sim):
+        addr = 0
+        for i in range(len(old_weight_stream)):
+            extracted_weight_stream.append(
+                axilite_read(sim, addr, basename="s_axilite_0_")
+            )
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
+    assert extracted_weight_stream == old_weight_stream
+    # only use second batch element in output; first will be invalid due to
+    # old weights (see above)
+    y = exec_ctx["outp"][1]
+    expected = multithreshold(in_tensor, T)[1]
+    if act == DataType.BIPOLAR:
+        # binary to bipolar
+        expected = 2 * expected - 1
+    else:
+        # signed offset
+        expected += act.min()
+    assert (y == expected).all()
+
+    new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(
+        np.float32
+    )
+    # provide non-decreasing thresholds (sorting the freshly drawn values)
+    new_weights = np.sort(new_weights, axis=1)
+    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
+    with open("new_weights.dat", "r") as f:
+        new_weight_stream = f.read().strip()
+    os.remove("new_weights.dat")
+    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
+    new_weight_stream = list(new_weight_stream)
+
+    def write_weights(sim):
+        addr = 0
+        for nw in new_weight_stream:
+            axilite_write(sim, addr, nw, basename="s_axilite_0_")
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
+    y = exec_ctx["outp"][1]
+    expected = multithreshold(in_tensor, new_weights)[1]
+    if act == DataType.BIPOLAR:
+        # binary to bipolar
+        expected = 2 * expected - 1
+    else:
+        # signed offset
+        expected += act.min()
+    assert (y == expected).all()
diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py
new file mode 100644
index 0000000000000000000000000000000000000000..c487824964400cacbde575da2c10757985ad6e32
--- /dev/null
+++ b/tests/fpgadataflow/test_runtime_weights.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.util.create import hls_random_mlp_maker
+from finn.core.datatype import DataType
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.custom_op.registry import getCustomOp
+from finn.core.rtlsim_exec import rtlsim_exec
+from finn.util.basic import gen_finn_dt_tensor
+from finn.util.pyverilator import axilite_write, axilite_read
+import numpy as np
+import pytest
+import os
+
+test_fpga_part = "xc7z020clg400-1"
+target_clk_ns = 5
+
+
+@pytest.mark.vivado
+def test_runtime_weights_single_layer():
+    idt = DataType.UINT32
+    wdt = DataType.UINT4
+    act = None
+    mw = 64
+    mh = 32
+    pe = 4
+    simd = 16
+    layer_spec = {
+        "idt": idt,
+        "wdt": wdt,
+        "mw": mw,
+        "mh": mh,
+        "act": act,
+        "pe": pe,
+        "simd": simd,
+    }
+    layer_spec_list = [layer_spec]
+    model = hls_random_mlp_maker(layer_spec_list)
+    fcl = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+    op_inst = getCustomOp(fcl)
+    op_inst.set_nodeattr("mem_mode", "decoupled")
+    op_inst.set_nodeattr("runtime_writeable_weights", 1)
+    old_weights = model.get_initializer(fcl.input[1])
+    op_inst.make_weight_file(old_weights, "decoupled_runtime", "old_weights.dat")
+    with open("old_weights.dat", "r") as f:
+        old_weight_stream = f.read().strip()
+    os.remove("old_weights.dat")
+    old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
+    old_weight_stream = list(old_weight_stream)
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model = model.transform(PrepareRTLSim())
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    in_tensor = np.asarray(range(mw), dtype=np.float32)
+    # add two copies of the input tensor as the first one is just used to
+    # "flush out" the pipeline (as mvau already starts receiving old weights while
+    # we read/write new ones and reads seem to cause a disturbance too)
+    in_tensor = np.tile(in_tensor, (2, 1))
+    exec_ctx = {"act_0": in_tensor}
+    extracted_weight_stream = []
+
+    def read_weights(sim):
+        addr = 0
+        for i in range(len(old_weight_stream)):
+            extracted_weight_stream.append(
+                axilite_read(sim, addr, basename="s_axilite_0_")
+            )
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
+    assert extracted_weight_stream == old_weight_stream
+    y = exec_ctx["act_1"]
+    # only use second batch element in output; first will be invalid due to
+    # old weights (see above)
+    assert (y[1] == np.dot(in_tensor[1], old_weights)).all()
+
+    new_weights = gen_finn_dt_tensor(wdt, (mw, mh))
+    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
+    with open("new_weights.dat", "r") as f:
+        new_weight_stream = f.read().strip()
+    os.remove("new_weights.dat")
+    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
+    new_weight_stream = list(new_weight_stream)
+
+    def write_weights(sim):
+        addr = 0
+        for nw in new_weight_stream:
+            axilite_write(sim, addr, nw, basename="s_axilite_0_")
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
+    y = exec_ctx["act_1"]
+    # only use second batch element in output; first will be invalid due to
+    # old weights (see above)
+    assert (y[1] == np.dot(in_tensor[1], new_weights)).all()
diff --git a/tests/util/test_pyverilator.py b/tests/util/test_pyverilator.py
deleted file mode 100644
index 5d837a924d92ca9a834557c21756d5f490146908..0000000000000000000000000000000000000000
--- a/tests/util/test_pyverilator.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import pkg_resources as pk
-from pyverilator import PyVerilator
-from finn.util.pyverilator import axilite_read, axilite_write, reset_rtlsim
-
-
-def test_pyverilator_axilite():
-    example_root = pk.resource_filename("finn.qnn-data", "verilog/myadd")
-    # load example verilog: takes two 32-bit integers as AXI lite mem mapped
-    # registers, adds them together and return result
-    sim = PyVerilator.build(
-        "myadd_myadd.v", verilog_path=[example_root], top_module_name="myadd_myadd",
-    )
-    ifname = "s_axi_control_"
-    expected_signals = [
-        "AWVALID",
-        "AWREADY",
-        "AWADDR",
-        "WVALID",
-        "WREADY",
-        "WDATA",
-        "WSTRB",
-        "ARVALID",
-        "ARREADY",
-        "ARADDR",
-        "RVALID",
-        "RREADY",
-        "RDATA",
-        "RRESP",
-        "BVALID",
-        "BREADY",
-        "BRESP",
-    ]
-    for signal_name in expected_signals:
-        assert ifname + signal_name in sim.io
-    reset_rtlsim(sim)
-    # initial values
-    sim.io[ifname + "WVALID"] = 0
-    sim.io[ifname + "AWVALID"] = 0
-    sim.io[ifname + "ARVALID"] = 0
-    sim.io[ifname + "BREADY"] = 0
-    sim.io[ifname + "RREADY"] = 0
-    # write + verify first parameter in AXI lite memory mapped regs
-    val_a = 3
-    addr_a = 0x18
-    axilite_write(sim, addr_a, val_a)
-    ret_data = axilite_read(sim, addr_a)
-    assert ret_data == val_a
-    # write + verify second parameter in AXI lite memory mapped regs
-    val_b = 5
-    addr_b = 0x20
-    axilite_write(sim, addr_b, val_b)
-    ret_data = axilite_read(sim, addr_b)
-    assert ret_data == val_b
-    # launch accelerator and wait for completion
-    addr_ctrl_status = 0x00
-    # check for ap_idle
-    assert axilite_read(sim, addr_ctrl_status) and (1 << 2) != 0
-    # set ap_start
-    axilite_write(sim, addr_ctrl_status, 1)
-    # wait until ap_done
-    while 1:
-        ap_done = axilite_read(sim, addr_ctrl_status) and (1 << 1)
-        if ap_done != 0:
-            break
-    # read out and verify result
-    addr_return = 0x10
-    val_ret = axilite_read(sim, addr_return)
-    assert val_ret == val_a + val_b