diff --git a/.gitignore b/.gitignore
index 91879240b36709b5c827ec951366cc55ad515cce..0411de3941d790fd1668fe2328b248cd3c09be08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -83,3 +83,6 @@ MANIFEST
 /finn-hlslib/
 /pyverilator/
 /PYNQ-HelloWorld/
+
+# Jenkins cfg dir
+/docker/jenkins_home
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8e4e82db3a046e454373c2f0b58d55865cda9c5b..c513c5493d674b067b82fdae9e675d7f9b6eb024 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,7 +27,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 exclude: '^docs/conf.py'
-
+default_language_version:
+    python: python3
 repos:
 # black
 - repo: https://github.com/ambv/black
diff --git a/AUTHORS.rst b/AUTHORS.rst
index a87cf170b065879f0c5e01e4726bc57608e5c4f2..e231e61d38991e11e2e43a7c9a3a78c50c878244 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -2,6 +2,7 @@
 Contributors
 ============
 
-* Yaman Umuroglu
-* Jakoba Petri-Koenig
-* Andrea Rigoni
+* Yaman Umuroglu (@maltanar)
+* Jakoba Petri-Koenig (@auphelia)
+* Andrea Rigoni (@AndreaRigoni)
+* Hendrik Borras (@HenniOVP)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..f12dafa857b8a99493d7266ad029bec3f725d9ec
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,34 @@
+We welcome contributions to FINN.
+
+Please follow the steps below and be sure that your contribution complies with our guidelines.
+
+1. Share your proposal via <a href="https://github.com/Xilinx/finn/issues" target="_blank">GitHub issues</a>. If you are looking for some issues to get started with, we have a list of <a href="https://github.com/Xilinx/finn/labels/good%20first%20issue">good first issues</a> in the issue tracker. Feel free to ask questions on the <a href="https://gitter.im/xilinx-finn/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge">FINN gitter channel</a> as well.
+
+	We welcome submissions to:
+
+	1. The FINN flow, such as additional custom ONNX nodes, transformation and analysis passes.
+	2. The documentation and Jupyter notebooks.
+
+	To ensure clean separation of toolflow and examples, we do not keep example networks in this repo. If you want to add example networks, we ask you to make them into a separate repo and use FINN as a dependency -- we'll be happy to add it to the list of <a href="https://xilinx.github.io/finn/community">FINN community projects</a>.
+
+2. Submitting your pull request:
+
+	1. Fork this repository to your own GitHub account using the *fork* button above.
+
+	2. Clone the fork to your local computer using *git clone*. Check out the branch you want to work on.
+
+	3. Please install <a href="https://pre-commit.com/" target="_blank">pre-commit</a> to ensure your code is formatted to our style guidelines. The hooks we use for pre-commit can be found in <a href="https://github.com/Xilinx/finn/blob/master/.pre-commit-config.yaml" target="_blank">this file</a>.
+
+	4. Modify the Python source code, Jupyter notebooks and Sphinx documentation etc. as needed.
+
+	5. Use *git add*, *git commit*, *git push* to add changes to your fork.
+
+	6. If you are introducing new functionality, add at least one unit test under the `test/` folder and make sure it passes before you submit the pull request (a minimal test sketch is shown at the end of this section).
+
+	7. Submit a pull request by clicking the *pull request* button on your GitHub repo:
+		1. The <a href="https://github.com/Xilinx/finn" target="_blank">master branch</a> should always be treated as stable and clean. Only hot fixes may be pull-requested against it; a hot fix should be critical enough that, without it, many things would break.
+		2. For new features, smaller bug fixes, doc updates and most other changes, please open your pull request against the <a href="https://github.com/Xilinx/finn/tree/dev" target="_blank">development branch</a>.
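+
+	To illustrate step 6, here is a minimal, self-contained pytest sketch. The file name and the function under test are hypothetical placeholders; a real contribution would import the function under test from the FINN source tree instead of inlining it.
+
+	```Python
+	# test/test_example.py -- hypothetical file name
+	import numpy as np
+
+	def quantize_to_int2(x):
+	    # toy stand-in for a contributed function: round and clamp to the
+	    # signed 2-bit integer range [-2, 1]
+	    return np.clip(np.round(x), -2, 1)
+
+	def test_quantize_to_int2_range():
+	    x = np.linspace(-5.0, 5.0, num=11)
+	    y = quantize_to_int2(x)
+	    assert y.min() >= -2 and y.max() <= 1
+	```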
+
+3. We will review your contribution and, if any additional fixes or modifications are
+necessary, may provide feedback to guide you. When accepted, your pull request will
+be merged into the repository. If you have more questions, please contact us via the <a href="https://gitter.im/xilinx-finn/community" target="_blank">FINN gitter channel</a>.
diff --git a/README.md b/README.md
index 0a70f27b675c105d76259edcacb78251419a5205..b408b1a69d6833382763795f35002e2b3322f09d 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s
 
 ## What's New in FINN?
 
-* **2020-02-27:** FINN v0.2b (beta) is released, which is a clean-slate reimplementation of the framework. Currently only fully-connected networks are supported for the end-to-end flow. Please see the release blog post for a summary of the key features.
+* **2020-02-28:** FINN v0.2b (beta) is released, which is a clean-slate reimplementation of the framework. Currently only fully-connected networks are supported for the end-to-end flow. Please see the release blog post for a summary of the key features.
 
 ## Documentation
 
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
new file mode 100644
index 0000000000000000000000000000000000000000..dd0c28da759d31544a68f2a969783174c628c28b
--- /dev/null
+++ b/docker/Dockerfile.finn_ci
@@ -0,0 +1,93 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
+MAINTAINER Yaman Umuroglu <yamanu@xilinx.com>
+ARG PYTHON_VERSION=3.6
+ARG BUILD_PATH
+ARG FINN_CI_BRANCH
+
+WORKDIR /workspace
+
+RUN apt-get update
+RUN apt-get -y upgrade
+RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
+RUN apt-get install -y verilator
+RUN apt-get -y install sshpass
+RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
+
+# cloning dependency repos
+# Brevitas
+RUN git clone --branch feature/finn_onnx_export https://github.com/Xilinx/brevitas.git /workspace/brevitas
+RUN git -C /workspace/brevitas checkout 215cf44c76d562339fca368c8c3afee3110033e8
+
+# Brevitas examples
+RUN git clone --branch feature/rework_scaling_clipping https://github.com/maltanar/brevitas_cnv_lfc.git /workspace/brevitas_cnv_lfc
+RUN git -C /workspace/brevitas_cnv_lfc checkout 2059f96bd576bf71f32c757e7f92617a70190c90
+
+# CNPY
+RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
+RUN git -C /workspace/cnpy checkout 4e8810b1a8637695171ed346ce68f6984e585ef4
+
+# FINN hlslib
+RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git -C /workspace/finn-hlslib checkout b139bf051ac8f8e0a3625509247f714127cf3317
+
+# PyVerilator
+RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
+RUN git -C /workspace/pyverilator checkout 307fc5c82db748620836307a2002fdc9fe170226
+
+# PYNQ-HelloWorld
+RUN git clone --branch feature/synth_rpt https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
+RUN git -C /workspace/PYNQ-HelloWorld checkout db7e418767ce2a8e08fe732ddb3aa56ee79b7560
+
+# FINN
+RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace/finn
+
+RUN pip install -r /workspace/finn/requirements.txt
+RUN apt-get update && apt-get install -y nano
+RUN pip install pytest-dependency
+
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts"
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas"
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
+ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
+ENV VIVADO_IP_CACHE "$BUILD_PATH/vivado_ip_cache"
+
+# colorful terminal output
+RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /root/.bashrc
+RUN mkdir -p $BUILD_PATH
+RUN mkdir -p $VIVADO_IP_CACHE
+
+WORKDIR /workspace/finn
+
+COPY finn_entrypoint.sh /usr/local/bin/
+RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
+ENTRYPOINT ["finn_entrypoint.sh"]
+CMD ["bash"]
diff --git a/Dockerfile b/docker/Dockerfile.finn_dev
similarity index 68%
rename from Dockerfile
rename to docker/Dockerfile.finn_dev
index c220e6ac6f4f4b24f2a10af778a0740137ee949f..e28492bd31f3a2115ac566ed06a0125d348208f4 100644
--- a/Dockerfile
+++ b/docker/Dockerfile.finn_dev
@@ -29,46 +29,42 @@
 FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
 MAINTAINER Yaman Umuroglu <yamanu@xilinx.com>
 ARG PYTHON_VERSION=3.6
+ARG GID
+ARG GNAME
+ARG UNAME
+ARG UID
+ARG PASSWD
+ARG JUPYTER_PORT
+ARG NETRON_PORT
+
+EXPOSE $JUPYTER_PORT
+EXPOSE $NETRON_PORT
 
 WORKDIR /workspace
 
+RUN apt-get update
+RUN apt-get -y upgrade
+RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
+RUN apt-get install -y verilator
+RUN apt-get install -y nano
+RUN apt-get -y install sshpass
+RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
+
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 RUN rm requirements.txt
-RUN apt update; apt install nano
 RUN pip install jupyter
 RUN pip install netron
 RUN pip install matplotlib
 RUN pip install pytest-dependency
-RUN apt-get update
-RUN apt-get -y upgrade
-RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
-RUN apt install verilator
-RUN apt-get -y install sshpass
-RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 RUN pip install sphinx
 RUN pip install sphinx_rtd_theme
 
+# copy entrypoint script
+COPY docker/finn_entrypoint.sh /usr/local/bin/
+RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
 
-# Note that we expect the cloned finn directory on the host to be
-# mounted on /workspace/finn -- see run-docker.sh for an example
-# of how to do this.
-# This branch assumes the same for brevitas and brevitas_cnv_lfc for easier
-# co-development.
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts"
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas"
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
-ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
-
-ARG GID
-ARG GNAME
-ARG UNAME
-ARG UID
-ARG PASSWD
-ARG JUPYTER_PORT
-ARG NETRON_PORT
-
+# create a non-root user and switch to it
 RUN groupadd -g $GID $GNAME
 RUN useradd -M -u $UID $UNAME -g $GNAME
 RUN usermod -aG sudo $UNAME
@@ -78,8 +74,42 @@ RUN ln -s /workspace /home/$UNAME
 RUN chown -R $UNAME:$GNAME /home/$UNAME
 USER $UNAME
 
-RUN echo "source \$VIVADO_PATH/settings64.sh" >> /home/$UNAME/.bashrc
-RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /home/$UNAME/.bashrc
-EXPOSE $JUPYTER_PORT
-EXPOSE $NETRON_PORT
+# cloning dependency repos (as user)
+# Brevitas
+RUN git clone --branch feature/finn_onnx_export https://github.com/Xilinx/brevitas.git /workspace/brevitas
+RUN git -C /workspace/brevitas checkout 215cf44c76d562339fca368c8c3afee3110033e8
+
+# Brevitas examples
+RUN git clone --branch feature/rework_scaling_clipping https://github.com/maltanar/brevitas_cnv_lfc.git /workspace/brevitas_cnv_lfc
+RUN git -C /workspace/brevitas_cnv_lfc checkout 2059f96bd576bf71f32c757e7f92617a70190c90
+
+# CNPY
+RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
+RUN git -C /workspace/cnpy checkout 4e8810b1a8637695171ed346ce68f6984e585ef4
+
+# FINN hlslib
+RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git -C /workspace/finn-hlslib checkout b139bf051ac8f8e0a3625509247f714127cf3317
+
+# PyVerilator
+RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
+RUN git -C /workspace/pyverilator checkout 307fc5c82db748620836307a2002fdc9fe170226
+
+# PYNQ-HelloWorld
+RUN git clone --branch feature/synth_rpt https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
+RUN git -C /workspace/PYNQ-HelloWorld checkout db7e418767ce2a8e08fe732ddb3aa56ee79b7560
+
+# Note that we expect the cloned finn directory on the host to be
+# mounted on /workspace/finn -- see run-docker.sh for an example
+# of how to do this.
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts"
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas"
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
+ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
+
 WORKDIR /home/$UNAME/finn
+RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /home/$UNAME/.bashrc
+
+ENTRYPOINT ["finn_entrypoint.sh"]
+CMD ["bash"]
diff --git a/docker/Dockerfile.jenkins b/docker/Dockerfile.jenkins
new file mode 100644
index 0000000000000000000000000000000000000000..e1939b642e1493ee97daf6472009649d3634632f
--- /dev/null
+++ b/docker/Dockerfile.jenkins
@@ -0,0 +1,11 @@
+FROM jenkins/jenkins:lts
+# switch to root so we can install packages via apt
+USER root
+RUN apt-get update
+RUN apt-get install -y gnupg-agent curl ca-certificates apt-transport-https software-properties-common
+RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add -
+RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable"
+RUN apt-get update
+RUN apt-get install -y docker-ce-cli
+# drop back to the regular jenkins user - good practice
+USER jenkins
diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile
new file mode 100644
index 0000000000000000000000000000000000000000..e64280222a6d2e558f00d20a25a4a79d55526a97
--- /dev/null
+++ b/docker/Jenkinsfile
@@ -0,0 +1,54 @@
+pipeline {
+    agent any
+    parameters {
+        string(name: 'FINN_CI_BRANCH', defaultValue: '', description: 'FINN branch to build')
+        string(name: 'VIVADO_PATH', defaultValue: '', description: 'Path to Vivado installation')
+        string(name: 'PYNQ_BOARD', defaultValue: 'Pynq-Z1', description: 'PYNQ board type')
+        string(name: 'PYNQ_IP', defaultValue: '', description: 'PYNQ board IP address')
+        string(name: 'PYNQ_USERNAME', defaultValue: 'xilinx', description: 'PYNQ board username')
+        string(name: 'PYNQ_PASSWORD', defaultValue: 'xilinx', description: 'PYNQ board password')
+        string(name: 'PYNQ_TARGET_DIR', defaultValue: '/home/xilinx/finn', description: 'PYNQ board target deployment directory')
+        string(name: 'NUM_DEFAULT_WORKERS', defaultValue: '1', description: 'Number of cores for parallel transformations')
+        string(name: 'DOCKER_CMD', defaultValue: """python setup.py test""", description: 'Command to run')
+    }
+    environment {
+        DOCKER_TAG='finn_ci:$BUILD_ID'
+        DOCKER_INST_NAME='finn_ci_$BUILD_ID'
+        BUILD_PATH='/tmp/finn_ci_$BUILD_ID'
+    }
+    stages {
+        stage("Clone") {
+            steps {
+                git branch: "${params.FINN_CI_BRANCH}", url: 'https://github.com/Xilinx/finn.git'
+            }
+        }
+        stage('Build') {
+            steps {
+                sh """
+                docker build -t $DOCKER_TAG -f docker/Dockerfile.finn_ci \
+                --build-arg BUILD_PATH=$BUILD_PATH \
+                --build-arg FINN_CI_BRANCH=${params.FINN_CI_BRANCH} \
+                docker/
+                """
+            }
+        }
+        stage('Test') {
+            steps {
+                sh """
+                docker run --name $DOCKER_INST_NAME \
+                --hostname $DOCKER_INST_NAME \
+                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \
+                -e FINN_INST_NAME=$DOCKER_INST_NAME \
+                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e PYNQ_BOARD=${params.PYNQ_BOARD} \
+                -e PYNQ_IP=${params.PYNQ_IP} \
+                -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
+                -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
+                -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
+                $DOCKER_TAG ${params.DOCKER_CMD}
+                """
+            }
+        }
+    }
+}
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..930218e26eff0b7be541529f452efc2a038160c5
--- /dev/null
+++ b/docker/finn_entrypoint.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+export XILINX_VIVADO=$VIVADO_PATH
+export SHELL=/bin/bash
+export FINN_ROOT=/workspace/finn
+
+# source Vivado environment variables
+source $VIVADO_PATH/settings64.sh
+
+exec "$@"
diff --git a/docker/launch-jenkins.sh b/docker/launch-jenkins.sh
new file mode 100755
index 0000000000000000000000000000000000000000..64dc1ec73f68e621cdd737595983b6b9a217f6fe
--- /dev/null
+++ b/docker/launch-jenkins.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# defaults, can be overridden by environment variables
+# user to run Jenkins as -- see NOTE below regarding Docker access permissions
+: ${JENKINS_USER=jenkins}
+# port for Jenkins on host machine
+: ${JENKINS_PORT=8080}
+# make Jenkins config persistent by mounting into this folder
+: ${JENKINS_HOME=$(pwd)/jenkins_home}
+
+mkdir -p $JENKINS_HOME
+
+# build a Jenkins Docker image that also has the Docker CLI installed
+docker build -t finn_jenkins -f Dockerfile.jenkins .
+
+# launch Docker container mounted to local Docker socket
+# NOTE: we allow customizing the user (e.g. running as root) to work around
+# Docker socket permission issues; this may not be desirable from a security
+# standpoint.
+docker run -u $JENKINS_USER -p $JENKINS_PORT:8080 -v /var/run/docker.sock:/var/run/docker.sock -v $JENKINS_HOME:/var/jenkins_home finn_jenkins
diff --git a/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md b/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md
new file mode 100644
index 0000000000000000000000000000000000000000..319c03e14229f4866279cb09a4b70419ce2fcdc7
--- /dev/null
+++ b/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md
@@ -0,0 +1,33 @@
+---
+layout: post
+title:  "FINN v0.2b (beta) is released"
+author: "Yaman Umuroglu"
+---
+
+We've been working on the new version of the FINN compiler for a while, and today we are excited to announce our first beta release to 
+give you a taste of how things are shaping up! 
+
+Here's a quick overview of the key features:
+
+* <b>Train and export highly-quantized networks in PyTorch using Brevitas.</b> You can use <a href="https://github.com/Xilinx/brevitas">Brevitas</a>,
+  our PyTorch library for quantization-aware training, to train networks with few-bit weights and activations, then export them into 
+  FINN-ONNX to be used by the FINN compiler (a short export sketch is shown after this list).
+
+* <b>Fully transparent end-to-end flow.</b> We support taking quantized networks (with limitations, see bottom of post) all the way down to a 
+  customized FPGA bitstream. This happens across many steps ranging from streamlining to Vivado IPI stitching, and each step is fully 
+  visible to the user. So if you are happy with just the threshold-activation (streamlined) QNN in ONNX, or if you want to take the 
+  generated Vivado IP block and integrate it into your own IPI design, it's easy to break out of the flow at any step. 
+  We also provide a variety of mechanisms to verify the design at different steps.
+
+* <b>ONNX-based intermediate representation.</b> We use ONNX with some custom nodes and annotations as our intermediate representation. As the 
+  FINN compiler transforms the network across many steps to produce an FPGA bitstream, you can view and explore the transformed network 
+  using the excellent <a href="https://www.lutzroeder.com/ai/netron">Netron</a> viewer from the comfort of your web browser.
+
+* <b>Tutorials and documentation.</b> We have prepared a set of <a href="https://github.com/Xilinx/finn/tree/master/notebooks">Jupyter notebooks</a> 
+  to let you experiment with some of the things FINN can do, covering the basics, demonstrating the end-to-end flow on an example network, 
+  and discussing some of the internals for more advanced users and developers. We also have Sphinx-generated documentation on 
+  <a href="http://finn.readthedocs.io/">readthedocs</a> for more information on the FINN compiler and its API.
+
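+As a small taste of the Brevitas export step mentioned above, the sketch below shows roughly how a trained network is exported to FINN-ONNX. It follows the flow used in our tutorial notebooks: the `LFC` model comes from the brevitas_cnv_lfc examples, and exact module paths and function arguments may differ between versions, so treat this as a sketch rather than a reference.
+
+```Python
+import brevitas.onnx as bo
+from models.LFC import LFC  # from the brevitas_cnv_lfc example repo
+
+# a binarized (1-bit weights and activations) fully-connected MNIST network
+lfc = LFC(weight_bit_width=1, act_bit_width=1, in_bit_width=1)
+# ... train the network, or load a pre-trained checkpoint, here ...
+bo.export_finn_onnx(lfc, (1, 1, 28, 28), "lfc-w1a1.onnx")
+```
+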
+The release (tagged 0.2b) is now available on GitHub. Currently it's a beta release and only supports fully-connected layers in linear 
+(non-branching) topologies, but we're actively working on end-to-end convolution support for the next release. Further down the 
+road, we hope to support more advanced topologies and provide end-to-end examples for MobileNet and ResNet-50.
diff --git a/docs/_posts/2020-03-11-rn50-released.md b/docs/_posts/2020-03-11-rn50-released.md
new file mode 100644
index 0000000000000000000000000000000000000000..baa924410cf56a07e22a6c85450205d18a4d45bb
--- /dev/null
+++ b/docs/_posts/2020-03-11-rn50-released.md
@@ -0,0 +1,75 @@
+---
+layout: post
+title:  "ResNet50 for Alveo released"
+author: "Lucian Petrica"
+---
+
+We're pleased to announce as part of the FINN project our release of the first fully quantized, all-dataflow ResNet50 inference accelerator for Xilinx Alveo boards. The source code is available on [GitHub](https://github.com/Xilinx/ResNet50-PYNQ) and we provide a Python [package](https://pypi.org/project/resnet50-pynq/) and Jupyter Notebook to get you started and show how the accelerator is controlled using [PYNQ](http://www.pynq.io/) for Alveo.
+Built using a custom [FINN](https://xilinx.github.io/finn/about.html) streamlining flow, which is not yet public, 
+this accelerator showcases the advantage of deep quantization for FPGA acceleration of DNN workloads in the datacenter. 
+The key performance metrics are:
+
+FPGA Device | ImageNet Accuracy     | Max FPS    | Min Latency | Power @ Max FPS | Power @ Min Latency
+----------  |----------             |----------  |----------   |----------       |----------
+Alveo U250  | 65% Top-1 / 85% Top-5 | 2000       | 2 ms      | 70 W            | 40 W
+
+In addition to demonstrating the achievable performance of low-precision dataflow acceleration on Alveo, the ResNet50 design
+serves as proof of concept for two key features of future FINN releases: 
+modular build flows based on Vivado IP Integrator, and pure Python interface to the accelerator. 
+
+## Modular build flow
+
+FINN accelerators targeting embedded parts, such as the [BNN-PYNQ](https://github.com/Xilinx/BNN-PYNQ) accelerators, have in the past implemented the
+entire acceleration functionality in a single monolithic HLS C++ description.
+For large datacenter-class designs this approach is not feasible, as the HLS simulation and synthesis times become very large.
+
+Instead, here we identify the key computational pattern, the residual block, which we implement as an HLS C++ IP block by assembling multiple Matrix-Vector-Activation Units from the [FINN HLS Library](https://github.com/Xilinx/finn-hlslib). 
+We then construct the accelerator by instantiating and connecting multiple residual blocks in a Vivado IPI block design; the blocks are synthesized in parallel and the assembled design is exported as a netlist IP.
+
+<img align="left" src="https://xilinx.github.io/finn/img/rn50-ipi.png" alt="drawing" style="margin-right: 20px" width="300"/>
+
+
+In our flow, this IP is linked by Vitis into an Alveo platform, but users are free to integrate the ResNet50 IP in their own Vivado-based flows and augment it with other HLS or RTL IP. See our build scripts and documentation for more information.
+
+## Pure Python host interface
+
+Using PYNQ for Alveo, users can interface directly with the ResNet50 accelerator in Python.
+To program the accelerator, an Overlay object is created from an XCLBin file produced by Vitis.
+
+```Python
+import pynq
+import numpy as np  # used by the buffer examples below
+
+ol = pynq.Overlay("resnet50.xclbin")
+accelerator = ol.resnet50_1
+```
+
+Before using the accelerator, we must configure the weights of the fully-connected layer in DDR Bank 0.
+Assuming the weights are already loaded in the NumPy array `fcweights`, we allocate a buffer 
+of appropriate size, copy the weights into it, and flush it to the Alveo DDR Bank 0.
+
+```Python
+fcbuf = pynq.allocate((1000,2048), dtype=np.int8, target=ol.bank0)
+fcbuf[:] = fcweights
+fcbuf.sync_to_device()
+```
+
+To perform inference we first allocate input and output buffers for one image, and copy the contents of the NumPy array `img` into the input buffer.
+We then flush the input data to the Alveo DDR Bank 0, and call the accelerator providing as arguments
+the input and output buffers, the FC layer weights buffer, and the number of images to process, in this case just one.
+After the call finishes, we pull the output buffer data from the accelerator DDR to host memory and copy its 
+contents to user memory in a NumPy array.
+
+```Python
+inbuf = pynq.allocate((224,224,3), dtype=np.int8, target=ol.bank0)
+outbuf = pynq.allocate((5,), dtype=np.uint32, target=ol.bank0)
+
+inbuf[:] = img
+inbuf.sync_to_device()
+
+accelerator.call(inbuf, outbuf, fcbuf, 1)
+
+outbuf.sync_from_device()
+results = np.copy(outbuf)
+```
+
+It's that easy! See our Jupyter Notebook demo and application examples for more details.
diff --git a/docs/_posts/2020-03-27-brevitas-quartznet-release.md b/docs/_posts/2020-03-27-brevitas-quartznet-release.md
new file mode 100644
index 0000000000000000000000000000000000000000..0940f754815c834662919404860b8a7b00d08e64
--- /dev/null
+++ b/docs/_posts/2020-03-27-brevitas-quartznet-release.md
@@ -0,0 +1,92 @@
+---
+layout: post
+title:  "Quantized QuartzNet with Brevitas for efficient speech recognition"
+author: "Giuseppe Franco"
+---
+
+*Although not yet supported in FINN, we are excited to show you how Brevitas and quantized neural network training techniques can be applied to models beyond image classification.*
+
+We are pleased to announce the release of quantized pre-trained models of [QuartzNet](https://arxiv.org/abs/1910.10261) for efficient speech recognition.
+They can be found at the [following link](https://github.com/Xilinx/brevitas/tree/master/examples/speech_to_text), with a brief
+explanation of how to test them.
+The quantized version of QuartzNet has been trained using [Brevitas](https://github.com/Xilinx/brevitas), an experimental library for quantization-aware training.
+
+QuartzNet, whose structure can be seen in Fig. 1, is a convolution-based speech-to-text network with a structure similar to that of [Jasper](https://arxiv.org/abs/1904.03288).
+
+| <img src="https://xilinx.github.io/finn/img/QuartzNet.jpg" alt="QuartzNet Structure" title="QuartzNet Structure" width="450" height="500" align="center"/>|
+| :---:|
+| *Fig. 1 QuartzNet Model, [source](https://arxiv.org/abs/1910.10261)* |
+
+The starting point is the mel-spectrogram representation of the input audio file.
+Through repeated base building blocks of 1D Convolutions (1D-Conv), Batch-Normalizations (BN), and ReLU with residual connections,
+QuartzNet is able to reconstruct the underlying text.
+The main difference with respect to Jasper is the use of Depthwise and Pointwise 1D-Conv (Fig. 2a), instead of 'simple' 1D-Conv (Fig. 2b).
+Thanks to this structure, QuartzNet achieves better performance in terms of Word Error Rate (WER) than Jasper,
+using *only* 19.9M parameters, compared to Jasper's 333M parameters.
+
+Moreover, the authors proposed a grouped-pointwise convolution strategy that greatly reduces the number of parameters,
+down to 8.7M, at the cost of a small degradation in accuracy.
+
+| <img src="https://xilinx.github.io/finn/img/quartzPic1.jpg" alt="QuartzNet block" title="QuartzNet block" width="130" height="220" align="center"/> | <img src="https://xilinx.github.io/finn/img/JasperVertical4.jpg" alt="Jasper block" title="Jasper block" width="130" height="220" align="center"/>|
+| :---:|:---:|
+| *Fig. 2a QuartzNet Block, [source](https://arxiv.org/abs/1910.10261)* | *Fig. 2b Jasper Block [source](https://arxiv.org/abs/1904.03288)*  |
+
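+To make the difference concrete, here is a minimal PyTorch sketch of the two convolution styles; the channel count and kernel size are illustrative only, not the actual QuartzNet dimensions.
+
+```Python
+import torch.nn as nn
+
+C, K = 256, 33  # illustrative channel count and kernel size
+
+# 'simple' 1D convolution, as in Jasper: C*C*K weight parameters
+jasper_conv = nn.Conv1d(C, C, kernel_size=K, padding=K // 2)
+
+# depthwise + pointwise pair, as in QuartzNet: C*K + C*C weight parameters
+quartznet_conv = nn.Sequential(
+    nn.Conv1d(C, C, kernel_size=K, padding=K // 2, groups=C),  # depthwise
+    nn.Conv1d(C, C, kernel_size=1),                            # pointwise
+)
+```
+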
+
+The authors of QuartzNet propose different BxR configurations, where each B<sub>i</sub> block consists of the base building block described above,
+repeated R times.
+Different BxR configurations have been trained on several different datasets (Wall Street Journal,
+LibriSpeech + Mozilla Common Voice, LibriSpeech only).
+
+For our quantization experiments, we focus on the 15x5 variant trained on LibriSpeech with spec-augmentation and without grouped convolutions.
+More details about this configuration can be found in the paper and in a [related discussion with the authors](https://github.com/NVIDIA/NeMo/issues/230).
+
+Starting from the [official implementation](https://github.com/NVIDIA/NeMo/blob/master/examples/asr/quartznet.py),
+the first step was to implement a quantized version of the topology in Brevitas, using quantized convolutions and activations.
+
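+In practice this means swapping each convolution and activation for its Brevitas counterpart, along the lines of the sketch below. The layer sizes are illustrative, and the exact quantizer keyword arguments (e.g. for selecting per-tensor versus per-channel scaling) depend on the Brevitas version, so treat this as a sketch rather than the exact code we used.
+
+```Python
+import brevitas.nn as qnn
+
+# illustrative sizes, not the actual QuartzNet 15x5 dimensions
+conv = qnn.QuantConv1d(256, 256, kernel_size=33, padding=16,
+                       weight_bit_width=4)  # quantized weights
+act = qnn.QuantReLU(bit_width=4)            # quantized activations
+```
+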
+After implementing the quantized version, the second step was to re-train the model, starting
+from the [pre-trained models](https://ngc.nvidia.com/catalog/models/nvidia:quartznet_15x5_ls_sp)
+kindly released by the authors.
+
+We focused on three main quantization configurations: two configurations at 8 bit, with per-tensor and per-channel scaling respectively,
+and one configuration at 4 bit, with per-channel scaling.
+
+We compare our results with those achieved by the authors, not only in terms of pure WER, but also in terms of the parameters' memory footprint
+and the number of operations performed. Note that the WER is always based on greedy decoding. The results can be seen in Fig. 3 and Fig. 4,
+and are summarized in Table 1.
+
+| Configuration | Word Error Rate (WER) | Memory Footprint (MegaByte) | Mega MACs |
+| :-----------: | :-------------------: | :-------------------------: | :-------: |
+| FP 300E, 1G   | 11.58%                | 37.69                       | 1658.54   |
+| FP 400E, 1G   | 11.08%                | 37.69                       | 1658.54   |
+| FP 1500E, 1G  | 10.78%                | 37.69                       | 1658.54   |
+| FP 300E, 2G   | 12.52%                | 24.06                       | 1058.75   |
+| FP 300E, 4G   | 13.48%                | 17.25                       |  758.86   |
+| 8 bit, 1G Per-Channel scaling| 10.98% | 18.58                       |  414.63   |
+| 8 bit, 1G Per-Tensor scaling | 11.03% | 18.58                       |  414.63   |
+| 4 bit, 1G Per-Channel scaling| 12.00% |  9.44                       |  104.18   |
+
+| <img src="https://xilinx.github.io/finn/img/WERMB.jpg" alt="WERvsMB" title="WERvsMB" width="500" height="300" align="center"/> |
+| :---:|
+| *Fig. 3 Memory footprint over WER on LibriSpeech dev-other* |
+
+| <img src="https://xilinx.github.io/finn/img/WERNops.jpg" alt="WERvsMACs" title="WERvsMACs" width="500" height="300" align="center"/> |
+| :---: |
+| *Fig. 4 Number of MACs Operations over WER on LibriSpeech dev-other*  |
+
+In evaluating the memory footprint, we consider half-precision (16 bit) Floating Point (FP) numbers for the original QuartzNet.
+As we can see in Fig. 3, the quantized implementations achieve accuracy comparable to the corresponding floating-point version,
+while greatly reducing the memory occupation. In the graph, the term <em>E</em> stands for Epochs, while <em>G</em> stands for Groups, referring
+to the number of groups used for the grouped convolutions.
+In the case of our 4 bit implementation, the first and last layer are left at 8 bit, but this is taken into account in the computation
+of both the memory occupation and the number of operations.
+Notice how the 4 bit version greatly reduces the memory footprint of the network compared to the grouped convolution variants, while still achieving better accuracy.
+
+
+For comparing accuracy against the number of multiply-accumulate operations (MACs), we treat 16 bit floating-point multiplications as 16 bit integer multiplications.
+This means that we are greatly underestimating the complexity of the operations performed in the original floating-point QuartzNet model.
+Assuming an n^2 growth in the cost of integer multiplication, we consider a 4 bit MAC 16x less expensive than a 16 bit one.
+The number of MACs in Fig. 4 is normalized with respect to 16 bit.
+Also in this case, it is clear that the quantized versions greatly reduce the number of operations required,
+with little-to-no degradation in accuracy. In particular, the 8 bit versions already achieve a better WER with a lower number of MACs
+than the grouped convolution variants, and this is confirmed by the 4 bit version, at the cost of a small degradation in
+WER.
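+
+The MAC-cost normalization itself is simple arithmetic; the following sketch reproduces the factors used above under the stated n^2 assumption.
+
+```Python
+def relative_mac_cost(bits, ref_bits=16):
+    # cost of a MAC at `bits` bits relative to a 16 bit MAC,
+    # assuming multiplication cost grows as n^2
+    return (bits / ref_bits) ** 2
+
+print(relative_mac_cost(4))  # 0.0625 -> a 4 bit MAC is 16x cheaper
+print(relative_mac_cost(8))  # 0.25   -> an 8 bit MAC is 4x cheaper
+```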
diff --git a/docs/img/JasperVertical4.jpg b/docs/img/JasperVertical4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d7364ec8a99f51e77b421c85a8da4eebe2883751
Binary files /dev/null and b/docs/img/JasperVertical4.jpg differ
diff --git a/docs/img/QuartzNet.jpg b/docs/img/QuartzNet.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ce258fcd5f458caae606af0973c2eb14aea0af27
Binary files /dev/null and b/docs/img/QuartzNet.jpg differ
diff --git a/docs/img/WERMB.jpg b/docs/img/WERMB.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3c1ce7d6bc3e378f6e75c204a01538f02a9cb007
Binary files /dev/null and b/docs/img/WERMB.jpg differ
diff --git a/docs/img/WERNops.jpg b/docs/img/WERNops.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e539bb26077fb98f9a0f7b554ed63a18d57207a1
Binary files /dev/null and b/docs/img/WERNops.jpg differ
diff --git a/docs/img/quartzPic1.jpg b/docs/img/quartzPic1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cec4829f2187d720be8589d075c83443eaaef69c
Binary files /dev/null and b/docs/img/quartzPic1.jpg differ
diff --git a/docs/img/rn50-ipi.png b/docs/img/rn50-ipi.png
new file mode 100644
index 0000000000000000000000000000000000000000..504b011c9660b446ae39d407a8ce3d824bd2cd6a
Binary files /dev/null and b/docs/img/rn50-ipi.png differ
diff --git a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v
index a1c8e066d7290699575c94b0ac939d5a3fb27b19..28acb301a583f7437c580744bae7bdc4aef76337 100644
--- a/finn-rtllib/memstream/hdl/memstream.v
+++ b/finn-rtllib/memstream/hdl/memstream.v
@@ -1,465 +1,467 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module memstream
-#(
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
-    parameter CONFIG_EN = 1,
-    parameter NSTREAMS = 6,//1 up to 6
-
-    parameter MEM_DEPTH = 13824,
-    parameter MEM_WIDTH = 32,
-    parameter MEM_INIT = "./",
-    
-    //widths per stream
-	parameter STRM0_WIDTH = 32,
-	parameter STRM1_WIDTH = 32,
-	parameter STRM2_WIDTH = 32,
-	parameter STRM3_WIDTH = 32,
-	parameter STRM4_WIDTH = 32,
-	parameter STRM5_WIDTH = 32,
-
-	//depths per stream
-	parameter STRM0_DEPTH = 2304,
-	parameter STRM1_DEPTH = 2304,
-	parameter STRM2_DEPTH = 2304,
-	parameter STRM3_DEPTH = 2304,
-	parameter STRM4_DEPTH = 2304,
-	parameter STRM5_DEPTH = 2304,
-
-	//offsets for each stream
-	parameter STRM0_OFFSET = 0,
-	parameter STRM1_OFFSET = 2304,
-	parameter STRM2_OFFSET = 4608,
-	parameter STRM3_OFFSET = 6912,
-	parameter STRM4_OFFSET = 9216,
-	parameter STRM5_OFFSET = 11520
-)
-
-(
-    input aclk,
-    input aresetn,
-
-    //optional configuration interface compatible with ap_memory
-	input [31:0] config_address,
-	input config_ce,
-	input config_we,
-	input [31:0] config_d0,
-	output [31:0] config_q0,
-       
-    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
-    input m_axis_0_afull,
-    input m_axis_0_tready,
-    output m_axis_0_tvalid,
-    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
-    
-    input m_axis_1_afull,
-    input m_axis_1_tready,
-    output m_axis_1_tvalid,
-    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
-    
-    input m_axis_2_afull,
-    input m_axis_2_tready,
-    output m_axis_2_tvalid,
-    output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
-    
-    input m_axis_3_afull,
-    input m_axis_3_tready,
-    output m_axis_3_tvalid,
-    output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
-    
-    input m_axis_4_afull,
-    input m_axis_4_tready,
-    output m_axis_4_tvalid,
-    output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
-    
-    input m_axis_5_afull,
-    input m_axis_5_tready,
-    output m_axis_5_tvalid,
-    output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
-    
-
-);
-
-//calculate number of RAMB18 blocks we need depth-wise
-localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
-
-//calculate width of address for each block
-localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
-
-//determine whether a stream needs to multiplex between memory blocks
-localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
-localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
-localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
-localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
-localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
-localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
-
-//determine what the base block of each stream is
-localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
-localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
-localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
-localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
-localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
-localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
-
-//determine what the end block of each stream is
-localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
-localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
-localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
-localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
-localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
-localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
-
-//determine the number of blocks spanned by each stream
-localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
-localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
-localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
-localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
-localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
-localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
-
-//TODO: check that memory width is equal to the widest stream
-//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module memstream
+#(
+//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
+    parameter CONFIG_EN = 1,
+    parameter NSTREAMS = 6,//1 up to 6
+
+    parameter MEM_DEPTH = 13824,
+    parameter MEM_WIDTH = 32,
+    parameter MEM_INIT = "./",
+    parameter RAM_STYLE = "auto",
+
+    //widths per stream
+	parameter STRM0_WIDTH = 32,
+	parameter STRM1_WIDTH = 32,
+	parameter STRM2_WIDTH = 32,
+	parameter STRM3_WIDTH = 32,
+	parameter STRM4_WIDTH = 32,
+	parameter STRM5_WIDTH = 32,
+
+	//depths per stream
+	parameter STRM0_DEPTH = 2304,
+	parameter STRM1_DEPTH = 2304,
+	parameter STRM2_DEPTH = 2304,
+	parameter STRM3_DEPTH = 2304,
+	parameter STRM4_DEPTH = 2304,
+	parameter STRM5_DEPTH = 2304,
+
+	//offsets for each stream
+	parameter STRM0_OFFSET = 0,
+	parameter STRM1_OFFSET = 2304,
+	parameter STRM2_OFFSET = 4608,
+	parameter STRM3_OFFSET = 6912,
+	parameter STRM4_OFFSET = 9216,
+	parameter STRM5_OFFSET = 11520
+)
+
+(
+    input aclk,
+    input aresetn,
+
+    //optional configuration interface compatible with ap_memory
+	input [31:0] config_address,
+	input config_ce,
+	input config_we,
+	input [31:0] config_d0,
+	output [31:0] config_q0,
+
+    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
+    input m_axis_0_afull,
+    input m_axis_0_tready,
+    output m_axis_0_tvalid,
+    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
+
+    input m_axis_1_afull,
+    input m_axis_1_tready,
+    output m_axis_1_tvalid,
+    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
+
+    input m_axis_2_afull,
+    input m_axis_2_tready,
+    output m_axis_2_tvalid,
+    output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
+
+    input m_axis_3_afull,
+    input m_axis_3_tready,
+    output m_axis_3_tvalid,
+    output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
+
+    input m_axis_4_afull,
+    input m_axis_4_tready,
+    output m_axis_4_tvalid,
+    output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
+
+    input m_axis_5_afull,
+    input m_axis_5_tready,
+    output m_axis_5_tvalid,
+    output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
+
+
+);
+
+//calculate number of RAMB18 blocks we need depth-wise
+localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
+
+//calculate width of address for each block
+localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
+
+//determine whether a stream needs to multiplex between memory blocks
+localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
+localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
+localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
+localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
+localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
+localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
+
+//determine what the base block of each stream is
+localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
+localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
+localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
+localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
+localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
+localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
+
+//determine what the end block of each stream is
+localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
+localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
+localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
+localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
+localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
+localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
+
+//determine the number of blocks spanned by each stream
+localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
+localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
+localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
+localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
+localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
+localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
+
+//TODO: check that memory width is equal to the widest stream
+//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
 initial begin
     if((NSTREAMS < 1) | (NSTREAMS > 6)) begin
         $display("Invalid setting for NSTREAMS, please set in range [1,6]");
         $finish();
     end
 end
-
-//invert reset
-wire rst;
-assign rst = ~aresetn;
-
-//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
-//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth
-
-reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
-
-reg strm0_incr_en;
-reg strm1_incr_en;
-reg strm2_incr_en;
-reg strm3_incr_en;
-reg strm4_incr_en;
-reg strm5_incr_en;
-
-wire strm0_rst;
-wire strm1_rst;
-wire strm2_rst;
-wire strm3_rst;
-wire strm4_rst;
-wire strm5_rst;
-
-reg strm0_ready;
-reg strm1_ready;
-reg strm2_ready;
-reg strm3_ready;
-reg strm4_ready;
-reg strm5_ready;
-
-//arbiter: work on one stream at a time
-//multiplex each port between (up to) half of the streams 
-reg [1:0] current_stream_porta = 0;
-reg [1:0] current_stream_portb = 0;
-
-always @(posedge aclk) begin
-    if(rst)
-        current_stream_porta <= 0;
-    else case(current_stream_porta)
-        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
-        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
-        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
-    endcase
-    if(rst)
-        current_stream_portb <= 0;
-    else case(current_stream_portb)
-        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
-        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
-        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
-    endcase
-end
-
-always @(posedge aclk) begin
-    if(rst) begin
-        strm0_incr_en <= 0;
-        strm1_incr_en <= 0;
-        strm2_incr_en <= 0;
-        strm3_incr_en <= 0;
-        strm4_incr_en <= 0;
-        strm5_incr_en <= 0;
-    end else begin
-        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
-        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
-        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
-        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
-        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
-        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
-    end
-end
-
-assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
-assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
-assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
-assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
-assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
-assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
-
-always @(posedge aclk) begin
-    strm0_ready <= ~m_axis_0_afull;
-    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
-    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
-    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
-    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
-    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
-end
-
-//one address counter per stream; more LUTs but keeps routing short and local
-always @(posedge aclk) begin
-    if(strm0_rst | rst)
-        strm0_addr <= STRM0_OFFSET;
-    else if(strm0_incr_en)
-        strm0_addr <= strm0_addr + 1;
-    if(strm1_rst | rst)
-        strm1_addr <= STRM1_OFFSET;
-    else if(strm1_incr_en)
-        strm1_addr <= strm1_addr + 1;
-    if(strm2_rst | rst)
-        strm2_addr <= STRM2_OFFSET;
-    else if(strm2_incr_en)
-        strm2_addr <= strm2_addr + 1;
-    if(strm3_rst | rst)
-        strm3_addr <= STRM3_OFFSET;
-    else if(strm3_incr_en)
-        strm3_addr <= strm3_addr + 1;
-    if(strm4_rst | rst)
-        strm4_addr <= STRM4_OFFSET;
-    else if(strm4_incr_en)
-        strm4_addr <= strm4_addr + 1;
-    if(strm5_rst | rst)
-        strm5_addr <= STRM5_OFFSET;
-    else if(strm5_incr_en)
-        strm5_addr <= strm5_addr + 1;
-end
-
-reg [$clog2(MEM_DEPTH)-1:0] addra;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
-
-reg [$clog2(MEM_DEPTH)-1:0] addrb;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
-
-wire [NMEMBLOCKS-1:0] we;
-
-reg [1:0] addr_select_porta;
-reg [1:0] addr_select_portb;
-
-//multiplex addresses of various streams into address ports of memory
-always @(posedge aclk) begin
-    addr_select_porta <= current_stream_porta;
-    case(addr_select_porta)
-        0: addra <= strm0_addr;
-        1: addra <= strm2_addr;
-        2: addra <= strm4_addr;
-    endcase
-    addr_select_portb <= current_stream_portb;
-    case(addr_select_portb)
-        0: addrb <= strm1_addr;
-        1: addrb <= strm3_addr;
-        2: addrb <= strm5_addr;
-    endcase
-end
-
-genvar g;
-generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
-
-assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
-
-ramb18_wf_dualport
-#(
-    .ID(g),
-	.DWIDTH(MEM_WIDTH),
-	.AWIDTH(BLOCKADRWIDTH),
-	.MEM_INIT(MEM_INIT)
-)
-ram
-(
-	.clk(aclk),
-	
-	.wea(we[g]),
-	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
-	.wdataa(config_d0),
-	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
-
-	.web(1'b0),
-	.addrb(addrb[BLOCKADRWIDTH-1:0]),
-	.wdatab('d0),
-	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
-);
-
-end
-endgenerate
-
-integer i;
-
-generate if(NMEMBLOCKS > 1) begin: multiblock
-
-wire [MEM_WIDTH-1:0] rdqmux[5:0];
-
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
-
-always @(posedge aclk) begin
-    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
-    for(i=0; i<2; i=i+1) begin
-		rdblocka[i+1] <= rdblocka[i];
-		rdblockb[i+1] <= rdblockb[i];
-    end
-end
-
-if(NSTREAMS >= 1) begin: en_strm0
-	if(STRM0_MUX == 1) begin: mux0
-		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
-	end else begin: nomux0
-		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 2) begin: en_strm1
-	if(STRM1_MUX == 1) begin: mux1
-		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
-	end else begin: nomux1
-		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 3) begin: en_strm2
-	if(STRM2_MUX == 1) begin: mux2
-		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
-	end else begin: nomux2
-		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 4) begin: en_strm3
-	if(STRM3_MUX == 1) begin: mux3
-		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
-	end else begin: nomux3
-		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 5) begin: en_strm4
-	if(STRM4_MUX == 1) begin: mux4
-		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
-	end else begin: nomux4
-		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 6) begin: en_strm5
-	if(STRM5_MUX == 1) begin: mux5
-		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
-	end else begin: nomux5
-		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
-	end
-	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
-end
-
-end else begin: singleblock
-
-if(NSTREAMS >= 1) begin: en_strm0_direct
-    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
-end
-if(NSTREAMS >= 2) begin: en_strm1_direct
-	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
-end
-if(NSTREAMS >= 3) begin: en_strm2_direct
-	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
-end
-if(NSTREAMS >= 4) begin: en_strm3_direct
-	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
-end
-if(NSTREAMS >= 5) begin: en_strm4_direct
-	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
-end
-if(NSTREAMS >= 6) begin: en_strm5_direct
-	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
-end
-
-end
-endgenerate
-
-//output to AXI Streams
-reg tvalid_pipe0[2:0];
-reg tvalid_pipe1[2:0];
-reg tvalid_pipe2[2:0];
-reg tvalid_pipe3[2:0];
-reg tvalid_pipe4[2:0];
-reg tvalid_pipe5[2:0];
-
-assign m_axis_0_tvalid = tvalid_pipe0[2];
-assign m_axis_1_tvalid = tvalid_pipe1[2];
-assign m_axis_2_tvalid = tvalid_pipe2[2];
-assign m_axis_3_tvalid = tvalid_pipe3[2];
-assign m_axis_4_tvalid = tvalid_pipe4[2];
-assign m_axis_5_tvalid = tvalid_pipe5[2];
-
-
-always @(posedge aclk) begin
-    tvalid_pipe0[0] <= strm0_incr_en;
-    tvalid_pipe1[0] <= strm1_incr_en;
-    tvalid_pipe2[0] <= strm2_incr_en;
-    tvalid_pipe3[0] <= strm3_incr_en;
-    tvalid_pipe4[0] <= strm4_incr_en;
-    tvalid_pipe5[0] <= strm5_incr_en;
-    for(i=0; i<2; i=i+1) begin: srl
-        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
-        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
-        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
-        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
-        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
-        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
-    end
-end
-
-assign config_q0 = 0;
-
-endmodule
\ No newline at end of file
+
+//invert reset
+wire rst;
+assign rst = ~aresetn;
+
+//WARNING: the read pipeline is deeper than the number of streams per port, so an in-flight read may find its target stream FIFO full by the time it completes
+//solution: use prog-full to guarantee the FIFO always has at least as many free slots as the read pipeline depth
+
+reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
+reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
+
+reg strm0_incr_en;
+reg strm1_incr_en;
+reg strm2_incr_en;
+reg strm3_incr_en;
+reg strm4_incr_en;
+reg strm5_incr_en;
+
+wire strm0_rst;
+wire strm1_rst;
+wire strm2_rst;
+wire strm3_rst;
+wire strm4_rst;
+wire strm5_rst;
+
+reg strm0_ready;
+reg strm1_ready;
+reg strm2_ready;
+reg strm3_ready;
+reg strm4_ready;
+reg strm5_ready;
+
+//arbiter: work on one stream at a time
+//multiplex each port between (up to) half of the streams
+reg [1:0] current_stream_porta = 0;
+reg [1:0] current_stream_portb = 0;
+
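+//port A arbitrates among streams 0, 2 and 4; port B among streams 1, 3 and 5
+//each port advances to the next ready stream and holds when no other stream is ready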
+always @(posedge aclk) begin
+    if(rst)
+        current_stream_porta <= 0;
+    else case(current_stream_porta)
+        0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
+        1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
+        2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
+    endcase
+    if(rst)
+        current_stream_portb <= 0;
+    else case(current_stream_portb)
+        0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
+        1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
+        2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
+    endcase
+end
+
+always @(posedge aclk) begin
+    if(rst) begin
+        strm0_incr_en <= 0;
+        strm1_incr_en <= 0;
+        strm2_incr_en <= 0;
+        strm3_incr_en <= 0;
+        strm4_incr_en <= 0;
+        strm5_incr_en <= 0;
+    end else begin
+        strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
+        strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
+        strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
+        strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
+        strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
+        strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
+    end
+end
+
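+//wrap each stream's read address back to its offset once the last word of its region is read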
+assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
+assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
+assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
+assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
+assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
+assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
+
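+//a stream is ready when its output FIFO is not almost-full; streams beyond NSTREAMS are held not-ready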
+always @(posedge aclk) begin
+    strm0_ready <= ~m_axis_0_afull;
+    strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
+    strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
+    strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
+    strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
+    strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
+end
+
+//one address counter per stream; more LUTs but keeps routing short and local
+always @(posedge aclk) begin
+    if(strm0_rst | rst)
+        strm0_addr <= STRM0_OFFSET;
+    else if(strm0_incr_en)
+        strm0_addr <= strm0_addr + 1;
+    if(strm1_rst | rst)
+        strm1_addr <= STRM1_OFFSET;
+    else if(strm1_incr_en)
+        strm1_addr <= strm1_addr + 1;
+    if(strm2_rst | rst)
+        strm2_addr <= STRM2_OFFSET;
+    else if(strm2_incr_en)
+        strm2_addr <= strm2_addr + 1;
+    if(strm3_rst | rst)
+        strm3_addr <= STRM3_OFFSET;
+    else if(strm3_incr_en)
+        strm3_addr <= strm3_addr + 1;
+    if(strm4_rst | rst)
+        strm4_addr <= STRM4_OFFSET;
+    else if(strm4_incr_en)
+        strm4_addr <= strm4_addr + 1;
+    if(strm5_rst | rst)
+        strm5_addr <= STRM5_OFFSET;
+    else if(strm5_incr_en)
+        strm5_addr <= strm5_addr + 1;
+end
+
+reg [$clog2(MEM_DEPTH)-1:0] addra;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
+
+reg [$clog2(MEM_DEPTH)-1:0] addrb;
+wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
+
+wire [NMEMBLOCKS-1:0] we;
+
+reg [1:0] addr_select_porta;
+reg [1:0] addr_select_portb;
+
+//multiplex addresses of various streams into address ports of memory
+always @(posedge aclk) begin
+    addr_select_porta <= current_stream_porta;
+    case(addr_select_porta)
+        0: addra <= strm0_addr;
+        1: addra <= strm2_addr;
+        2: addra <= strm4_addr;
+    endcase
+    addr_select_portb <= current_stream_portb;
+    case(addr_select_portb)
+        0: addrb <= strm1_addr;
+        1: addrb <= strm3_addr;
+        2: addrb <= strm5_addr;
+    endcase
+end
+
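+//one RAM primitive per memory block; port A doubles as the config write port, port B is read-only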
+genvar g;
+generate for(g=0; g<NMEMBLOCKS; g=g+1) begin: blockports
+
+assign we[g] = (CONFIG_EN == 1) & config_ce & config_we & (config_address[31:BLOCKADRWIDTH] == g);
+
+ramb18_wf_dualport
+#(
+    .ID(g),
+	.DWIDTH(MEM_WIDTH),
+	.AWIDTH(BLOCKADRWIDTH),
+	.MEM_INIT(MEM_INIT),
+    .RAM_STYLE(RAM_STYLE)
+)
+ram
+(
+	.clk(aclk),
+
+	.wea(we[g]),
+	.addra(we[g] ? config_address[BLOCKADRWIDTH-1:0] : addra[BLOCKADRWIDTH-1:0]),
+	.wdataa(config_d0),
+	.rdqa(rdqa[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH]),
+
+	.web(1'b0),
+	.addrb(addrb[BLOCKADRWIDTH-1:0]),
+	.wdatab('d0),
+	.rdqb(rdqb[(g+1)*MEM_WIDTH-1:g*MEM_WIDTH])
+);
+
+end
+endgenerate
+
+integer i;
+
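+//with multiple memory blocks each stream selects its data from the block(s) assigned to it;
+//with a single block the RAM outputs feed the streams directly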
+generate if(NMEMBLOCKS > 1) begin: multiblock
+
+wire [MEM_WIDTH-1:0] rdqmux[5:0];
+
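+//track the block index of in-flight reads so the matching RAM output can be selected when the data arrives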
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
+reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
+
+always @(posedge aclk) begin
+    rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
+    for(i=0; i<2; i=i+1) begin
+		rdblocka[i+1] <= rdblocka[i];
+		rdblockb[i+1] <= rdblockb[i];
+    end
+end
+
+if(NSTREAMS >= 1) begin: en_strm0
+	if(STRM0_MUX == 1) begin: mux0
+		mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
+	end else begin: nomux0
+		assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 2) begin: en_strm1
+	if(STRM1_MUX == 1) begin: mux1
+		mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
+	end else begin: nomux1
+		assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 3) begin: en_strm2
+	if(STRM2_MUX == 1) begin: mux2
+		mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
+	end else begin: nomux2
+		assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 4) begin: en_strm3
+	if(STRM3_MUX == 1) begin: mux3
+		mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
+	end else begin: nomux3
+		assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 5) begin: en_strm4
+	if(STRM4_MUX == 1) begin: mux4
+		mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
+	end else begin: nomux4
+		assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
+end
+
+if(NSTREAMS >= 6) begin: en_strm5
+	if(STRM5_MUX == 1) begin: mux5
+		mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
+	end else begin: nomux5
+		assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
+	end
+	assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
+end
+
+end else begin: singleblock
+
+if(NSTREAMS >= 1) begin: en_strm0_direct
+    assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
+end
+if(NSTREAMS >= 2) begin: en_strm1_direct
+	assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
+end
+if(NSTREAMS >= 3) begin: en_strm2_direct
+	assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
+end
+if(NSTREAMS >= 4) begin: en_strm3_direct
+	assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
+end
+if(NSTREAMS >= 5) begin: en_strm4_direct
+	assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
+end
+if(NSTREAMS >= 6) begin: en_strm5_direct
+	assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
+end
+
+end
+endgenerate
+
+//output to AXI Streams
+reg tvalid_pipe0[2:0];
+reg tvalid_pipe1[2:0];
+reg tvalid_pipe2[2:0];
+reg tvalid_pipe3[2:0];
+reg tvalid_pipe4[2:0];
+reg tvalid_pipe5[2:0];
+
+assign m_axis_0_tvalid = tvalid_pipe0[2];
+assign m_axis_1_tvalid = tvalid_pipe1[2];
+assign m_axis_2_tvalid = tvalid_pipe2[2];
+assign m_axis_3_tvalid = tvalid_pipe3[2];
+assign m_axis_4_tvalid = tvalid_pipe4[2];
+assign m_axis_5_tvalid = tvalid_pipe5[2];
+
+
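+//tvalid is delayed to match the read latency from address registration to data at the RAM output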
+always @(posedge aclk) begin
+    tvalid_pipe0[0] <= strm0_incr_en;
+    tvalid_pipe1[0] <= strm1_incr_en;
+    tvalid_pipe2[0] <= strm2_incr_en;
+    tvalid_pipe3[0] <= strm3_incr_en;
+    tvalid_pipe4[0] <= strm4_incr_en;
+    tvalid_pipe5[0] <= strm5_incr_en;
+    for(i=0; i<2; i=i+1) begin: srl
+        tvalid_pipe0[i+1] <= tvalid_pipe0[i];
+        tvalid_pipe1[i+1] <= tvalid_pipe1[i];
+        tvalid_pipe2[i+1] <= tvalid_pipe2[i];
+        tvalid_pipe3[i+1] <= tvalid_pipe3[i];
+        tvalid_pipe4[i+1] <= tvalid_pipe4[i];
+        tvalid_pipe5[i+1] <= tvalid_pipe5[i];
+    end
+end
+
+assign config_q0 = 0;
+
+endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
index 7b207fbd6db7c9d985ba3ed50d7fcd97612e07f5..4219d0f1c74bddff690b0d0cb21ce6a448c01c97 100644
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
@@ -1,86 +1,100 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module ramb18_wf_dualport
-#(
-    parameter ID = 0,
-	parameter DWIDTH = 18,
-	parameter AWIDTH = 10,
-	parameter MEM_INIT = ""
-)
-(
-	input clk,
-	
-	input wea,
-	input [AWIDTH-1:0] addra,
-	input [DWIDTH-1:0] wdataa,
-	output reg [DWIDTH-1:0] rdqa,
-
-	input web,
-	input [AWIDTH-1:0] addrb,
-	input [DWIDTH-1:0] wdatab,
-	output reg [DWIDTH-1:0] rdqb
-);
-
-(* ram_style = "block" *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
-reg [DWIDTH-1:0] rdataa;
-reg [DWIDTH-1:0] rdatab;
-
-reg [7:0] idx = ID;
-//initialize memory
-initial begin
-    //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
-	//ID can go up to 99
-	if (ID < 0 && ID > 99) begin
-	    $display("ID out of range [0-99]");
-	    $finish();
-    end
-	//MEM_INIT path must be terminated by /
-	if (ID < 10)
-		$readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, 1023);
-	else
-		$readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, 1023);
-end
-
-//memory ports, with output pipeline register
-always @(posedge clk) begin
-    if(wea)
-        mem[addra] <= wdataa;
-    rdataa <= mem[addra];
-    rdqa <= rdataa;
-end
-always @(posedge clk) begin
-    if(web)
-        mem[addrb] <= wdatab;
-    rdatab <= mem[addrb];
-    rdqb <= rdatab;
-end
-
-endmodule
\ No newline at end of file
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module ramb18_wf_dualport
+#(
+    parameter ID = 0,
+	parameter DWIDTH = 18,
+	parameter AWIDTH = 10,
+	parameter MEM_INIT = "",
+    parameter RAM_STYLE = "auto"
+)
+(
+	input clk,
+
+	input wea,
+	input [AWIDTH-1:0] addra,
+	input [DWIDTH-1:0] wdataa,
+	output reg [DWIDTH-1:0] rdqa,
+
+	input web,
+	input [AWIDTH-1:0] addrb,
+	input [DWIDTH-1:0] wdatab,
+	output reg [DWIDTH-1:0] rdqb
+);
+
+(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
+reg [DWIDTH-1:0] rdataa;
+reg [DWIDTH-1:0] rdatab;
+
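+//idx holds the block ID as ASCII digits for the init filename:
+//synthesis composes the digits arithmetically, simulation formats them with $sformat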
+`ifdef SYNTHESIS
+reg [7:0] idx = ID;
+`else
+reg [15:0] idx;
+`endif
+
+//initialize memory
+initial begin
+  //note the hacky way of appending the filename memblock_<ID>.dat to the path provided in MEM_INIT
+  //ID can go up to 99
+  if (ID < 0 || ID > 99) begin
+    $display("ID out of range [0-99]");
+    $finish();
+  end
+  //MEM_INIT path must be terminated by /
+  `ifdef SYNTHESIS
+  if (ID < 10)
+    $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, 1023);
+  else
+    $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, 1023);
+  `else
+  $sformat(idx,"%0d",ID);
+  if (ID < 10)
+    $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, 1023);
+  else
+    $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, 1023);
+  `endif
+end
+
+//memory ports, with output pipeline register
+always @(posedge clk) begin
+    if(wea)
+        mem[addra] <= wdataa;
+    rdataa <= mem[addra];
+    rdqa <= rdataa;
+end
+always @(posedge clk) begin
+    if(web)
+        mem[addrb] <= wdatab;
+    rdatab <= mem[addrb];
+    rdqb <= rdatab;
+end
+
+endmodule
diff --git a/run-docker.sh b/run-docker.sh
index e010733080b7cae205119e2bc136cff836f71fa5..018bd9aa8c39666a1b9c0ef7f426587f265769f7 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -44,10 +44,10 @@ DOCKER_PASSWD="finn"
 # generate a random number per-run to allow multiple
 # containers from the same user
 DOCKER_RND=$(shuf -i0-32768 -n1)
-DOCKER_TAG="finn_${DOCKER_UNAME}"
+DOCKER_TAG="finn_dev_${DOCKER_UNAME}"
 # uncomment to run multiple instances with different names
 # DOCKER_INST_NAME="finn_${DOCKER_UNAME}_${DOCKER_RND}"
-DOCKER_INST_NAME="finn_${DOCKER_UNAME}"
+DOCKER_INST_NAME="finn_dev_${DOCKER_UNAME}"
 # ensure Docker tag and inst. name are all lowercase
 DOCKER_TAG=$(echo "$DOCKER_TAG" | tr '[:upper:]' '[:lower:]')
 DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
@@ -59,50 +59,22 @@ DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
 : ${PYNQ_PASSWORD="xilinx"}
 : ${PYNQ_BOARD="Pynq-Z1"}
 : ${PYNQ_TARGET_DIR="/home/xilinx/$DOCKER_INST_NAME"}
+: ${NUM_DEFAULT_WORKERS=1}
 
 # Absolute path to this script, e.g. /home/user/bin/foo.sh
 SCRIPT=$(readlink -f "$0")
 # Absolute path this script is in, thus /home/user/bin
 SCRIPTPATH=$(dirname "$SCRIPT")
 
-BREVITAS_REPO=https://github.com/Xilinx/brevitas.git
-EXAMPLES_REPO=https://github.com/maltanar/brevitas_cnv_lfc.git
-CNPY_REPO=https://github.com/rogersce/cnpy.git
-#FINN_HLS_REPO=https://github.com/Xilinx/finn-hlslib.git
-FINN_HLS_REPO=https://github.com/Tobi-Alonso/finn-hlslib.git
-PYVERILATOR_REPO=https://github.com/maltanar/pyverilator
-PYNQSHELL_REPO=https://github.com/maltanar/PYNQ-HelloWorld.git
-
-BREVITAS_LOCAL=$SCRIPTPATH/brevitas
-EXAMPLES_LOCAL=$SCRIPTPATH/brevitas_cnv_lfc
-CNPY_LOCAL=$SCRIPTPATH/cnpy
-FINN_HLS_LOCAL=$SCRIPTPATH/finn-hlslib
-PYVERILATOR_LOCAL=$SCRIPTPATH/pyverilator
-PYNQSHELL_LOCAL=$SCRIPTPATH/PYNQ-HelloWorld
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
 VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
 
-# clone dependency repos
-git clone --branch feature/finn_onnx_export $BREVITAS_REPO $BREVITAS_LOCAL ||  git -C "$BREVITAS_LOCAL" pull
-git clone $EXAMPLES_REPO $EXAMPLES_LOCAL ||  git -C "$EXAMPLES_LOCAL" checkout feature/rework_scaling_clipping; git -C "$EXAMPLES_LOCAL" pull
-git clone $CNPY_REPO $CNPY_LOCAL ||  git -C "$CNPY_LOCAL" pull
-git clone $FINN_HLS_REPO $FINN_HLS_LOCAL ||  git -C "$FINN_HLS_LOCAL" checkout master; git -C "$FINN_HLS_LOCAL" pull
-git clone $PYVERILATOR_REPO $PYVERILATOR_LOCAL ||  git -C "$PYVERILATOR_LOCAL" pull
-git clone $PYNQSHELL_REPO $PYNQSHELL_LOCAL ||  git -C "$PYNQSHELL_LOCAL" pull
-
 # ensure build dir exists locally
 mkdir -p $BUILD_LOCAL
 mkdir -p $VIVADO_IP_CACHE
 
 echo "Instance is named as $DOCKER_INST_NAME"
-echo "Mounting $SCRIPTPATH into /workspace/finn"
-echo "Mounting $SCRIPTPATH/brevitas into /workspace/brevitas"
-echo "Mounting $SCRIPTPATH/brevitas_cnv_lfc into /workspace/brevitas_cnv_lfc"
-echo "Mounting $SCRIPTPATH/cnpy into /workspace/cnpy"
-echo "Mounting $SCRIPTPATH/finn-hlslib into /workspace/finn-hlslib"
-echo "Mounting $SCRIPTPATH/pyverilator into /workspace/pyverilator"
-echo "Mounting $SCRIPTPATH/PYNQ-HelloWorld into /workspace/PYNQ-HelloWorld"
 echo "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
 echo "Mounting $VIVADO_PATH into $VIVADO_PATH"
 echo "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
@@ -115,14 +87,14 @@ if [ "$1" = "test" ]; then
         DOCKER_CMD="python setup.py test"
 elif [ "$1" = "notebook" ]; then
         echo "Running Jupyter notebook server"
-        DOCKER_CMD="source ~/.bashrc; jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks"
+        DOCKER_CMD="jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks"
 else
         echo "Running container only"
         DOCKER_CMD="bash"
 fi
 
 # Build the FINN Docker image
-docker build --tag=$DOCKER_TAG \
+docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \
              --build-arg GID=$DOCKER_GID \
              --build-arg GNAME=$DOCKER_GNAME \
              --build-arg UNAME=$DOCKER_UNAME \
@@ -137,12 +109,6 @@ docker run -t --rm --name $DOCKER_INST_NAME -it \
 -e "XILINX_VIVADO=$VIVADO_PATH" \
 -e "SHELL=/bin/bash" \
 -v $SCRIPTPATH:/workspace/finn \
--v $SCRIPTPATH/brevitas:/workspace/brevitas \
--v $SCRIPTPATH/brevitas_cnv_lfc:/workspace/brevitas_cnv_lfc \
--v $SCRIPTPATH/cnpy:/workspace/cnpy \
--v $SCRIPTPATH/finn-hlslib:/workspace/finn-hlslib \
--v $SCRIPTPATH/pyverilator:/workspace/pyverilator \
--v $SCRIPTPATH/PYNQ-HelloWorld:/workspace/PYNQ-HelloWorld \
 -v $BUILD_LOCAL:$BUILD_LOCAL \
 -v $VIVADO_PATH:$VIVADO_PATH \
 -e VIVADO_PATH=$VIVADO_PATH \
@@ -154,6 +120,7 @@ docker run -t --rm --name $DOCKER_INST_NAME -it \
 -e PYNQ_USERNAME=$PYNQ_USERNAME \
 -e PYNQ_PASSWORD=$PYNQ_PASSWORD \
 -e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR \
+-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS \
 -p $JUPYTER_PORT:$JUPYTER_PORT \
 -p $NETRON_PORT:$NETRON_PORT \
-$DOCKER_TAG bash -c "$DOCKER_CMD"
+$DOCKER_TAG $DOCKER_CMD
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index 58fe8e2962d03ab0c47957f205a0c2f2b8fc51f5..7b4ca37cd78c6299fa824ecfc16d79ae013bab37 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -34,9 +34,9 @@ from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 def hls_synth_res_estimation(model):
-    """Extracts the results from the vivado synthesis.
+    """Extracts the FPGA resource results from the Vivado HLS synthesis estimates.
 
-    Returns {node name : resource estimation}."""
+    Returns {node name : resources_dict}."""
 
     res_dict = {}
     for node in model.graph.node:
@@ -55,18 +55,15 @@ def hls_synth_res_estimation(model):
                 )
 
                 if os.path.isfile(xmlfile):
-                    res_dict[node.name] = []
+                    res_dict[node.name] = dict()
                     tree = ET.parse(xmlfile)
                     root = tree.getroot()
                     for item in root.findall("AreaEstimates/Resources"):
                         for child in item:
-                            res_dict[node.name].append(
-                                ["{} : {}".format(child.tag, child.text)]
-                            )
+                            res_dict[node.name][child.tag] = child.text
                 else:
                     raise Exception(
                         """Please run "HLSSynth_IPGen" first
                             to generate the report files"""
                     )
-
     return res_dict
diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py
new file mode 100644
index 0000000000000000000000000000000000000000..508c34aaed50f2935f4915cdcea29a3e92641b3c
--- /dev/null
+++ b/src/finn/analysis/fpgadataflow/post_synth_res.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import xml.etree.ElementTree as ET
+
+from finn.transformation.move_reshape import _is_fpgadataflow_node
+
+
+def post_synth_res(model):
+    """Extracts the FPGA resource results from the Vivado synthesis.
+
+    Returns {node name : resources_dict}."""
+
+    res_dict = {}
+    synth_report_filename = model.get_metadata_prop("vivado_synth_rpt")
+    if os.path.isfile(synth_report_filename):
+        tree = ET.parse(synth_report_filename)
+        root = tree.getroot()
+        all_cells = root.findall(".//tablecell")
+        # strip all whitespace from table cell contents
+        for cell in all_cells:
+            cell.attrib["contents"] = cell.attrib["contents"].strip()
+    else:
+        raise Exception("Please run synthesis first")
+
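+    # look up each fpgadataflow node by name in the utilization report table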
+    for node in model.graph.node:
+        if _is_fpgadataflow_node(node):
+            row = root.findall(".//*[@contents='%s']/.." % node.name)
+            if row != []:
+                node_dict = {}
+                row = row[0].getchildren()
+                """ Expected XML structure:
+<tablerow class="" suppressoutput="0" wordwrap="0">
+    <tableheader class="" contents="Instance" halign="3" width="-1"/>
+    <tableheader class="" contents="Module" halign="3" width="-1"/>
+    <tableheader class="" contents="Total LUTs" halign="3" width="-1"/>
+    <tableheader class="" contents="Logic LUTs" halign="3" width="-1"/>
+    <tableheader class="" contents="LUTRAMs" halign="3" width="-1"/>
+    <tableheader class="" contents="SRLs" halign="3" width="-1"/>
+    <tableheader class="" contents="FFs" halign="3" width="-1"/>
+    <tableheader class="" contents="RAMB36" halign="3" width="-1"/>
+    <tableheader class="" contents="RAMB18" halign="3" width="-1"/>
+    <tableheader class="" contents="DSP48 Blocks" halign="3" width="-1"/>
+</tablerow>
+                """
+                node_dict["LUT"] = int(row[2].attrib["contents"])
+                node_dict["SRL"] = int(row[5].attrib["contents"])
+                node_dict["FF"] = int(row[6].attrib["contents"])
+                node_dict["BRAM_36K"] = int(row[7].attrib["contents"])
+                node_dict["BRAM_18K"] = int(row[8].attrib["contents"])
+                node_dict["DSP48"] = int(row[9].attrib["contents"])
+                res_dict[node.name] = node_dict
+
+    return res_dict
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index e78f07b9f1097ee6e1042846a91c2a0ff80d12d0..eff9cea291b106d69e99055d5b6e2af448fb7517 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -62,9 +62,15 @@ def remote_exec(model, execution_context):
     process_compile.communicate()
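+    # run the accelerator driver remotely over ssh, authenticating with the configured PYNQ password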
 
     cmd = (
-        "sshpass -p {} ssh {}@{} "
-        '"cd {}/{}; echo "xilinx" | sudo -S python3.6 driver.py"'
-    ).format(pynq_password, pynq_username, pynq_ip, pynq_target_dir, deployment_folder)
+        "sshpass -p {} ssh {}@{} " '"cd {}/{}; echo "{}" | sudo -S python3.6 driver.py"'
+    ).format(
+        pynq_password,
+        pynq_username,
+        pynq_ip,
+        pynq_target_dir,
+        deployment_folder,
+        pynq_password,
+    )
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 0841fedebcd473a488b2e62db4dc763f283789e1..af84a75e299d666c059df54211be42b691f5ccf2 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -35,11 +35,18 @@ from finn.util.fpgadataflow import (
     pyverilate_stitched_ip,
 )
 
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
 
 def rtlsim_exec(model, execution_context):
     """Use PyVerilator to execute given model with stitched IP. The execution
     context contains the input values."""
 
+    if PyVerilator is None:
+        raise ImportError("Installation of PyVerilator is required.")
     # ensure stitched ip project already exists
     assert os.path.isfile(
         model.get_metadata_prop("wrapper_filename")
@@ -74,7 +81,12 @@ def rtlsim_exec(model, execution_context):
     packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
     num_out_values = last_node.get_number_output_values()
     # prepare pyverilator model
-    sim = pyverilate_stitched_ip(model)
+    rtlsim_so = model.get_metadata_prop("rtlsim_so")
+    if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
+        sim = pyverilate_stitched_ip(model)
+        model.set_metadata_prop("rtlsim_so", sim.lib._name)
+    else:
+        sim = PyVerilator(rtlsim_so)
     _reset_rtlsim(sim)
     _toggle_clk(sim)
     ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 10f8b7feedf7584afb66a7fad8f1ee20745bf67d..2500b1f03b917225d92b00de033299f20e3d9f5d 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -31,13 +31,18 @@ import numpy as np
 import os
 import subprocess
 from finn.custom_op import CustomOp
-from finn.util.basic import CppBuilder
+from finn.util.basic import CppBuilder, make_build_dir
 from finn.util.fpgadataflow import (
     IPGenBuilder,
     pyverilate_get_liveness_threshold_cycles,
 )
 from . import templates
 
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
 
 class HLSCustomOp(CustomOp):
     """HLSCustomOp class all custom ops that correspond to a finn-hlslib
@@ -73,15 +78,75 @@ class HLSCustomOp(CustomOp):
             "exec_mode": ("s", False, ""),
             "sim_cycles": ("i", False, 0),
             "rtlsim_trace": ("s", False, ""),
+            "res_estimate": ("s", False, ""),
+            "res_hls": ("s", False, ""),
+            "res_synth": ("s", False, ""),
+            "rtlsim_so": ("s", False, ""),
         }
 
+    def get_verilog_top_module_name(self):
+        "Return the Verilog top module name for this node."
+
+        node = self.onnx_node
+        prefixed_top_name = "%s_%s" % (node.name, node.name)
+        return prefixed_top_name
+
+    def get_verilog_top_filename(self):
+        "Return the Verilog top module filename for this node."
+
+        verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
+            self.get_nodeattr("code_gen_dir_ipgen"),
+            self.onnx_node.name,
+            self.get_verilog_top_module_name(),
+        )
+        return verilog_file
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+        # ensure that code is generated
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        assert (
+            code_gen_dir != ""
+        ), """Node attribute "code_gen_dir_ipgen" is
+        not set. Please run HLSSynth_IPGen first."""
+        verilog_file = self.get_verilog_top_filename()
+        assert os.path.isfile(verilog_file), "Cannot find top-level Verilog file."
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_file,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=[
+                "{}/project_{}/sol1/impl/verilog/".format(
+                    code_gen_dir, self.onnx_node.name
+                )
+            ],
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def get_rtlsim(self):
+        """Return a PyVerilator wrapper for the Verilator emulation library
+        for this node."""
+
+        rtlsim_so = self.get_nodeattr("rtlsim_so")
+        assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library."
+        # create PyVerilator wrapper
+        sim = PyVerilator(rtlsim_so)
+        return sim
+
     def node_res_estimation(self):
         """Returns summarized resource estimation of BRAMs and LUTs
-        of the node."""
-        resources = []
-        resources.append("BRAMs: " + str(self.bram_estimation()))
-        resources.append("LUTs: " + str(self.lut_estimation()))
-        return resources
+        of the node as a dictionary."""
+        ret = dict()
+        ret["BRAM_18K"] = self.bram_estimation()
+        ret["LUT"] = self.lut_estimation()
+        return ret
 
     def bram_estimation(self):
         """Function for BRAM resource estimation, is member function of
@@ -99,6 +164,7 @@ class HLSCustomOp(CustomOp):
 
         # generate top cpp file for ip generation
         path = self.get_nodeattr("code_gen_dir_ipgen")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
         self.generate_params(model, path)
         self.global_includes()
         self.defines("ipgen")
@@ -156,11 +222,13 @@ class HLSCustomOp(CustomOp):
         """Generates c++ code for simulation (npysim)."""
         node = self.onnx_node
         path = self.get_nodeattr("code_gen_dir_npysim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
         self.generate_params(model, path)
         self.global_includes()
         self.defines("npysim")
         self.read_npy_data()
         self.strm_decl()
+        self.pragmas()
         self.docompute()
         self.dataoutstrm()
         self.save_as_npy()
@@ -429,3 +497,8 @@ compilation transformations?
     def get_outstream_width(self):
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
+
+    def get_ap_int_max_w(self):
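+        """Returns the maximum stream width for this node; used to set
+        AP_INT_MAX_W during code generation."""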
+        instream = self.get_instream_width()
+        outstream = self.get_outstream_width()
+        return max([instream, outstream])
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index e2a2f90b85a790a6d4fc7053d0e742329a7a1012..e05b2dcea7e17231617f9d3880b778d1978b4ead 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -29,7 +29,6 @@
 import os
 
 import numpy as np
-from pyverilator import PyVerilator
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
@@ -73,6 +72,10 @@ class ConvolutionInputGenerator(HLSCustomOp):
         ishape = (1, ifm_dim, ifm_dim, ifm_ch)
         return ishape
 
+    def get_folded_input_shape(self):
+        """Assumption: No folding on input"""
+        return self.get_normal_input_shape()
+
     def get_normal_output_shape(self):
         k = self.get_nodeattr("ConvKernelDim")
         ifm_dim = self.get_nodeattr("IFMDim")
@@ -124,12 +127,6 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def bram_estimation(self):
-        pass
-
-    def lut_estimation(self):
-        pass
-
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
@@ -206,49 +203,26 @@ class ConvolutionInputGenerator(HLSCustomOp):
             did not produce expected folded output shape"
             context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
         elif mode == "rtlsim":
-            prefixed_top_name = "%s_%s" % (node.name, node.name)
-            # check if needed file exists
-            verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
-                code_gen_dir, node.name, prefixed_top_name
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
             )
-            if os.path.isfile(verilog_file):
-                nbits = self.get_instream_width()
-                rtlsim_inp = npy_to_rtlsim_input(
-                    "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-                )
-                sim = PyVerilator.build(
-                    verilog_file,
-                    verilog_path=[
-                        "{}/project_{}/sol1/impl/verilog/".format(
-                            code_gen_dir, node.name
-                        )
-                    ],
-                )
-                super().reset_rtlsim(sim)
-                super().toggle_clk(sim)
-                rtlsim_output = self.rtlsim(sim, rtlsim_inp)
-                odt = export_idt
-                target_bits = odt.bitwidth()
-                packed_bits = self.get_outstream_width()
-                out_npy_path = "{}/output.npy".format(code_gen_dir)
-                out_shape = self.get_folded_output_shape()
-                rtlsim_output_to_npy(
-                    rtlsim_output,
-                    out_npy_path,
-                    odt,
-                    out_shape,
-                    packed_bits,
-                    target_bits,
-                )
-                # load and reshape output
-                output = np.load(out_npy_path)
-                output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
-                context[node.output[0]] = output
-            else:
-                raise Exception(
-                    """Found no verilog files for this node,
-                    did you run the codegen_ipgen transformation?"""
-                )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
@@ -272,9 +246,10 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def defines(self, var):
         numReps = 1
         self.code_gen_dict["$DEFINES$"] = [
-            """#define ConvKernelDim1 {}\n #define IFMChannels1 {}
-            #define Input_precision1 {}\n #define IFMDim1 {}\n #define OFMDim1 {}
-            #define SIMD1 {}\n #define Stride1 {}\n #define numReps {}""".format(
+            """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n
+            #define Input_precision1 {}\n #define IFMDim1 {}\n
+            #define OFMDim1 {}\n #define SIMD1 {}\n
+            #define Stride1 {}\n #define numReps {}""".format(
                 self.get_nodeattr("ConvKernelDim"),
                 self.get_nodeattr("IFMChannels"),
                 self.get_input_datatype().bitwidth(),
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index ce135e91088d2bfabe0259e1cc6873bb54884198..6a4070528ee50d97e62881d00b57355d2a2baf2d 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -28,7 +28,7 @@
 
 import os
 import numpy as np
-from pyverilator import PyVerilator
+
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
@@ -206,12 +206,6 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
 
         return info_messages
 
-    def bram_estimation(self):
-        pass
-
-    def lut_estimation(self):
-        pass
-
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
 
@@ -353,49 +347,26 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             context[node.output[0]] = output
 
         elif mode == "rtlsim":
-            prefixed_top_name = "%s_%s" % (node.name, node.name)
-            # check if needed file exists
-            verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
-                code_gen_dir, node.name, prefixed_top_name
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
             )
-            if os.path.isfile(verilog_file):
-                nbits = self.get_instream_width()
-                rtlsim_inp = npy_to_rtlsim_input(
-                    "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-                )
-                sim = PyVerilator.build(
-                    verilog_file,
-                    verilog_path=[
-                        "{}/project_{}/sol1/impl/verilog/".format(
-                            code_gen_dir, node.name
-                        )
-                    ],
-                )
-                super().reset_rtlsim(sim)
-                super().toggle_clk(sim)
-                rtlsim_output = self.rtlsim(sim, rtlsim_inp)
-                odt = export_idt
-                target_bits = odt.bitwidth()
-                packed_bits = self.get_outstream_width()
-                out_npy_path = "{}/output.npy".format(code_gen_dir)
-                out_shape = self.get_folded_output_shape()
-                rtlsim_output_to_npy(
-                    rtlsim_output,
-                    out_npy_path,
-                    odt,
-                    out_shape,
-                    packed_bits,
-                    target_bits,
-                )
-                # load and reshape output
-                output = np.load(out_npy_path)
-                output = np.asarray([output], dtype=np.float32).reshape(exp_shape)
-                context[node.output[0]] = output
-            else:
-                raise Exception(
-                    """Found no verilog files for this node,
-                    did you run the codegen_ipgen transformation?"""
-                )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(exp_shape)
+            context[node.output[0]] = output
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 00b8287a312fc82425b508ffef66f5187d074617..eab3decc696cb86622bbdd8f22f015515ea936d5 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -32,11 +32,14 @@ import subprocess
 from shutil import copy
 
 import numpy as np
-from pyverilator import PyVerilator
+
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
@@ -89,10 +92,33 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # const -- embedded weights, default, long compile/synth times
             # decoupled -- streaming weights
             "mem_mode": ("s", False, "const"),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": ("s", False, "auto"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def get_verilog_top_module_name(self):
+        "Return the Verilog top module name for this node."
+
+        node = self.onnx_node
+        # set top name depending on mem_mode
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            prefixed_top_name = "%s_%s" % (node.name, node.name)
+        elif mem_mode == "decoupled":
+            prefixed_top_name = "%s_memstream" % (node.name)
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled"; no other
+                value is currently supported!"""
+            )
+        return prefixed_top_name
+
     def calc_wmem(self):
         """Calculates and returns WMEM."""
         mw = self.get_nodeattr("MW")
@@ -270,6 +296,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         wp = self.get_weight_datatype().bitwidth()
         return pe * simd * wp
 
+    def get_ap_int_max_w(self):
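+        # the weight stream can be wider than the data streams, so include it in the maximum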
+        temp_value = super().get_ap_int_max_w()
+        weightstream = self.get_weightstream_width()
+        return max([weightstream, temp_value])
+
     def get_folded_input_shape(self):
         mw = self.get_nodeattr("MW")
         simd = self.get_nodeattr("SIMD")
@@ -454,7 +485,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
             if export_wdt.bitwidth() != 1:
                 f_weights.write(
-                    "static FixedPointWeights<{},{},{},{}> weights = ".format(
+                    "const FixedPointWeights<{},{},{},{}> weights = ".format(
                         self.get_nodeattr("SIMD"),
                         export_wdt.get_hls_datatype_str(),
                         self.get_nodeattr("PE"),
@@ -463,7 +494,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 )
             else:
                 f_weights.write(
-                    "static BinaryWeights<{},{},{}> weights = ".format(
+                    "const BinaryWeights<{},{},{}> weights = ".format(
                         self.get_nodeattr("SIMD"),
                         self.get_nodeattr("PE"),
                         self.calc_wmem(),
@@ -500,24 +531,29 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             """Saves weights into .dat file"""
             # convert weight values into hexstring
             weight_width = self.get_weightstream_width()
+            # pad to nearest 4 bits to get hex strings
+            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
             weight_tensor_unflipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_unflipped, export_wdt, weight_width, prefix=""
+                weight_tensor_unflipped, export_wdt, weight_width_padded, prefix=""
             )
             weight_stream_len = np.prod(weight_tensor_unflipped.shape)
-            assert (
-                weight_stream_len <= 1024
-            ), """Decoupled mem mode needs
-            weight stream length <= 1024 for now"""
+            factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
             weight_stream = weight_tensor_unflipped.flatten()
-            pad_amt = 1024 - weight_stream_len
+            pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
             )
             weight_stream = weight_stream.copy()
-            with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f:
-                for val in weight_stream:
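+            # split the padded stream into 1024-entry memblock_<j>.dat files, one per memory block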
+            i = 0
+            j = 0
+            for val in weight_stream:
+                if i == 1024:
+                    i = 0
+                    j += 1
+                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
                     f.write(val + "\n")
+                i += 1
 
         else:
             raise Exception(
@@ -631,58 +667,28 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             oshape = self.get_normal_output_shape()
             context[node.output[0]] = context[node.output[0]].reshape(*oshape)
         elif mode == "rtlsim":
-            # set top name depending on mem_mode
-            mem_mode = self.get_nodeattr("mem_mode")
-            if mem_mode == "const":
-                prefixed_top_name = "%s_%s" % (node.name, node.name)
-            elif mem_mode == "decoupled":
-                prefixed_top_name = "%s_memstream" % (node.name)
-            else:
-                raise Exception(
-                    """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
-                )
-            # check if needed file exists
-            verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
-                code_gen_dir, node.name, prefixed_top_name
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
             )
-            if os.path.isfile(verilog_file):
-                nbits = self.get_instream_width()
-                inp = npy_to_rtlsim_input(
-                    "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-                )
-                sim = PyVerilator.build(
-                    verilog_file,
-                    verilog_path=[
-                        "{}/project_{}/sol1/impl/verilog/".format(
-                            code_gen_dir, node.name
-                        )
-                    ],
-                )
-                super().reset_rtlsim(sim)
-                super().toggle_clk(sim)
-                output = self.rtlsim(sim, inp)
-                odt = self.get_output_datatype()
-                target_bits = odt.bitwidth()
-                packed_bits = self.get_outstream_width()
-                out_npy_path = "{}/output.npy".format(code_gen_dir)
-                out_shape = self.get_folded_output_shape()
-                rtlsim_output_to_npy(
-                    output, out_npy_path, odt, out_shape, packed_bits, target_bits
-                )
-
-                # load and reshape output
-                output = np.load(out_npy_path)
-                oshape = self.get_normal_output_shape()
-                output = np.asarray([output], dtype=np.float32).reshape(*oshape)
-                context[node.output[0]] = output
-
-            else:
-                raise Exception(
-                    """Found no verilog files for this node,
-                    did you run the codegen_ipgen transformation?"""
-                )
 
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
@@ -697,7 +703,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "const":
-            self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+            # params.h is included via the $PRAGMAS$ section for mem_mode=const
+            pass
         elif mem_mode == "decoupled":
             self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
         else:
@@ -714,9 +721,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         numInputVectors = list(self.get_nodeattr("numInputVectors"))
         numReps = np.prod(numInputVectors)
         self.code_gen_dict["$DEFINES$"] = [
-            """#define MW1 {}\n #define MH1 {}\n #define SIMD1 {}\n
-            #define PE1 {}\n #define WMEM1 {}\n #define TMEM1 {}\n
-            #define numReps {}""".format(
+            """#define MW1 {}\n #define MH1 {}\n
+            #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n
+            #define TMEM1 {}\n #define numReps {}""".format(
                 self.get_nodeattr("MW"),
                 self.get_nodeattr("MH"),
                 self.get_nodeattr("SIMD"),
@@ -732,10 +739,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 "#define WP1 {}\n".format(wdt.bitwidth())
             )
 
-        if var == "ipgen":
-            self.code_gen_dict["$DEFINES$"].append("#define PRAGMA_SUB(x) _Pragma (#x)")
-            self.code_gen_dict["$DEFINES$"].append("#define DO_PRAGMA(x) PRAGMA_SUB(x)")
-
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
         dtype = self.get_input_datatype()
@@ -916,12 +919,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         )
 
         if mem_mode == "const":
+            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
             # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
             # partition for parallel access along the PE dimension (dim 1)
             self.code_gen_dict["$PRAGMAS$"].append(
                 (
-                    "DO_PRAGMA(HLS ARRAY_PARTITION "
-                    "variable=weights.m_weights complete dim=1)"
+                    "#pragma HLS ARRAY_PARTITION variable=weights.m_weights "
+                    "complete dim=1"
                 )
             )
         elif mem_mode == "decoupled":
@@ -945,14 +949,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # TODO find a better way of checking for no pregenerated thresholds
             self.code_gen_dict["$PRAGMAS$"].append(
                 (
-                    "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                    "complete dim=1)"
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=1"
                 )
             )
             self.code_gen_dict["$PRAGMAS$"].append(
                 (
-                    "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                    "complete dim=3)"
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=3"
                 )
             )
 
@@ -971,23 +975,21 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$LAYER_NAME$"] = [
                 "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
             ]
-            # make instream width a multiple of 8 for axi interface
-            in_width = self.get_instream_width()
-            if in_width % 8 != 0:
-                in_width = math.floor(in_width / 8) + 8
+            # make instream width a multiple of 8 for AXI stream interface
+            in_width = roundup_to_integer_multiple(self.get_instream_width(), 8)
             self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
             self.code_gen_dict["$OUT_RANGE$"] = [
                 "[{}:0]".format(self.get_outstream_width() - 1)
             ]
-            # make weight stream width a multiple of 8 for axi interface
-            weight_width = self.get_weightstream_width()
-            if weight_width % 8 != 0:
-                weight_width = math.floor(weight_width / 8) + 8
+            # make weight stream width a multiple of 8 for AXI stream interface
+            weight_width = roundup_to_integer_multiple(self.get_weightstream_width(), 8)
             self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
             self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
-            mw = self.get_nodeattr("MW")
-            mh = self.get_nodeattr("MH")
-            self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(int(mw * mh))]
+            self.code_gen_dict["$WSTREAM_DEPTH$"] = [str(self.calc_wmem())]
+            self.code_gen_dict["$MEM_DEPTH$"] = [
+                str(roundup_to_integer_multiple(self.calc_wmem(), 1024))
+            ]
+            self.code_gen_dict["$RAM_STYLE$"] = [self.get_nodeattr("ram_style")]
 
             template = self.decoupled_wrapper
 
@@ -1024,9 +1026,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 if file.endswith(".v"):
                     verilog_file = os.path.join(memstream_dir, file)
                     copy(verilog_file, verilog_folder)
-            # copy .dat file of weights
-            dat_file = "{}/memblock_0.dat".format(code_gen_dir)
-            copy(dat_file, verilog_folder)
+            # copy .dat files of weights
+            for file in os.listdir(code_gen_dir):
+                if file.endswith(".dat"):
+                    dat_file = os.path.join(code_gen_dir, file)
+                    copy(dat_file, verilog_folder)
             # copy verilog wrapper
             verilog_wrapper = "{}/{}_memstream.v".format(
                 code_gen_dir, self.onnx_node.name
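Side note on the stream-width rounding above: the removed expression math.floor(in_width / 8) + 8 did not actually produce a multiple of 8 (for in_width = 12 it yields 9), so switching to roundup_to_integer_multiple is a correctness fix rather than a cleanup. Judging from its usage here, the helper behaves like this minimal sketch (an assumption, not the verbatim finn.util.basic implementation):

import math

def roundup_to_integer_multiple(x, factor):
    """Round x up to the nearest integer multiple of factor."""
    return int(math.ceil(x / factor)) * factor

assert roundup_to_integer_multiple(12, 8) == 16
assert roundup_to_integer_multiple(16, 8) == 16  # already aligned: unchanged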
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 804da50f5a2c2de7c920975de4e082851a627c4e..f370d417aa0ac1ce5d62af878575332941e2c1d0 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -28,7 +28,7 @@
 
 import os
 import numpy as np
-from pyverilator import PyVerilator
+
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.custom_op.im2col import compute_conv_output_dim
 from finn.core.datatype import DataType
@@ -64,6 +64,9 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         ishape = (1, ifm_dim, ifm_dim, ifm_ch)
         return ishape
 
+    def get_folded_input_shape(self):
+        return self.get_normal_input_shape()
+
     def get_normal_output_shape(self):
         k = self.get_nodeattr("PoolDim")
         ifm_dim = self.get_nodeattr("ImgDim")
@@ -143,12 +146,6 @@ class StreamingMaxPool_Batch(HLSCustomOp):
 
         return info_messages
 
-    def bram_estimation(self):
-        pass
-
-    def lut_estimation(self):
-        pass
-
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
 
@@ -301,49 +298,26 @@ class StreamingMaxPool_Batch(HLSCustomOp):
            did not produce expected folded output shape"
             context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
         elif mode == "rtlsim":
-            prefixed_top_name = "%s_%s" % (node.name, node.name)
-            # check if needed file exists
-            verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
-                code_gen_dir, node.name, prefixed_top_name
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
             )
-            if os.path.isfile(verilog_file):
-                nbits = self.get_instream_width()
-                rtlsim_inp = npy_to_rtlsim_input(
-                    "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-                )
-                sim = PyVerilator.build(
-                    verilog_file,
-                    verilog_path=[
-                        "{}/project_{}/sol1/impl/verilog/".format(
-                            code_gen_dir, node.name
-                        )
-                    ],
-                )
-                super().reset_rtlsim(sim)
-                super().toggle_clk(sim)
-                rtlsim_output = self.rtlsim(sim, rtlsim_inp)
-                odt = export_idt
-                target_bits = odt.bitwidth()
-                packed_bits = self.get_outstream_width()
-                out_npy_path = "{}/output.npy".format(code_gen_dir)
-                out_shape = self.get_folded_output_shape()
-                rtlsim_output_to_npy(
-                    rtlsim_output,
-                    out_npy_path,
-                    odt,
-                    out_shape,
-                    packed_bits,
-                    target_bits,
-                )
-                # load and reshape output
-                output = np.load(out_npy_path)
-                output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
-                context[node.output[0]] = output
-            else:
-                raise Exception(
-                    """Found no verilog files for this node,
-                    did you run the codegen_ipgen transformation?"""
-                )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 90a54b019b090ea47e77c8efa841c86a1802edb5..bfa90ebeda06e55ffaa9b8ea5b40369ed246ba86 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -29,11 +29,12 @@
 
 # template for single node execution
 docompute_template = """
-#define AP_INT_MAX_W 16384
+#define AP_INT_MAX_W $AP_INT_MAX_W$
 #include "cnpy.h"
 #include "npy2apintstream.hpp"
 #include <vector>
 #include "bnn-library.h"
+
 // includes for network parameters
 $GLOBALS$
 
@@ -41,6 +42,7 @@ $GLOBALS$
 $DEFINES$
 
 int main(){
+$PRAGMAS$
 
 $STREAMDECLARATIONS$
 
@@ -60,8 +62,10 @@ $SAVEASCNPY$
 
 # cpp file
 ipgen_template = """
-#define AP_INT_MAX_W 4096
+#define AP_INT_MAX_W $AP_INT_MAX_W$
+
 #include "bnn-library.h"
+
 // includes for network parameters
 $GLOBALS$
 
@@ -133,16 +137,18 @@ reg [31:0] config_d0 = 0;
 wire [31:0] config_q0;
 
 //multiple wire AXI Streams
-reg m_axis_0_afull = 0;
-reg m_axis_0_tready;
+wire m_axis_0_afull;
+// FIFO count to generate programmable full
+wire [5:0] fifo_0_count;
+wire m_axis_0_tready;
 wire m_axis_0_tvalid;
 wire $WEIGHT_RANGE$ m_axis_0_tdata;
 
-reg m_axis_0_tready_q;
+wire m_axis_0_tready_q;
 wire m_axis_0_tvalid_q;
 wire $WEIGHT_RANGE$ m_axis_0_tdata_q;
 
-reg m_axis_0_tready_q2;
+wire m_axis_0_tready_q2;
 wire m_axis_0_tvalid_q2;
 wire $WEIGHT_RANGE$ m_axis_0_tdata_q2;
 
@@ -179,9 +185,10 @@ memstream
 // memory, set per-stream offsets in memory, set per-stream widths
 .CONFIG_EN(1),
 .NSTREAMS(1),
-.MEM_DEPTH(1024),
+.MEM_DEPTH($MEM_DEPTH$),
 .MEM_WIDTH($WEIGHT_WIDTH$),
 .MEM_INIT("./"),
+.RAM_STYLE("$RAM_STYLE$"),
 
 //widths per stream
 .STRM0_WIDTH($WEIGHT_WIDTH$),
@@ -192,7 +199,7 @@ memstream
 .STRM5_WIDTH($WEIGHT_WIDTH$),
 
 //depths per stream
-.STRM0_DEPTH($WEIGHT_DEPTH$),
+.STRM0_DEPTH($WSTREAM_DEPTH$),
 .STRM1_DEPTH(1),
 .STRM2_DEPTH(1),
 .STRM3_DEPTH(1),
@@ -253,12 +260,9 @@ mem
 
 );
 
-// two consecutive weight streamer FIFOs to provide the same functionality
-// as "programmable full"
 
-// weight streamer FIFO 1
 Q_srl #(
-.depth(16),
+.depth(32),
 .width($WEIGHT_WIDTH$)
 )
 $LAYER_NAME$_w_fifo_1
@@ -270,25 +274,10 @@ $LAYER_NAME$_w_fifo_1
  .i_r(m_axis_0_tready),
  .o_d(m_axis_0_tdata_q),
  .o_v(m_axis_0_tvalid_q),
- .o_r(m_axis_0_tready_q)
+ .o_r(m_axis_0_tready_q),
+ .count(fifo_0_count)
 );
 
-// weight streamer FIFO 2
-Q_srl #(
-.depth(16),
-.width($WEIGHT_WIDTH$)
-)
-$LAYER_NAME$_w_fifo_2
-(
- .clock(ap_clk),
- .reset(!ap_rst_n),
- .i_d(m_axis_0_tdata_q),
- .i_v(m_axis_0_tvalid_q),
- .i_r(m_axis_0_tready_q),
- .o_d(m_axis_0_tdata_q2),
- .o_v(m_axis_0_tvalid_q2),
- .o_r(m_axis_0_tready_q2)
-);
 
 //MVA_Stream_Unit
 
@@ -300,14 +289,16 @@ MVA_Stream_U
 .in0_V_V_TDATA(in0_V_V_TDATA),		//$IN_RANGE$ input
 .in0_V_V_TVALID(in0_V_V_TVALID),  	//input
 .in0_V_V_TREADY(in0_V_V_TREADY),	//output
-.weights_V_V_TDATA(m_axis_0_tdata_q2),	//$WEIGHT_RANGE$ input
-.weights_V_V_TVALID(m_axis_0_tvalid_q2),	//input
-.weights_V_V_TREADY(m_axis_0_tready_q2),	//output
+.weights_V_V_TDATA(m_axis_0_tdata_q),	//$WEIGHT_RANGE$ input
+.weights_V_V_TVALID(m_axis_0_tvalid_q),	//input
+.weights_V_V_TREADY(m_axis_0_tready_q),	//output
 .out_V_V_TDATA(out_V_V_TDATA),		//$OUT_RANGE$ output
 .out_V_V_TVALID(out_V_V_TVALID),	//output
 .out_V_V_TREADY(out_V_V_TREADY)		//input
 );
 
+// programmable full threshold at 16 elements
+assign m_axis_0_afull = (fifo_0_count > 16);
 
 endmodule
 """
diff --git a/src/finn/transformation/__init__.py b/src/finn/transformation/__init__.py
index a4e0bcf330a8ad1797eb76e61ba63511eb903dcf..e9f5fe15f6bdefe1e739394495f67a972ccff669 100644
--- a/src/finn/transformation/__init__.py
+++ b/src/finn/transformation/__init__.py
@@ -48,6 +48,8 @@ Guide to writing FINN transformations
 """
 
 from abc import ABC, abstractmethod
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
 
 
 class Transformation(ABC):
@@ -60,3 +62,54 @@ class Transformation(ABC):
     @abstractmethod
     def apply(self, model):
         pass
+
+
+class NodeLocalTransformation(Transformation):
+    """
+    Parent class for transformations that can be applied locally to one node
+    by accessing and modifying only that node's attributes. Because each node
+    is handled independently, such transformations can be parallelized
+    automatically. Transformations subclassing NodeLocalTransformation must
+    implement the abstract method applyNodeLocal().
+
+    To control the degree of parallelization, specify the num_workers argument
+    in the constructor, using one of the following values:
+    * None: use NUM_DEFAULT_WORKERS environment variable
+    * 0: use all available CPU cores
+    * (any other int>0): set number of parallel workers
+    """
+
+    def __init__(self, num_workers=None):
+        super().__init__()
+        if num_workers is None:
+            self._num_workers = get_num_default_workers()
+        else:
+            self._num_workers = num_workers
+        assert self._num_workers >= 0, "Number of workers must be nonnegative."
+        if self._num_workers == 0:
+            self._num_workers = mp.cpu_count()
+
+    @abstractmethod
+    def applyNodeLocal(self, node):
+        pass
+
+    def apply(self, model):
+        # Remove old nodes from the current model
+        old_nodes = []
+        for i in range(len(model.graph.node)):
+            old_nodes.append(model.graph.node.pop())
+
+        # Execute transformation in parallel
+        with mp.Pool(self._num_workers) as p:
+            new_nodes_and_bool = p.map(self.applyNodeLocal, old_nodes, chunksize=1)
+
+        # extract nodes and check if the transformation needs to run again
+        # Note: .pop() had initially reversed the node order
+        run_again = False
+        for node, run in reversed(new_nodes_and_bool):
+            # Reattach new nodes to old model
+            model.graph.node.append(node)
+            if run is True:
+                run_again = True
+
+        return (model, run_again)
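To make the NodeLocalTransformation contract concrete, here is a minimal hypothetical subclass (not part of this patch): applyNodeLocal receives a single node and must return a (node, run_again) tuple, since apply gathers exactly those tuples from the worker pool. Nodes are pickled to the worker processes, so applyNodeLocal should only touch the node it is given.

from finn.transformation import NodeLocalTransformation

class TagNodes(NodeLocalTransformation):
    """Toy example: annotate every node's doc_string, in parallel."""

    def applyNodeLocal(self, node):
        node.doc_string = "visited by TagNodes"
        # False: this transformation never needs to be re-applied
        return (node, False)

# usage, where num_workers=0 means "use all available CPU cores":
# model = model.transform(TagNodes(num_workers=0))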
diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py
new file mode 100644
index 0000000000000000000000000000000000000000..d192372a7d9c1f6ee2f088c6a058b994d21f6c99
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/annotate_resources.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.custom_op.registry as registry
+from finn.transformation import Transformation
+from finn.transformation.move_reshape import _is_fpgadataflow_node
+from finn.analysis.fpgadataflow.res_estimation import res_estimation
+from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
+from finn.analysis.fpgadataflow.post_synth_res import post_synth_res
+
+
+class AnnotateResources(Transformation):
+    """Annotate the amount of FPGA resources taken by each fpgadataflow
+    node as an attribute on the node, depending on the mode parameter:
+    * 'estimate' -- use the analytical estimation model
+    * 'hls' -- use results from the HLS synthesis report
+    * 'synth' -- use results from the post-synthesis utilization report
+
+    No annotations can be provided unless the relevant transformation for the
+    chosen mode (e.g. HLSSynth_IPGen for 'hls') was previously run.
+    """
+
+    def __init__(self, mode):
+        super().__init__()
+        self.mode = mode
+
+    def apply(self, model):
+        graph = model.graph
+        if self.mode == "estimate":
+            res_fxn = res_estimation
+        elif self.mode == "hls":
+            res_fxn = hls_synth_res_estimation
+        elif self.mode == "synth":
+            res_fxn = post_synth_res
+        else:
+            raise Exception("Unrecognized mode for AnnotateResources")
+        res_dict = model.analysis(res_fxn)
+        total_dict = {}
+        for lname in res_dict.keys():
+            layer_res_dict = res_dict[lname]
+            for r_type in layer_res_dict.keys():
+                r_amount = layer_res_dict[r_type]
+                r_amount = float(r_amount)
+                if r_type in total_dict.keys():
+                    total_dict[r_type] += r_amount
+                else:
+                    total_dict[r_type] = r_amount
+        model.set_metadata_prop("res_total_" + self.mode, str(total_dict))
+        for node in graph.node:
+            if _is_fpgadataflow_node(node) and node.name in res_dict.keys():
+                op_inst = registry.getCustomOp(node)
+                op_inst.set_nodeattr("res_" + self.mode, str(res_dict[node.name]))
+
+        return (model, False)
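Typical usage once the relevant flow step has run (a sketch; the metadata and attribute names follow the code above, and the loop assumes the graph contains only fpgadataflow nodes):

from finn.transformation.fpgadataflow.annotate_resources import (
    AnnotateResources,
)
from finn.custom_op.registry import getCustomOp

model = model.transform(AnnotateResources("estimate"))
# network-wide totals, stored as a stringified dict in the model metadata
print(model.get_metadata_prop("res_total_estimate"))
# per-node figures, stored as a node attribute
for node in model.graph.node:
    print(node.name, getCustomOp(node).get_nodeattr("res_estimate"))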
diff --git a/src/finn/transformation/fpgadataflow/codegen_ipgen.py b/src/finn/transformation/fpgadataflow/codegen_ipgen.py
index 30bb6eb90a515615ff333f7858da9f7f3130986d..fa7725ae1fa03cc204aa58969d6fbc6cf71e7d97 100644
--- a/src/finn/transformation/fpgadataflow/codegen_ipgen.py
+++ b/src/finn/transformation/fpgadataflow/codegen_ipgen.py
@@ -47,7 +47,7 @@ def _codegen_single_node(node, model, fpgapart, clk):
         # ensure that there is a directory
         if code_gen_dir == "" or not os.path.isdir(code_gen_dir):
             code_gen_dir = make_build_dir(
-                prefix="code_gen_ipgen_" + str(node.op_type) + "_"
+                prefix="code_gen_ipgen_" + str(node.name) + "_"
             )
             inst.set_nodeattr("code_gen_dir_ipgen", code_gen_dir)
         # ensure that there is generated code inside the dir
diff --git a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
index 0fbd83199d88ec68cbf11c6ded5af33fdd4d91a3..f482db793018933883a068bb16fd99ece671064b 100644
--- a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
+++ b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
@@ -176,7 +176,8 @@ class CodeGen_ipstitch(Transformation):
         tcl.append("set all_v_files [get_files -filter {FILE_TYPE == Verilog}]")
         v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir
         tcl.append("set fp [open %s w]" % v_file_list)
-        tcl.append("puts $fp $all_v_files")
+        # write each verilog filename to all_verilog_srcs.txt, one per line
+        tcl.append("foreach vf $all_v_files {puts $fp $vf}")
         tcl.append("close $fp")
         # write the project creator tcl script
         tcl_string = "\n".join(tcl) + "\n"
diff --git a/src/finn/transformation/fpgadataflow/codegen_npysim.py b/src/finn/transformation/fpgadataflow/codegen_npysim.py
index 178601c31a52c4cef5a17f63144a1d091a577d3e..02200e76db3f9c8207605bb93c4b07f0ebc76cab 100644
--- a/src/finn/transformation/fpgadataflow/codegen_npysim.py
+++ b/src/finn/transformation/fpgadataflow/codegen_npysim.py
@@ -47,7 +47,7 @@ def _codegen_single_node(node, model):
         # ensure that there is a directory
         if code_gen_dir == "" or not os.path.isdir(code_gen_dir):
             code_gen_dir = make_build_dir(
-                prefix="code_gen_npysim_" + str(node.op_type) + "_"
+                prefix="code_gen_npysim_" + str(node.name) + "_"
             )
             inst.set_nodeattr("code_gen_dir_npysim", code_gen_dir)
         # ensure that there is generated code inside the dir
diff --git a/src/finn/transformation/fpgadataflow/compile.py b/src/finn/transformation/fpgadataflow/compile.py
index 0b1489e61aca04ae7b771ed42b1deb6d90f446bf..40c7da8f77efeaa655459402699a401b642b776c 100644
--- a/src/finn/transformation/fpgadataflow/compile.py
+++ b/src/finn/transformation/fpgadataflow/compile.py
@@ -28,45 +28,48 @@
 
 import finn.custom_op.registry as registry
 from finn.util.fpgadataflow import is_fpgadataflow_node
-from finn.transformation import Transformation
+from finn.transformation import NodeLocalTransformation
 
 
-class Compile(Transformation):
+class Compile(NodeLocalTransformation):
     """For every node: compile C++ code in node attribute "code_gen_dir_npysim"
     and save path to executables in node attribute "executable_path".
     All nodes in the graph must have the fpgadataflow backend attribute.
 
     To use these executables, exec_mode must be set to "npysim" (using transformation
     SetExecMode) and the model has to be executed using execute_onnx() from
-    finn.core.onnx_exec"""
+    finn.core.onnx_exec
 
-    def __init__(self):
-        super().__init__()
+    * num_workers (int or None): number of parallel workers; see the
+      NodeLocalTransformation documentation for more details.
+    """
 
-    def apply(self, model):
-        for node in model.graph.node:
-            op_type = node.op_type
-            if is_fpgadataflow_node(node) is True:
-                try:
-                    # lookup op_type in registry of CustomOps
-                    inst = registry.custom_op[op_type](node)
-                    # ensure that code is generated
-                    assert (
-                        inst.get_nodeattr("code_gen_dir_npysim") != ""
-                    ), """Node
-                        attribute "code_gen_dir_npysim" is not set. Please run
-                        Transformation CodeGen_npysim first."""
-                    # call the compilation function for this node
-                    inst.compile_singlenode_code()
-                    # ensure that executable path is now set
-                    assert (
-                        inst.get_nodeattr("executable_path") != ""
-                    ), """Transformation
-                        compile was not successful, there is no path to executables set
-                        in node attribute "executable_path"."""
-                except KeyError:
-                    # exception if op_type is not supported
-                    raise Exception(
-                        "Custom op_type %s is currently not supported." % op_type
-                    )
-        return (model, False)
+    def __init__(self, num_workers=None):
+        super().__init__(num_workers=num_workers)
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                inst = registry.custom_op[op_type](node)
+                # ensure that code is generated
+                assert (
+                    inst.get_nodeattr("code_gen_dir_npysim") != ""
+                ), """Node
+                attribute "code_gen_dir_npysim" is not set. Please run
+                Transformation CodeGen_npysim first."""
+                # call the compilation function for this node
+                inst.compile_singlenode_code()
+                # ensure that executable path is now set
+                assert (
+                    inst.get_nodeattr("executable_path") != ""
+                ), """Transformation
+                compile was not successful, there is no path to executables set
+                in node attribute "executable_path"."""
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
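Since Compile is now node-local, the npysim builds of independent layers can run in parallel; a usage sketch:

from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim
from finn.transformation.fpgadataflow.compile import Compile

model = model.transform(CodeGen_npysim())
# four compiler processes; Compile() with no argument honors
# NUM_DEFAULT_WORKERS and otherwise falls back to a single worker
model = model.transform(Compile(num_workers=4))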
diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py b/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py
index e90a6debeedb4553b825baea94a73425c5e2615c..2af623818fe0e830883ef5065e5e7c9c7364ef1e 100644
--- a/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py
+++ b/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py
@@ -28,45 +28,48 @@
 
 import finn.custom_op.registry as registry
 from finn.util.fpgadataflow import is_fpgadataflow_node
-from finn.transformation import Transformation
+from finn.transformation import NodeLocalTransformation
 
 
-class HLSSynth_IPGen(Transformation):
+class HLSSynth_IPGen(NodeLocalTransformation):
     """For each node: generate IP block from code in folder
     that is referenced in node attribute "code_gen_dir_ipgen"
     and save path of generated project in node attribute "ipgen_path".
     All nodes in the graph must have the fpgadataflow backend attribute.
 
     This transformation calls Vivado HLS for synthesis, so it will run for
-    some time (several minutes)"""
+    some time (several minutes)
 
-    def __init__(self):
-        super().__init__()
+    * num_workers (int or None): number of parallel workers; see the
+      NodeLocalTransformation documentation for more details.
+    """
 
-    def apply(self, model):
-        for node in model.graph.node:
-            op_type = node.op_type
-            if is_fpgadataflow_node(node) is True:
-                try:
-                    # lookup op_type in registry of CustomOps
-                    inst = registry.custom_op[op_type](node)
-                    # ensure that code is generated
-                    assert (
-                        inst.get_nodeattr("code_gen_dir_ipgen") != ""
-                    ), """Node
-                        attribute "code_gen_dir_ipgen" is empty. Please run
-                        transformation CodeGen_ipgen first."""
-                    # call the compilation function for this node
-                    inst.ipgen_singlenode_code()
-                    # ensure that executable path is now set
-                    assert (
-                        inst.get_nodeattr("ipgen_path") != ""
-                    ), """Transformation
-                        HLSSynth_IPGen was not successful. Node attribute "ipgen_path"
-                        is empty."""
-                except KeyError:
-                    # exception if op_type is not supported
-                    raise Exception(
-                        "Custom op_type %s is currently not supported." % op_type
-                    )
-        return (model, False)
+    def __init__(self, num_workers=None):
+        super().__init__(num_workers=num_workers)
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                inst = registry.custom_op[op_type](node)
+                # ensure that code is generated
+                assert (
+                    inst.get_nodeattr("code_gen_dir_ipgen") != ""
+                ), """Node
+                attribute "code_gen_dir_ipgen" is empty. Please run
+                transformation CodeGen_ipgen first."""
+                # call the compilation function for this node
+                inst.ipgen_singlenode_code()
+                # ensure that the ipgen project path is now set
+                assert (
+                    inst.get_nodeattr("ipgen_path") != ""
+                ), """Transformation
+                HLSSynth_IPGen was not successful. Node attribute "ipgen_path"
+                is empty."""
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index c2c3802635ba8b1be9bf7f0c71e48ad13b79771f..9921ce7caf2aaffd197f9bc863ab77502a963647 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -113,11 +113,15 @@ class MakePYNQProject(Transformation):
         # create a temporary folder for the project
         vivado_pynq_proj_dir = make_build_dir(prefix="vivado_pynq_proj_")
         model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir)
+        # filename for the synth utilization report
+        synth_report_filename = vivado_pynq_proj_dir + "/synth_report.xml"
+        model.set_metadata_prop("vivado_synth_rpt", synth_report_filename)
 
         ip_config_tcl = templates.ip_config_tcl_template % (
             vivado_pynq_proj_dir,
             ip_dirs_str,
             vivado_pynq_proj_dir,
+            synth_report_filename,
             vivado_stitch_vlnv,
             in_bytes,
             out_bytes,
diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
new file mode 100644
index 0000000000000000000000000000000000000000..4474831381425268d2a59e7de835bba31c55a733
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.custom_op.registry as registry
+import finn.util.basic as util
+from finn.transformation import NodeLocalTransformation
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class PrepareRTLSim(NodeLocalTransformation):
+    """For a graph with generated RTL sources (after HLSSynth_IPGen), create a
+    Verilator emulation library for each node to prepare for rtlsim
+    execution and set the rtlsim_so property to the path to the generated
+    emulation library.
+
+    To use these libraries, exec_mode must be set to "rtlsim" (using
+    SetExecMode) and the model has to be executed using execute_onnx() from
+    finn.core.onnx_exec
+
+    * num_workers (int or None): number of parallel workers; see the
+      NodeLocalTransformation documentation for more details.
+    """
+
+    def __init__(self, num_workers=None):
+        super().__init__(num_workers=num_workers)
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if node.domain == "finn":
+            backend_attribute = util.get_by_name(node.attribute, "backend")
+            if backend_attribute is not None:
+                backend_value = backend_attribute.s.decode("UTF-8")
+                if backend_value == "fpgadataflow":
+                    try:
+                        # lookup op_type in registry of CustomOps
+                        inst = registry.custom_op[op_type](node)
+                        inst.prepare_rtlsim()
+                        # ensure that the rtlsim_so attribute is now set
+                        assert (
+                            inst.get_nodeattr("rtlsim_so") != ""
+                        ), "Failed to prepare RTLSim, no rtlsim_so attribute found."
+                    except KeyError:
+                        # exception if op_type is not supported
+                        raise Exception(
+                            "Custom op_type %s is currently not supported." % op_type
+                        )
+        return (node, False)
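PrepareRTLSim slots in after HLS synthesis and before rtlsim execution; the intended ordering, mirroring the end2end tests further below:

from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim

model = model.transform(SetExecMode("rtlsim"))
# builds one Verilator emulation library per node, in parallel
model = model.transform(PrepareRTLSim())
# execute_onnx(model, inp_dict) will now reuse the cached rtlsim_so libraries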
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index edbf28c4e9d49129d22da12985f3b8c003e3d745..81cb954bb4503c8daf18bad5881661018e9d17b7 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -38,6 +38,7 @@ variable config_ip_use_axilite
 variable config_ip_project_dir
 variable config_output_products_dir
 variable config_remote_cache
+variable config_util_report_filename
 
 # for arguments involving paths below: use absolute paths or relative to the
 # platform/overlay/bitstream folder
@@ -47,6 +48,8 @@ set config_ip_project_dir %s
 set config_ip_repo %s
 # where the produced bitfile and .hwh file will be placed
 set config_output_products_dir %s
+# where the synth util XML report will be written
+set config_util_report_filename %s
 
 # non-path arguments
 # VLNV of the IP block
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a30fd93cc0bdc322b6ec7d892d42d3c3ca96fd6
--- /dev/null
+++ b/src/finn/transformation/move_reshape.py
@@ -0,0 +1,40 @@
+from finn.transformation import Transformation
+from finn.util.basic import get_by_name
+
+
+def _is_fpgadataflow_node(node):
+    if node is None:
+        return False
+    if node.domain != "finn":
+        return False
+    n_backend = get_by_name(node.attribute, "backend")
+    if n_backend is None:
+        return False
+    return n_backend.s.decode("UTF-8") == "fpgadataflow"
+
+
+class MoveReshape(Transformation):
+    """Removes a node that implements a (1, -1) reshape if it is
+    between two fpgadataflow nodes"""
+
+    def apply(self, model):
+
+        graph = model.graph
+        graph_modified = False
+        for n in graph.node:
+            if n.op_type == "Reshape":
+                shape = model.get_initializer(n.input[1])
+                if shape is not None and (shape == [1, -1]).all():
+                    producer = model.find_producer(n.input[0])
+                    if _is_fpgadataflow_node(producer) is True:
+                        consumer = model.find_consumer(n.output[0])
+                        if _is_fpgadataflow_node(consumer) is True:
+                            # rewire the consumer to bypass the Reshape,
+                            # then remove the node itself
+                            consumer.input[0] = n.input[0]
+                            graph.node.remove(n)
+                            graph_modified = True
+
+        return (model, graph_modified)
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index f99a453d05d7cb3c824784e80103b6021f072a79..4eb0e6cb874f80620e3cb25017abcc29368b261b 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -44,6 +44,17 @@ pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1"
 pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e"
 
 
+def get_num_default_workers():
+    """Return the number of workers for parallel transformations. Controllable
+    via the NUM_DEFAULT_WORKERS environment variable. If the variable is
+    undefined, the default value of 1 is returned.
+    """
+
+    try:
+        return int(os.environ["NUM_DEFAULT_WORKERS"])
+    except KeyError:
+        return 1
+
 
 def get_finn_root():
     "Return the root directory that FINN is cloned into."
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 1d919de5d55363bbe71f0dfc44ca6fe3025f5a4a..e67638bd1ca81d933fbfbffec9efcd7f84fa961e 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -275,6 +275,7 @@ def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=Tru
     finn.util.basic.pack_innermost_dim_as_hex_string() for more info on how the
     packing works. If reverse_inner is set, the innermost dimension will be
     reversed prior to packing."""
+    pad_to_nbits = roundup_to_integer_multiple(pad_to_nbits, 4)
     if issubclass(type(input_file), np.ndarray):
         inp = input_file
     elif os.path.isfile(input_file):
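Rounding pad_to_nbits up to a multiple of 4 matches the hex-string packing used downstream: every hex digit covers exactly one nibble. A toy illustration of the rounding, not the FINN packing code itself:

def pad_width_for_hex(nbits):
    # each hex digit encodes 4 bits, so round the width up to nibble units
    return -(-nbits // 4) * 4  # ceiling division

assert pad_width_for_hex(7) == 8    # needs two hex digits
assert pad_width_for_hex(12) == 12  # already nibble-aligned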
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index a53761341bf95f0210a1f7c6e06bcf7c37a33e47..e84532d8d24909cc5add09fbc623a13c955ffb72 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -29,8 +29,11 @@
 import os
 import subprocess
 
-from pyverilator import PyVerilator
-from finn.util.basic import get_by_name
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+from finn.util.basic import get_by_name, make_build_dir
 
 
 class IPGenBuilder:
@@ -70,6 +73,9 @@ class IPGenBuilder:
 
 def pyverilate_stitched_ip(model):
     "Given a model with stitched IP, return a PyVerilator sim object."
+    if PyVerilator is None:
+        raise ImportError("Installation of PyVerilator is required.")
+
     vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
     with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
         all_verilog_srcs = f.read().split()
@@ -79,7 +85,10 @@ def pyverilate_stitched_ip(model):
 
     all_verilog_dirs = list(map(file_to_dir, all_verilog_srcs))
     top_verilog = model.get_metadata_prop("wrapper_filename")
-    sim = PyVerilator.build(top_verilog, verilog_path=all_verilog_dirs)
+    build_dir = make_build_dir("pyverilator_ipstitched_")
+    sim = PyVerilator.build(
+        top_verilog, verilog_path=all_verilog_dirs, build_dir=build_dir
+    )
     return sim
 
 
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
new file mode 100644
index 0000000000000000000000000000000000000000..53f34d4d772a458eed3d417cdeb8a962338b099c
--- /dev/null
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -0,0 +1,330 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+
+import numpy as np
+
+# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
+# import pytorch before onnx, so we make sure to import onnx first
+import onnx  # NOQA
+
+import pytest
+import pkg_resources as pk
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.registry import getCustomOp
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.move_reshape import MoveReshape
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.streamline import Streamline
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
+from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim
+from finn.transformation.fpgadataflow.compile import Compile
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.util.basic import pynq_part_map
+from finn.util.test import get_test_model_trained
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 5
+mem_mode = "const"
+
+
+def test_end2end_cnv_w1a1_export():
+    import brevitas.onnx as bo
+
+    cnv = get_test_model_trained("CNV", 1, 1)
+    bo.export_finn_onnx(
+        cnv, (1, 3, 32, 32), build_dir + "/end2end_cnv_w1a1_export.onnx"
+    )
+
+
+def test_end2end_cnv_w1a1_import_and_tidy():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_export.onnx")
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(build_dir + "/end2end_cnv_w1a1_tidy.onnx")
+
+
+def test_end2end_cnv_w1a1_streamline():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_tidy.onnx")
+    model = model.transform(Streamline())
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(MakeMaxPoolNHWC())
+    model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(ConvertBipolarMatMulToXnorPopcount())
+    model = model.transform(Streamline())
+    model.save(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
+
+
+def test_end2end_cnv_w1a1_convert_to_hls_layers():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
+    model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferConvInpGen())
+    model = model.transform(to_hls.InferStreamingMaxPool())
+    model = model.transform(MoveReshape())
+    model.save(build_dir + "/end2end_cnv_w1a1_hls_layers.onnx")
+
+
+def test_end2end_cnv_w1a1_create_dataflow_partition():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_hls_layers.onnx")
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = ModelWrapper(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")
+
+
+def test_end2end_cnv_w1a1_fold_and_tlastmarker():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc0w = getCustomOp(fc_layers[0])
+    fc1w = getCustomOp(fc_layers[1])
+    fc2w = getCustomOp(fc_layers[2])
+    fc3w = getCustomOp(fc_layers[3])
+    fc4w = getCustomOp(fc_layers[4])
+    fc5w = getCustomOp(fc_layers[5])
+    fc6w = getCustomOp(fc_layers[6])
+    fc7w = getCustomOp(fc_layers[7])
+    fc8w = getCustomOp(fc_layers[8])
+    fc0w.set_nodeattr("SIMD", 27)
+    fc0w.set_nodeattr("PE", 8)
+    fc1w.set_nodeattr("SIMD", 32)
+    fc1w.set_nodeattr("PE", 8)
+    fc2w.set_nodeattr("SIMD", 32)
+    fc2w.set_nodeattr("PE", 16)
+    fc3w.set_nodeattr("SIMD", 32)
+    fc3w.set_nodeattr("PE", 16)
+    fc4w.set_nodeattr("SIMD", 32)
+    fc4w.set_nodeattr("PE", 32)
+    fc5w.set_nodeattr("SIMD", 64)
+    fc5w.set_nodeattr("PE", 16)
+    fc6w.set_nodeattr("SIMD", 32)
+    fc6w.set_nodeattr("PE", 16)
+    fc7w.set_nodeattr("SIMD", 64)
+    fc7w.set_nodeattr("PE", 8)
+    fc8w.set_nodeattr("SIMD", 16)
+    fc8w.set_nodeattr("PE", 10)
+
+    model = model.transform(InsertDWC())
+    model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(AnnotateResources("estimate"))
+    model.save(build_dir + "/end2end_cnv_w1a1_folded.onnx")
+
+
+def test_end2end_cnv_w1a1_gen_hls_ip():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_folded.onnx")
+    model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynth_IPGen())
+    model = model.transform(AnnotateResources("hls"))
+    model.save(build_dir + "/end2end_cnv_w1a1_ipgen.onnx")
+
+
+def test_end2end_cnv_w1a1_ip_stitch():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipgen.onnx")
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(CodeGen_ipstitch(test_fpga_part))
+    model.save(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
+
+
+def test_end2end_cnv_w1a1_verify_dataflow_part():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
+    x = np.zeros((1, 32, 32, 3), dtype=np.float32)
+    inp_name = model.graph.input[0].name
+    out_name = model.graph.output[0].name
+    inp_dict = {inp_name: x}
+    # npysim
+    model = model.transform(CodeGen_npysim())
+    model = model.transform(Compile())
+    model = model.transform(SetExecMode("npysim"))
+    model.save(build_dir + "/end2end_cnv_w1a1_ipgen_npysim.onnx")
+    ret_npysim = execute_onnx(model, inp_dict, True)
+    res_npysim = ret_npysim[out_name]
+    # node-by-node rtlsim
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareRTLSim())
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    for fcl in fc_layers:
+        getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default")
+    model.save(build_dir + "/end2end_cnv_w1a1_ipgen_nodebynode_rtlsim.onnx")
+    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
+    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
+    # whole-network (ip-stitched) rtlsim
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
+    model.save(build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx")
+    # this is a particularly long-running test, set liveness thr. to unlimited
+    os.environ["LIVENESS_THRESHOLD"] = "-1"
+    ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
+    res_rtlsim_whole = ret_rtlsim_whole[out_name]
+    assert np.isclose(res_npysim, res_rtlsim_nodebynode).all()
+    assert np.isclose(res_npysim, res_rtlsim_whole).all()
+
+
+def test_end2end_cnv_w1a1_verify_all():
+    # use the streamlined model as the "golden" model for right answers
+    golden = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    # load one of the test vectors
+    fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
+    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    assert input_tensor.shape == (1, 3, 32, 32)
+    x = input_tensor
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx")
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    # produce results with npysim
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w1a1_ipgen_npysim.onnx")
+    ret_npysim = execute_onnx(parent_model, {iname: x}, True)
+    y_npysim = ret_npysim[oname]
+    # produce results with node-by-node rtlsim
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_cnv_w1a1_ipgen_nodebynode_rtlsim.onnx"
+    )
+    ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
+    # produce results with whole-network (stitched ip) rtlsim
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx"
+    )
+    # this is a particularly long-running test, set liveness thr. to unlimited
+    os.environ["LIVENESS_THRESHOLD"] = "-1"
+    ret_whole_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_whole_rtlsim = ret_whole_rtlsim[oname]
+    assert np.isclose(y_golden, y_npysim).all()
+    assert np.isclose(y_golden, y_nodebynode_rtlsim).all()
+    assert np.isclose(y_golden, y_whole_rtlsim).all()
+
+
+def test_end2end_cnv_w1a1_make_pynq_proj():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
+    model = model.transform(MakePYNQProject(test_pynq_board))
+    model.save(build_dir + "/end2end_cnv_w1a1_pynq_project.onnx")
+
+
+def test_end2end_cnv_w1a1_synth_pynq_project():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_pynq_project.onnx")
+    model = model.transform(SynthPYNQProject())
+    model = model.transform(AnnotateResources("synth"))
+    model.save(build_dir + "/end2end_cnv_w1a1_synth.onnx")
+
+
+def test_end2end_cnv_w1a1_make_driver():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_synth.onnx")
+    model = model.transform(MakePYNQDriver())
+    model.save(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx")
+
+
+def test_end2end_cnv_w1a1_deploy_on_pynq():
+    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx")
+    try:
+        ip = os.environ["PYNQ_IP"]  # no fault for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_cnv_w1a1_run_on_pynq():
+    # use the streamlined model as the "golden" model for right answers
+    golden = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    # load one of the test vectors
+    fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
+    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    assert input_tensor.shape == (1, 3, 32, 32)
+    x = input_tensor
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx")
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # produce results with npysim
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx")
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py
index 9cd338caa69913dbcd1a1b66758fd633b94260ad..8a670fce2e7e6585c98efa9e4a6e27a660edf925 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -70,12 +70,14 @@ from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 5
-mem_mode = "const"
+mem_mode = "decoupled"
 
 
 def test_end2end_tfc_w1a1_export():
@@ -117,7 +119,8 @@ def test_end2end_tfc_w1a1_create_dataflow_partition():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_hls_layers.onnx")
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx")
-    sdp_node = getCustomOp(parent_model.graph.node[2])
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
     dataflow_model_filename = sdp_node.get_nodeattr("model")
     dataflow_model = ModelWrapper(dataflow_model_filename)
     dataflow_model.save(build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx")
@@ -125,18 +128,16 @@ def test_end2end_tfc_w1a1_create_dataflow_partition():
 
 def test_end2end_tfc_w1a1_fold_and_tlastmarker():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx")
-    fc0 = model.graph.node[0]
-    fc1 = model.graph.node[1]
-    fc2 = model.graph.node[2]
-    fc3 = model.graph.node[3]
-    fc0w = getCustomOp(fc0)
-    fc1w = getCustomOp(fc1)
-    fc2w = getCustomOp(fc2)
-    fc3w = getCustomOp(fc3)
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc0w = getCustomOp(fc_layers[0])
+    fc1w = getCustomOp(fc_layers[1])
+    fc2w = getCustomOp(fc_layers[2])
+    fc3w = getCustomOp(fc_layers[3])
     fc0w.set_nodeattr("inFIFODepth", 50)
     fc0w.set_nodeattr("SIMD", 16)
     fc0w.set_nodeattr("PE", 16)
     fc0w.set_nodeattr("outFIFODepth", 4)
+    fc0w.set_nodeattr("ram_style", "block")
     fc1w.set_nodeattr("SIMD", 8)
     fc1w.set_nodeattr("PE", 8)
     fc1w.set_nodeattr("outFIFODepth", 4)
@@ -146,16 +151,21 @@ def test_end2end_tfc_w1a1_fold_and_tlastmarker():
     fc3w.set_nodeattr("SIMD", 16)
     fc3w.set_nodeattr("PE", 10)
     fc3w.set_nodeattr("outFIFODepth", 50)
+    fc3w.set_nodeattr("ram_style", "distributed")
     model = model.transform(InsertDWC())
     model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
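+    # annotate each node with analytical resource cost estimates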
+    model = model.transform(AnnotateResources("estimate"))
     model.save(build_dir + "/end2end_tfc_w1a1_folded.onnx")
 
 
 def test_end2end_tfc_w1a1_gen_hls_ip():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_folded.onnx")
-    model = model.transform(GiveUniqueNodeNames())
     model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynth_IPGen())
+    model = model.transform(AnnotateResources("hls"))
     model.save(build_dir + "/end2end_tfc_w1a1_ipgen.onnx")
 
 
@@ -181,10 +191,11 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     res_npysim = ret_npysim[out_name]
     # node-by-node rtlsim
     model = model.transform(SetExecMode("rtlsim"))
-    getCustomOp(model.graph.node[0]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[1]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[2]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[3]).set_nodeattr("rtlsim_trace", "default")
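+    # build the pyverilator simulation model for each node ahead of execution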
+    model = model.transform(PrepareRTLSim())
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    for fcl in fc_layers:
+        getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default")
     model.save(build_dir + "/end2end_tfc_w1a1_ipstitch_nodebynode_rtlsim.onnx")
     ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
     res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
@@ -215,7 +226,8 @@ def test_end2end_tfc_w1a1_verify_all():
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     # produce results with npysim
-    sdp_node = getCustomOp(parent_model.graph.node[2])
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a1_ipstitch_npysim.onnx")
     ret_npysim = execute_onnx(parent_model, {iname: x}, True)
     y_npysim = ret_npysim[oname]
@@ -245,6 +257,7 @@ def test_end2end_tfc_w1a1_make_pynq_proj():
 def test_end2end_tfc_w1a1_synth_pynq_project():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_pynq_project.onnx")
     model = model.transform(SynthPYNQProject())
+    model = model.transform(AnnotateResources("synth"))
     model.save(build_dir + "/end2end_tfc_w1a1_synth.onnx")
 
 
@@ -292,7 +305,8 @@ def test_end2end_tfc_w1a1_run_on_pynq():
         if ip == "":
             pytest.skip("PYNQ board IP address not specified")
         # produce results with npysim
-        sdp_node = getCustomOp(parent_model.graph.node[2])
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
         sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a1_pynq_deploy.onnx")
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py
index e3eead8454e901671ae27d62a3b1999c59f176a8..b55d985e07ac40fc875c49ba201c9552fd62c411 100644
--- a/tests/end2end/test_end2end_tfc_w1a2.py
+++ b/tests/end2end/test_end2end_tfc_w1a2.py
@@ -66,6 +66,8 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -109,7 +111,8 @@ def test_end2end_tfc_w1a2_create_dataflow_partition():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_hls_layers.onnx")
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx")
-    sdp_node = getCustomOp(parent_model.graph.node[2])
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
     dataflow_model_filename = sdp_node.get_nodeattr("model")
     dataflow_model = ModelWrapper(dataflow_model_filename)
     dataflow_model.save(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx")
@@ -117,14 +120,11 @@ def test_end2end_tfc_w1a2_create_dataflow_partition():
 
 def test_end2end_tfc_w1a2_fold_and_tlastmarker():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx")
-    fc0 = model.graph.node[0]
-    fc1 = model.graph.node[1]
-    fc2 = model.graph.node[2]
-    fc3 = model.graph.node[3]
-    fc0w = getCustomOp(fc0)
-    fc1w = getCustomOp(fc1)
-    fc2w = getCustomOp(fc2)
-    fc3w = getCustomOp(fc3)
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc0w = getCustomOp(fc_layers[0])
+    fc1w = getCustomOp(fc_layers[1])
+    fc2w = getCustomOp(fc_layers[2])
+    fc3w = getCustomOp(fc_layers[3])
     fc0w.set_nodeattr("inFIFODepth", 50)
     fc0w.set_nodeattr("SIMD", 8)
     fc0w.set_nodeattr("PE", 16)
@@ -139,14 +139,16 @@ def test_end2end_tfc_w1a2_fold_and_tlastmarker():
     fc3w.set_nodeattr("PE", 10)
     fc3w.set_nodeattr("outFIFODepth", 50)
     model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(AnnotateResources("estimate"))
     model.save(build_dir + "/end2end_tfc_w1a2_folded.onnx")
 
 
 def test_end2end_tfc_w1a2_gen_hls_ip():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_folded.onnx")
-    model = model.transform(GiveUniqueNodeNames())
     model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynth_IPGen())
+    model = model.transform(AnnotateResources("hls"))
     model.save(build_dir + "/end2end_tfc_w1a2_ipgen.onnx")
 
 
@@ -172,10 +174,10 @@ def test_end2end_tfc_w1a2_verify_dataflow_part():
     res_npysim = ret_npysim[out_name]
     # node-by-node rtlsim
     model = model.transform(SetExecMode("rtlsim"))
-    getCustomOp(model.graph.node[0]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[1]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[2]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[3]).set_nodeattr("rtlsim_trace", "default")
+    model = model.transform(PrepareRTLSim())
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    for fcl in fc_layers:
+        getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default")
     model.save(build_dir + "/end2end_tfc_w1a2_ipstitch_nodebynode_rtlsim.onnx")
     ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
     res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
@@ -206,7 +208,8 @@ def test_end2end_tfc_w1a2_verify_all():
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     # produce results with npysim
-    sdp_node = getCustomOp(parent_model.graph.node[2])
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a2_ipstitch_npysim.onnx")
     ret_npysim = execute_onnx(parent_model, {iname: x}, True)
     y_npysim = ret_npysim[oname]
@@ -236,6 +239,7 @@ def test_end2end_tfc_w1a2_make_pynq_proj():
 def test_end2end_tfc_w1a2_synth_pynq_project():
     model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_pynq_project.onnx")
     model = model.transform(SynthPYNQProject())
+    model = model.transform(AnnotateResources("synth"))
     model.save(build_dir + "/end2end_tfc_w1a2_synth.onnx")
 
 
@@ -283,7 +287,8 @@ def test_end2end_tfc_w1a2_run_on_pynq():
         if ip == "":
             pytest.skip("PYNQ board IP address not specified")
         # produce results with npysim
-        sdp_node = getCustomOp(parent_model.graph.node[2])
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
         sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx")
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py
index 84133bb6c3c32a81190ce0f8b7b4b5d3de64d079..92b8b18bc0253a07eec988c2bace9a9178682147 100644
--- a/tests/end2end/test_end2end_tfc_w2a2.py
+++ b/tests/end2end/test_end2end_tfc_w2a2.py
@@ -66,6 +66,8 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -109,7 +111,8 @@ def test_end2end_tfc_w2a2_create_dataflow_partition():
     model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_hls_layers.onnx")
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx")
-    sdp_node = getCustomOp(parent_model.graph.node[2])
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
     dataflow_model_filename = sdp_node.get_nodeattr("model")
     dataflow_model = ModelWrapper(dataflow_model_filename)
     dataflow_model.save(build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx")
@@ -117,14 +120,11 @@ def test_end2end_tfc_w2a2_create_dataflow_partition():
 
 def test_end2end_tfc_w2a2_fold_and_tlastmarker():
     model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx")
-    fc0 = model.graph.node[0]
-    fc1 = model.graph.node[1]
-    fc2 = model.graph.node[2]
-    fc3 = model.graph.node[3]
-    fc0w = getCustomOp(fc0)
-    fc1w = getCustomOp(fc1)
-    fc2w = getCustomOp(fc2)
-    fc3w = getCustomOp(fc3)
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc0w = getCustomOp(fc_layers[0])
+    fc1w = getCustomOp(fc_layers[1])
+    fc2w = getCustomOp(fc_layers[2])
+    fc3w = getCustomOp(fc_layers[3])
     fc0w.set_nodeattr("inFIFODepth", 50)
     fc0w.set_nodeattr("SIMD", 8)
     fc0w.set_nodeattr("PE", 16)
@@ -139,14 +139,16 @@ def test_end2end_tfc_w2a2_fold_and_tlastmarker():
     fc3w.set_nodeattr("PE", 10)
     fc3w.set_nodeattr("outFIFODepth", 50)
     model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(AnnotateResources("estimate"))
     model.save(build_dir + "/end2end_tfc_w2a2_folded.onnx")
 
 
 def test_end2end_tfc_w2a2_gen_hls_ip():
     model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_folded.onnx")
-    model = model.transform(GiveUniqueNodeNames())
     model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynth_IPGen())
+    model = model.transform(AnnotateResources("hls"))
     model.save(build_dir + "/end2end_tfc_w2a2_ipgen.onnx")
 
 
@@ -172,10 +174,10 @@ def test_end2end_tfc_w2a2_verify_dataflow_part():
     res_npysim = ret_npysim[out_name]
     # node-by-node rtlsim
     model = model.transform(SetExecMode("rtlsim"))
-    getCustomOp(model.graph.node[0]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[1]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[2]).set_nodeattr("rtlsim_trace", "default")
-    getCustomOp(model.graph.node[3]).set_nodeattr("rtlsim_trace", "default")
+    model = model.transform(PrepareRTLSim())
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    for fcl in fc_layers:
+        getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default")
     model.save(build_dir + "/end2end_tfc_w2a2_ipstitch_nodebynode_rtlsim.onnx")
     ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
     res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
@@ -206,7 +208,8 @@ def test_end2end_tfc_w2a2_verify_all():
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     # produce results with npysim
-    sdp_node = getCustomOp(parent_model.graph.node[2])
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w2a2_ipstitch_npysim.onnx")
     ret_npysim = execute_onnx(parent_model, {iname: x}, True)
     y_npysim = ret_npysim[oname]
@@ -236,6 +239,7 @@ def test_end2end_tfc_w2a2_make_pynq_proj():
 def test_end2end_tfc_w2a2_synth_pynq_project():
     model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_pynq_project.onnx")
     model = model.transform(SynthPYNQProject())
+    model = model.transform(AnnotateResources("synth"))
     model.save(build_dir + "/end2end_tfc_w2a2_synth.onnx")
 
 
@@ -283,7 +287,8 @@ def test_end2end_tfc_w2a2_run_on_pynq():
         if ip == "":
             pytest.skip("PYNQ board IP address not specified")
         # produce results with npysim
-        sdp_node = getCustomOp(parent_model.graph.node[2])
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
         sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w2a2_pynq_deploy.onnx")
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 92d98f57c62aeb93fc17091c37214a62e78ebb8f..2ec47915b01c92c7b7c11d0cf160543fb71dd27d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -38,6 +38,7 @@ from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim
 from finn.transformation.fpgadataflow.compile import Compile
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 
@@ -152,6 +153,7 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode):
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
         model = model.transform(HLSSynth_IPGen())
+        model = model.transform(PrepareRTLSim())
     else:
         raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index 651df836ab4dfd320e4c67ff3dd49f31ec13c110..1465881830b4fec61d1b1aa6e8465a41766fd9de 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -7,6 +7,7 @@ from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 import finn.core.onnx_exec as oxe
@@ -66,6 +67,7 @@ def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype):
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
     model = model.transform(HLSSynth_IPGen())
+    model = model.transform(PrepareRTLSim())
     y = oxe.execute_onnx(model, input_dict)["outp"]
 
     assert (
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 80c9e84ba92c93e8a5d57ffaceb22b5abf188963..7552fecd85ee0e36216f6c934d454f057a2a41ce 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -44,7 +44,11 @@ from finn.transformation.fpgadataflow.compile import Compile
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -295,6 +299,102 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
     model = model.transform(HLSSynth_IPGen())
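+    # convert relative paths in the generated Verilog to absolute ones for rtlsim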
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(PrepareRTLSim())
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
+
+    hls_synth_res_est = model.analysis(hls_synth_res_estimation)
+    assert "StreamingFCLayer_Batch_0" in hls_synth_res_est
+
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType.INT4])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType.INT4])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT4])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [-1])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [128])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [128])
+def test_fpgadataflow_fclayer_large_depth_decoupled_mode(
+    mem_mode, idt, wdt, act, nf, sf, mw, mh
+):
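+    # nf = mh and sf = mw (from -1) give pe = simd = 1, i.e. maximal folding
+    # and hence the largest weight stream depth for decoupled mode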
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    if act is None:
+        # no activation, produce accumulators
+        T = None
+        tdt = None
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            odt = DataType.UINT32
+        else:
+            odt = DataType.INT32
+    else:
+        odt = act
+        (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+        # provide non-decreasing thresholds
+        T = np.sort(T, axis=1)
+        # generate thresholds for activation
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            tdt = DataType.UINT32
+            # bias thresholds to be positive
+            T = np.ceil((T + mw) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType.INT32
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+
+    # prepare input data
+    input_dict = prepare_inputs(x, idt, wdt)
+    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+        # convert inputs to binary and use xnorpopcountmatmul
+        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
+    else:
+        y = np.matmul(x, W)
+    if T is not None:
+        y = multithreshold(y, T)
+        if act == DataType.BIPOLAR:
+            # binary to bipolar
+            y = 2 * y - 1
+        else:
+            # signed offset
+            y += act.min()
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # TODO split up into several dependent tests -- need to check how this
+    # works for parametrized tests...
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynth_IPGen())
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(PrepareRTLSim())
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
     assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index 4a81977d49d174f66e1a02140a7643bd352db7a2..1c5ae02e4c662f48be4f7f70b9de24a1f9f72ecf 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -286,13 +286,13 @@ def test_fpgadataflow_ipstitch_pynq_driver():
 
 
 def test_fpgadataflow_ipstitch_pynq_deployment_folder():
-    model = ModelWrapper(
-        ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx"
-    )
     try:
         ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
         if ip == "":
             pytest.skip("PYNQ board IP address not specified")
+        model = ModelWrapper(
+            ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx"
+        )
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
@@ -319,13 +319,13 @@ def test_fpgadataflow_ipstitch_pynq_deployment_folder():
 
 
 def test_fpgadataflow_ipstitch_remote_execution():
-    model = ModelWrapper(
-        ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_deployment.onnx"
-    )
     try:
         ip = os.environ["PYNQ_IP"]  # NOQA
         if ip == "":
             pytest.skip("PYNQ board IP address not specified")
+        model = ModelWrapper(
+            ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_deployment.onnx"
+        )
         idt = DataType.INT2
         x = gen_finn_dt_tensor(idt, (1, 4))
         input_dict = {"inp": x}
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 0dd3fd7a9fefaaad9777ac98a35806a9eaa35188..38f792ed3cdd52044b28b4c19ac0603da4e502e6 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -92,7 +92,7 @@ def test_res_estimate():
     model = model.transform(GiveUniqueNodeNames())
     prod_resource_estimation = model.analysis(res_estimation)
     expect_resource_estimation = {
-        "StreamingFCLayer_Batch_0": ["BRAMs: 1", "LUTs: 304.4"]
+        "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, "LUT": 304.4}
     }
 
     assert check_two_dict_for_equality(
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index a9acebb3114059f988aa0f21cad70e617d3d6f77..a7a731aaa5593a9fd680061d2b8ad3fc47e9f490 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -38,6 +38,7 @@ from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim
 from finn.transformation.fpgadataflow.compile import Compile
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 
@@ -144,6 +145,7 @@ def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
         model = model.transform(HLSSynth_IPGen())
+        model = model.transform(PrepareRTLSim())
     else:
         raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")
 
diff --git a/tests/travis_install.sh b/tests/travis_install.sh
deleted file mode 100644
index 05ff95a6a2205d90a5813b41504e2105e36e2f92..0000000000000000000000000000000000000000
--- a/tests/travis_install.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# This script is meant to be called by the "install" step defined in
-# .travis.yml. See http://docs.travis-ci.com/ for more details.
-# The behavior of the script is controlled by environment variabled defined
-# in the .travis.yml in the top level folder of the project.
-#
-# This script is inspired by Scikit-Learn (http://scikit-learn.org/)
-#
-# THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS!
-
-set -e
-
-if [[ "$DISTRIB" == "conda" ]]; then
-    # Deactivate the travis-provided virtual environment and setup a
-    # conda-based environment instead
-    deactivate
-
-    if [[ -f "$HOME/miniconda/bin/conda" ]]; then
-        echo "Skip install conda [cached]"
-    else
-        # By default, travis caching mechanism creates an empty dir in the
-        # beginning of the build, but conda installer aborts if it finds an
-        # existing folder, so let's just remove it:
-        rm -rf "$HOME/miniconda"
-
-        # Use the miniconda installer for faster download / install of conda
-        # itself
-        wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-            -O miniconda.sh
-        chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda
-    fi
-    export PATH=$HOME/miniconda/bin:$PATH
-    # Make sure to use the most updated version
-    conda update --yes conda
-
-    # Configure the conda environment and put it in the path using the
-    # provided versions
-    # (prefer local venv, since the miniconda folder is cached)
-    conda create -p ./.venv --yes python=${PYTHON_VERSION} pip virtualenv
-    source activate ./.venv
-fi
-
-# for all
-pip install -U pip setuptools
-pip install tox
-
-if [[ "$COVERAGE" == "true" ]]; then
-    pip install -U pytest-cov pytest-virtualenv coverage coveralls flake8 pre-commit
-fi
-
-
-travis-cleanup() {
-    printf "Cleaning up environments ... "  # printf avoids new lines
-    if [[ "$DISTRIB" == "conda" ]]; then
-        # Force the env to be recreated next time, for build consistency
-        source deactivate
-        conda remove -p ./.venv --all --yes
-        rm -rf ./.venv
-    fi
-    echo "DONE"
-}