diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index fff5b9618de9c2e223c86bc9add2cf3990c5fb78..960abd675bbb185ce2fadfab954ec2b4fd6ff94e 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -20,4 +20,4 @@ jobs:
       - name: DockerRunQuicktest
         run: |
           docker build -t finn_gha -f docker/Dockerfile.finn_ci --build-arg BUILD_PATH=/tmp/finn_gha .
-          docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
+          docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
diff --git a/.gitignore b/.gitignore
index 0c1bbd84fe24be46446a7d714dd708d601813e53..225fb5cfa3df45124797da425df14974308b90c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,9 +39,10 @@
 __pycache__/*
 .cache/*
 .*.swp
-*/.ipynb_checkpoints/*
+*.ipynb_checkpoints*
 
 # Project files
+.vscode
 .ropeproject
 .project
 .pydevproject
@@ -88,6 +89,10 @@ MANIFEST
 # datasets for testing
 /dataset/
 /data/
+*.csv
 
 # Google Drive key for dashboard
 /gdrive-key/
+
+# generated files as part of end2end notebooks
+/notebooks/end2end_example/**/*.onnx
diff --git a/README.md b/README.md
index 6d485627a322c7b192c1e9ad5a1058487952b11a..473885184fdc252ca16d859f14d7c42ed82ba540 100644
--- a/README.md
+++ b/README.md
@@ -12,17 +12,19 @@ inference on FPGAs.
 It specifically targets <a href="https://github.com/maltanar/qnn-inference-examples" target="_blank">quantized neural
 networks</a>, with emphasis on
 generating dataflow-style architectures customized for each network.
-The resulting FPGA accelerators can yield very high classification rates, or conversely be run with a slow clock for very low power consumption.
+The resulting FPGA accelerators are highly efficient and can yield high throughput and low latency.
 The framework is fully open-source in order to give a higher degree of flexibility, and is intended to enable neural network research spanning several layers of the software/hardware abstraction stack.
 
-For more general information about FINN, please visit the [project page](https://xilinx.github.io/finn/), check out the [publications](https://xilinx.github.io/finn/publications) or some of the [demos](https://xilinx.github.io/finn/demos).
+We have a separate repository [finn-examples](https://github.com/Xilinx/finn-examples) that houses pre-built examples for several neural networks.
+For more general information about FINN, please visit the [project page](https://xilinx.github.io/finn/) and check out the [publications](https://xilinx.github.io/finn/publications).
 
 ## Getting Started
 
-Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html) page for more information on requirements, installation, and how to run FINN in different modes. Due to the complex nature of the dependencies of the project, we only support Docker-based deployment at this time.
+Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html) page for more information on requirements, installation, and how to run FINN in different modes. Due to the complex nature of the dependencies of the project, **we only support Docker-based execution of the FINN compiler at this time**.
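+
+As a quick illustration, the FINN compiler container can typically be launched with the helper script in the repo root (a sketch, not a substitute for the Getting Started docs; the Vivado path and modes shown are assumptions about your local setup):
+
+```shell
+export VIVADO_PATH=/opt/Xilinx/Vivado/2020.1   # assumption: point this at your Vivado install
+./run-docker.sh             # open a bash shell inside the FINN Docker container
+./run-docker.sh notebook    # alternatively, launch the Jupyter notebook server
+```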
 
 ## What's New in FINN?
 
+* **2020-12-17:** v0.5b (beta) is released, with a new [examples repo](https://github.com/Xilinx/finn-examples) including MobileNet-v1. Read more on the <a href="https://xilinx.github.io/finn/2020/12/17/finn-v05b-beta-is-released.html">release blog post</a>.
 * **2020-09-21:** v0.4b (beta) is released. Read more on the <a href="https://xilinx.github.io/finn/2020/09/21/finn-v04b-beta-is-released.html">release blog post</a>.
 * **2020-05-08:** v0.3b (beta) is released, with initial support for convolutions, parallel transformations, more flexible memory allocation for MVAUs, throughput testing and many other smaller improvements and bugfixes. Read more on the <a href="https://xilinx.github.io/finn/2020/05/08/finn-v03b-beta-is-released.html">release blog post</a>.
 * **2020-04-15:** FINN v0.2.1b (beta): use fixed commit versions for dependency repos, otherwise identical to 0.2b
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 3145d8776025f71aa32da71eb5d31ef472b885a7..69046127945791f6c21b9d5a9201f1ea550625b5 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -28,22 +28,20 @@
 
 FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
 MAINTAINER Yaman Umuroglu <yamanu@xilinx.com>
-ARG PYTHON_VERSION=3.6
-ARG BUILD_PATH
 
 WORKDIR /workspace
 
 RUN apt-get update
 RUN apt-get -y upgrade
 RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
-RUN apt-get install -y verilator zsh
-RUN apt-get -y install sshpass wget unzip
+RUN apt-get install -y verilator zsh nano
+RUN apt-get install -y sshpass wget unzip
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 
 # XRT deps
 RUN wget https://raw.githubusercontent.com/Xilinx/XRT/master/src/runtime_src/tools/scripts/xrtdeps.sh
 RUN apt-get update
-RUN bash xrtdeps.sh
+RUN bash xrtdeps.sh -docker
 RUN rm xrtdeps.sh
 
 # cloning dependency repos
@@ -63,22 +61,17 @@ RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-x
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 RUN rm requirements.txt
-RUN apt update; apt install nano
 RUN pip install pytest-dependency
 RUN pip install pytest-xdist
 RUN pip install pytest-parallel
 RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
 
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
-ENV VIVADO_IP_CACHE "$BUILD_PATH/vivado_ip_cache"
+ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache"
 ENV PATH "${PATH}:/workspace/oh-my-xilinx"
 ENV OHMYXILINX "/workspace/oh-my-xilinx"
 
 # colorful terminal output
 RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /root/.bashrc
-RUN mkdir -p $BUILD_PATH
-RUN mkdir -p $VIVADO_IP_CACHE
 
 WORKDIR /workspace/finn
 
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index c3017f172c6f357a4b59a3c4129c21b6a801fd49..46b3ffb255f6e3d4ff6e92d83b285e53be4beeeb 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -28,14 +28,11 @@
 
 FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
 MAINTAINER Yaman Umuroglu <yamanu@xilinx.com>
-ARG PYTHON_VERSION=3.6
 ARG GID
 ARG GNAME
 ARG UNAME
 ARG UID
 ARG PASSWD
-ARG JUPYTER_PORT
-ARG NETRON_PORT
 
 WORKDIR /workspace
 
@@ -86,8 +83,6 @@ RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-x
 
 # for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host
 # at /workspace/finn -- see run-docker.sh for an example of how to do this.
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
-ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PATH "${PATH}:/workspace/oh-my-xilinx:/home/$UNAME/.local/bin"
 ENV OHMYXILINX "/workspace/oh-my-xilinx"
 
@@ -101,10 +96,19 @@ COPY docker/finn_entrypoint.sh /usr/local/bin/
 COPY docker/quicktest.sh /usr/local/bin/
 RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
 RUN chmod 755 /usr/local/bin/quicktest.sh
-USER $UNAME
+# install vitis deps if required
+ARG INSTALL_XRT_DEPS
+RUN if [ "$INSTALL_XRT_DEPS" = "1" ] ; then \
+    echo "Installing XRT dependencies"; \
+    wget https://raw.githubusercontent.com/Xilinx/XRT/master/src/runtime_src/tools/scripts/xrtdeps.sh; \
+    apt-get update; \
+    bash xrtdeps.sh -docker; \
+    rm xrtdeps.sh; \
+  else \
+    echo "Skipping installation of XRT dependencies"; \
+  fi
 
-EXPOSE $JUPYTER_PORT
-EXPOSE $NETRON_PORT
+USER $UNAME
 
 ENTRYPOINT ["finn_entrypoint.sh"]
 CMD ["bash"]
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 0aea65fdd7999b56989239685f6606a8e1b2e618..1ed8875e886ea78511f1992d95be4417b3af80df 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,11 +12,11 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=f2e5f0582ef2b7cbc134168993816c337ca8d3a6
-BREVITAS_COMMIT=b75e0408d9759ed519296e3af29b9c16fb94b0b8
+FINN_BASE_COMMIT=1363981654009067790d5f2d0c3dd303b5fa05cb
+BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
-PYVERILATOR_COMMIT=06c29ecf3ba0361e3d0a75c98f6918ba67bf0e27
+HLSLIB_COMMIT=2e49322d1bbc4969ca293843bda1f3f9c05456fc
+PYVERILATOR_COMMIT=e2ff74030de3992dcac54bf1b6aad2915946e8cb
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
 
 gecho "Setting up known-good commit versions for FINN dependencies"
@@ -42,10 +42,15 @@ git -C /workspace/finn-hlslib checkout $HLSLIB_COMMIT --quiet
 gecho "PyVerilator @ $PYVERILATOR_COMMIT"
 git -C /workspace/pyverilator pull --quiet
 git -C /workspace/pyverilator checkout $PYVERILATOR_COMMIT --quiet
+pip install --user -e /workspace/pyverilator
 # oh-my-xilinx
 gecho "oh-my-xilinx @ $OMX_COMMIT"
 git -C /workspace/oh-my-xilinx pull --quiet
 git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT --quiet
+# remove old version egg-info, if any
+rm -rf $FINN_ROOT/src/FINN.egg-info
+# run pip install for finn
+pip install --user -e $FINN_ROOT
 
 if [ ! -z "$VIVADO_PATH" ];then
   # source Vivado env.vars
@@ -56,6 +61,8 @@ fi
 # download PYNQ board files if not already there
 if [ ! -d "/workspace/finn/board_files" ]; then
     gecho "Downloading PYNQ board files for Vivado"
+    OLD_PWD=$(pwd)
+    cd /workspace/finn
     wget -q https://github.com/cathalmccabe/pynq-z1_board_files/raw/master/pynq-z1.zip
     wget -q https://d2m32eurp10079.cloudfront.net/Download/pynq-z2.zip
     unzip -q pynq-z1.zip
@@ -65,21 +72,22 @@ if [ ! -d "/workspace/finn/board_files" ]; then
     mv pynq-z2/ board_files/
     rm pynq-z1.zip
     rm pynq-z2.zip
+    cd $OLD_PWD
 fi
 if [ ! -d "/workspace/finn/board_files/ultra96v1" ]; then
     gecho "Downloading Avnet BDF files into board_files"
+    OLD_PWD=$(pwd)
+    cd /workspace/finn
     git clone https://github.com/Avnet/bdf.git
     mv /workspace/finn/bdf/* /workspace/finn/board_files/
     rm -rf /workspace/finn/bdf
+    cd $OLD_PWD
 fi
 if [ ! -z "$VITIS_PATH" ];then
   # source Vitis env.vars
   export XILINX_VITIS=$VITIS_PATH
   source $VITIS_PATH/settings64.sh
   if [ ! -z "$XILINX_XRT" ];then
-    gecho "For VitisBuild, please ensure the XRT dependencies are correctly installed"
-    gecho "by downloading and running:"
-    gecho "https://raw.githubusercontent.com/Xilinx/XRT/master/src/runtime_src/tools/scripts/xrtdeps.sh"
     # source XRT
     source $XILINX_XRT/setup.sh
   fi
diff --git a/docs/ARC2018.pdf b/docs/ARC2018.pdf
deleted file mode 100644
index e0241f870e3b8e8e6bc1c2bf5c853f17b164ec51..0000000000000000000000000000000000000000
Binary files a/docs/ARC2018.pdf and /dev/null differ
diff --git a/docs/ASAP2018.pdf b/docs/ASAP2018.pdf
deleted file mode 100644
index 6144a6cd834fae4c6a00142c004c886d3aef4e7d..0000000000000000000000000000000000000000
Binary files a/docs/ASAP2018.pdf and /dev/null differ
diff --git a/docs/BigDataBelfast2018.pdf b/docs/BigDataBelfast2018.pdf
deleted file mode 100644
index f7c96e9d37c45a9303bddf7ede9a118bb0e5003e..0000000000000000000000000000000000000000
Binary files a/docs/BigDataBelfast2018.pdf and /dev/null differ
diff --git a/docs/DAMON2019_Blott_final.pdf b/docs/DAMON2019_Blott_final.pdf
deleted file mode 100644
index 08b3aa20fcb4a3f51252fcc6257fc01233a1858d..0000000000000000000000000000000000000000
Binary files a/docs/DAMON2019_Blott_final.pdf and /dev/null differ
diff --git a/docs/FPGA2018_Tutorial.pdf b/docs/FPGA2018_Tutorial.pdf
deleted file mode 100644
index 8d63ba9c1b91768b0c59b2d439ccee2b18d6bdc9..0000000000000000000000000000000000000000
Binary files a/docs/FPGA2018_Tutorial.pdf and /dev/null differ
diff --git a/docs/FutureofAI2019_Blott.pdf b/docs/FutureofAI2019_Blott.pdf
deleted file mode 100644
index 84775f3bb21cf81f1d46b189863c010ababc68b0..0000000000000000000000000000000000000000
Binary files a/docs/FutureofAI2019_Blott.pdf and /dev/null differ
diff --git a/docs/Hotchips2018_Tutorial.pdf b/docs/Hotchips2018_Tutorial.pdf
deleted file mode 100644
index 2772634d108cf8ba7d3433ac134fd9c5cea38b36..0000000000000000000000000000000000000000
Binary files a/docs/Hotchips2018_Tutorial.pdf and /dev/null differ
diff --git a/docs/IEEECluster2018.pdf b/docs/IEEECluster2018.pdf
deleted file mode 100644
index a2e2cf07a2207dfc38c89e8081917ce2c603058f..0000000000000000000000000000000000000000
Binary files a/docs/IEEECluster2018.pdf and /dev/null differ
diff --git a/docs/MPSOC2018.pdf b/docs/MPSOC2018.pdf
deleted file mode 100644
index f1837ae216b734c451283107f83afe7ff05dd54f..0000000000000000000000000000000000000000
Binary files a/docs/MPSOC2018.pdf and /dev/null differ
diff --git a/docs/RCML2018.pdf b/docs/RCML2018.pdf
deleted file mode 100644
index c8cf780d8cdf0364dc9f1d6a1ce88c29ee59f493..0000000000000000000000000000000000000000
Binary files a/docs/RCML2018.pdf and /dev/null differ
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..db1cb00970439f03b95b9439253e72d70fdfb0bc
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,10 @@
+The `finn/` subfolder contains the documentation sources. These are built with
+Sphinx, either:
+
+* online on readthedocs:
+  - [finn.readthedocs.io](https://finn.readthedocs.io) for the latest release
+  - [finn-dev.readthedocs.io](https://finn-dev.readthedocs.io) for the `dev` branch
+* manually inside the FINN Docker container with `python setup.py docs`
+
+If you're looking for content that was previously hosted on the FINN project page
+via GitHub Pages, it has moved to the [github-pages branch](https://github.com/Xilinx/finn/tree/github-pages).
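+
+For example, a manual build might look like the following sketch (the output
+location is an assumption and depends on the Sphinx/setuptools configuration):
+
+```shell
+# inside the FINN Docker container, from the repository root
+python setup.py docs
+# open the generated HTML in a browser; it typically lands under build/sphinx/html (assumption)
+```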
diff --git a/docs/_config.yml b/docs/_config.yml
deleted file mode 100644
index 474d9143165f5afc4c8b275464289a6fda53df86..0000000000000000000000000000000000000000
--- a/docs/_config.yml
+++ /dev/null
@@ -1,3 +0,0 @@
-theme: jekyll-theme-dinky
-google_analytics: UA-148815864-1
-show_downloads: false
diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html
deleted file mode 100644
index e594c6aaf0d77abb238f6244ab87c0412b7b2528..0000000000000000000000000000000000000000
--- a/docs/_layouts/default.html
+++ /dev/null
@@ -1,71 +0,0 @@
-<!doctype html>
-<html lang="{{ site.lang | default: "en-US" }}">
-  <head>
-    <meta charset="utf-8">
-    <meta http-equiv="X-UA-Compatible" content="IE=edge">
-
-{% seo %}
-    <link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
-    <script src="{{ '/assets/js/scale.fix.js' | relative_url }}"></script>
-    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
-    <!--[if lt IE 9]>
-    <script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
-    <![endif]-->
-  </head>
-  <body>
-    <div class="wrapper">
-      <header>
-        <!--<h1 class="header">{{ site.title | default: site.github.repository_name }}</h1>-->
-        <a href="https://xilinx.github.io/finn">
-          <img src="https://github.com/Xilinx/finn/raw/master/docs/img/finn-logo.png" style="border: none" width="150">
-        </a>
-        <p class="header">{{ site.description | default: site.github.project_tagline }}</p>
-
-        <ul>
-          {% if site.show_downloads %}
-            <li class="download"><a class="buttons" href="{{ site.github.zip_url }}">Download ZIP</a></li>
-            <li class="download"><a class="buttons" href="{{ site.github.tar_url }}">Download TAR</a></li>
-          {% endif %}
-          <li class="download"><a class="buttons" style="background: none" href="https://xilinx.github.io/finn/about">About</a></li>
-          <li class="download"><a class="buttons" style="background: none" href="https://xilinx.github.io/finn/blog">Blog</a></li>
-          <li class="download"><a class="buttons" style="background: none" href="https://xilinx.github.io/finn/demos">Demos</a></li>
-          <li class="download"><a class="buttons" style="background: none" href="https://finn.readthedocs.io">Documentation</a></li>
-          <li class="download"><a class="buttons" style="background: none" href="https://github.com/Xilinx/brevitas">Brevitas</a></li>
-          <li class="download"><a class="buttons" style="background: none" href="https://github.com/Xilinx/finn-hlslib">HLS Library</a></li>
-          <li class="download"><a class="buttons" style="background: none" href="https://xilinx.github.io/finn/publications">Publications</a></li>
-          <li class="download"><a class="buttons" style="background: none" href="https://xilinx.github.io/finn/community">Community</a></li>
-          <li><a class="buttons github" href="{{ site.github.repository_url }}">View On GitHub</a></li>
-        </ul>
-
-        {% if site.github.is_project_page %}
-          <p class="header">This project is maintained by <a class="header name" href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a></p>
-        {% endif %}
-
-        {% if site.github.is_user_page %}
-          <ul>
-            <li><a class="buttons github" href="{{ site.github.owner_url }}">GitHub Profile</a></li>
-          </ul>
-        {% endif %}
-      </header>
-
-      <section>
-        {{ content }}
-      </section>
-
-      <footer>
-        <p><small>Hosted on <a href="https://pages.github.com">GitHub Pages</a> using the Dinky theme</small></p>
-      </footer>
-    </div>
-    <!--[if !IE]><script>fixScale(document);</script><![endif]-->
-    {% if site.google_analytics %}
-      <script>
-        (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
-        (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
-        m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
-        })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
-        ga('create', '{{ site.google_analytics }}', 'auto');
-        ga('send', 'pageview');
-      </script>
-    {% endif %}
-  </body>
-</html>
diff --git a/docs/_layouts/post.html b/docs/_layouts/post.html
deleted file mode 100644
index fdcc889fdc75cfea789a84dd8109d067d625791c..0000000000000000000000000000000000000000
--- a/docs/_layouts/post.html
+++ /dev/null
@@ -1,7 +0,0 @@
----
-layout: default
----
-<h1>{{ page.title }}</h1>
-<p>{{ page.date | date_to_string }} - {{ page.author }}</p>
-
-{{ content }}
diff --git a/docs/_posts/2019-09-27-new-project-page-is-live.md b/docs/_posts/2019-09-27-new-project-page-is-live.md
deleted file mode 100644
index 7385a0233387329ef33d46d1fc72c85575b3e69c..0000000000000000000000000000000000000000
--- a/docs/_posts/2019-09-27-new-project-page-is-live.md
+++ /dev/null
@@ -1,7 +0,0 @@
----
-layout: post
-title:  "New project page is live!"
-author: "Yaman Umuroglu"
----
-
-Welcome to the new FINN project page, we hope you like it!
diff --git a/docs/_posts/2019-10-02-rebuilding-finn-for-open-source.md b/docs/_posts/2019-10-02-rebuilding-finn-for-open-source.md
deleted file mode 100644
index 324576ee46ecd044f75072976ae644d420e8fd59..0000000000000000000000000000000000000000
--- a/docs/_posts/2019-10-02-rebuilding-finn-for-open-source.md
+++ /dev/null
@@ -1,74 +0,0 @@
----
-layout: post
-title:  "Rebuilding FINN for open source"
-author: "Yaman Umuroglu"
----
-
-We're happy to announce some exciting developments in the FINN project: we're rebuilding our solution stack from the ground up
-to be more modular, more usable and more open-source!
-
-### A quick retrospective
-
-Over the past few years, the team at Xilinx Research Labs Ireland has done quite a bit of research of Quantized Neural Networks
-(QNNs).
-Starting with <a href="https://arxiv.org/abs/1612.07119">Binarized Neural Networks (BNNs) on FPGAs</a> back in 2016, we've since
-looked at many aspects of quantized deep learning, ranging from
-at <a href ="https://arxiv.org/abs/1807.00301">better quantization methods</a> and
-<a href="https://arxiv.org/abs/1709.06262">mixing quantization and pruning</a>,
-to <a href="https://arxiv.org/pdf/1807.10577.pdf">accuracy-throughput tradeoffs</a> and
-<a href="https://arxiv.org/pdf/1807.04093.pdf">recurrent topologies</a>.
-
-Although some <a href="https://github.com/Xilinx/BNN-PYNQ">demonstrators</a> of our work has been open source for some time,
-we want to take things a step further.
-We love QNNs and the high-performance, high-efficiency dataflow accelerators we can build for them on Xilinx FPGAs, and we want you and
-the FPGA/ML community to be able to do the same.
-The (co-)design process for making this happen is actually quite involved, starting from customizing a neural network in a machine
-learning framework, going through multiple design steps that involve many optimizations, HLS code generation and Vivado synthesis, and
-ending up with an FPGA bitstream that you can deploy as part of some application.
-Many of those steps require some manual effort, but having a modular, flexible solution stack to support you through this process is greatly
-helpful.
-This is why we are rebulding our FINN solution stack from the ground-up to make it more modular, and we hope to build a community
-around it that shares our excitement around QNNs for FPGAs.
-
-### Making FINN modular
-
-<img align="left" src="https://xilinx.github.io/finn/img/finn-stack.png" alt="drawing" style="margin-right: 20px" width="300"/>
-
-The first step towards making this happen is to define what layers exist in the solution stack.
-In many ways, this solution stack is inspired by the tested-and-tried frontend/backend software architecture found in compiler
-frameworks like <a href="http://llvm.org">LLVM</a>.
-This stack breaks down the complex co-design problem into parts, and each layer focuses on a different sub-problem, consuming
-the artifacts produced by the previous one.
-The diagram on the left illustrates this briefly, and over the next few months we hope to make a first few QNNs go through all
-the layers of this stack to produce cool FPGA dataflow accelerators.
-In fact, some of these components are already available today for you to explore!
-
-Let's have a look at the main parts:
-
-* <b>Brevitas</b> is a PyTorch library that lets you do quantization-aware training. It gives you a set of `torch.nn` building
-blocks to explore different forms of weight, activation and accumulator quantization schemes. You can also learn the bitwidths for
-different layers with backpropagation! See the <a href="https://xilinx.github.io/brevitas/">Brevitas page</a> for more information.
-* <b>Frontend</b>. Once you are happy with the accuracy of your quantized neural network in Brevitas, you'll be able to export it into a custom
-<a href="https://onnx.ai">ONNX</a> representation that FINN uses internally to represent QNNs. More details about this custom ONNX
-representation will be available in an upcoming blog post.
-* The <b>FINN Compiler</b> will then import this ONNX representation, and go through several steps of optimizations such as the
-  <a href="https://arxiv.org/pdf/1709.04060.pdf">streamlining transform</a> to make the QNN simpler.
-* The <b>FPGA dataflow backend</b> will then convert the optimized QNN into a series of streaming HLS library calls. An important
- part of the stack is the <a href="https://github.com/Xilinx/finn-hlslib">FINN HLS library</a>, which provides optimized Vivado HLS
- descriptions of several common layer types (convolutions, thresholding, pooling...) found in QNNs.
- * <b>Synthesis</b>. Once the HLS calls are generated, the next steps are to call Vivado HLS and Vivado to generate a bitstream for the target
- Xilinx FPGA. We have plans to support Vivado IPI block design code generation as well for increased agility and modularity.
- * <b>PYNQ deployment</b>. Finally, you will be able to use any of the supported <a href="http://www.pynq.io/">PYNQ</a> platforms to directly call the
- generated accelerator from Python and integrate it with other functionality. Since FINN-generated dataflow accelerators expose
- streaming interfaces, we think it will be exciting to use streaming-oriented Python frameworks such as
- <a href="https://github.com/ray-project/ray">Ray</a> to create heterogeneous, high-performance task graphs incorporating QNNs.
-
- ### Getting started
-
- More will be available in the coming weeks and months, but if you want to get your hands dirty there's already plenty to start with!
- If you haven't done so already, we recommend starting with <a href="https://github.com/Xilinx/BNN-PYNQ">BNN-PYNQ</a> to see what
- dataflow QNN accelerators look and feel like.
- You can also start experimenting with <a href="https://xilinx.github.io/brevitas/">Brevitas</a> to train some QNNs, or
- put together a streaming pipeline with the <a href="https://github.com/Xilinx/finn-hlslib">FINN HLS library</a>.
- We have also created a <a href="https://gitter.im/xilinx-finn/community">Gitter channel</a> to make it easier to get in touch with
- the community, and hope to see many of you there! :)
diff --git a/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md b/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md
deleted file mode 100644
index 319c03e14229f4866279cb09a4b70419ce2fcdc7..0000000000000000000000000000000000000000
--- a/docs/_posts/2020-02-28-finn-v02b-beta-is-released.md
+++ /dev/null
@@ -1,33 +0,0 @@
----
-layout: post
-title:  "FINN v0.2b (beta) is released"
-author: "Yaman Umuroglu"
----
-
-We've been working on the new version of the FINN compiler for a while, and today we are excited to announce our first beta release to 
-give you a taste of how things are shaping up! 
-
-Here's a quick overview of the key features:
-
-* <b>Train and export highly-quantized networks in PyTorch using Brevitas.</b> You can use <a href="https://github.com/Xilinx/brevitas">Brevitas</a>,
-  our PyTorch library for quantization-aware training to train networks with few-bit weights and activations, then export them into 
-  FINN-ONNX to be used by the FINN compiler.
-
-* <b>Fully transparent end-to-end flow.</b> We support taking quantized networks (with limitations, see bottom of post) all the way down to a 
-  customized FPGA bitstream. This happens across many steps ranging from streamlining to Vivado IPI stitching, and each step is fully 
-  visible to the user. So if you are happy with just the threshold-activation (streamlined) QNN in ONNX, or if you want to take the 
-  generated Vivado IP block and integrate it into your own IPI design, it's easy to break out of the flow at any step. 
-  We also provide a variety of mechanisms to verify the design at different steps.
-
-* <b>ONNX-based intermediate representation.</b> We use ONNX with some custom nodes and annotations as our intermediate representation. As the 
-  FINN compiler transforms the network across many steps to produce an FPGA bitstream, you can view and explore the transformed network 
-  using the excellent <a href="https://www.lutzroeder.com/ai/netron">Netron</a> viewer from the comfort of your web browser.
-
-* Tutorials and documentation. We have prepared a set of <a href="https://github.com/Xilinx/finn/tree/master/notebooks">Jupyter notebooks</a> 
-  to let you experiment with some of the things FINN can do, covering the basics, demonstrating the end-to-end flow on an example network, 
-  and discussing some of the internals for more advanced users and developers. We also have Sphinx-generated documentation on 
-  <a href="http://finn.readthedocs.io/">readthedocs</a> for more information on the FINN compiler and its API.
-
-The release (tagged 0.2b) is now available on GitHub. Currently it's a beta release and only supports fully-connected layers in linear 
-(non-branching) topologies, but we're actively working on the end-to-end convolution support for the next release. Further down the 
-road, we hope to support more advanced topologies and provide end-to-end examples for MobileNet and ResNet-50.
diff --git a/docs/_posts/2020-03-11-rn50-released.md b/docs/_posts/2020-03-11-rn50-released.md
deleted file mode 100644
index baa924410cf56a07e22a6c85450205d18a4d45bb..0000000000000000000000000000000000000000
--- a/docs/_posts/2020-03-11-rn50-released.md
+++ /dev/null
@@ -1,75 +0,0 @@
----
-layout: post
-title:  "ResNet50 for Alveo released"
-author: "Lucian Petrica"
----
-
-We're pleased to announce as part of the FINN project our release of the first fully quantized, all-dataflow ResNet50 inference accelerator for Xilinx Alveo boards. The source code is available on [GitHub](https://github.com/Xilinx/ResNet50-PYNQ) and we provide a Python [package](https://pypi.org/project/resnet50-pynq/) and Jupyter Notebook to get you started and show how the accelerator is controlled using [PYNQ](http://www.pynq.io/) for Alveo.
-Built using a custom [FINN](https://xilinx.github.io/finn/about.html) streamlining flow, which is not yet public, 
-this accelerator showcases the advantage of deep quantization for FPGA acceleration of DNN workloads in the datacenter. 
-The key performance metrics are:
-
-FPGA Device | ImageNet Accuracy     | Max FPS    | Min Latency | Power @ Max FPS | Power @ Min Latency
-----------  |----------             |----------  |----------   |----------       |----------
-Alveo U250  | 65% Top-1 / 85% Top-5 | 2000       | 2 ms      | 70 W            | 40 W
-
-In addition to demonstrating the achievable performance of low-precision dataflow acceleration on Alveo, the ResNet50 design
-serves as proof of concept for two key features of future FINN releases: 
-modular build flows based on Vivado IP Integrator, and pure Python interface to the accelerator. 
-
-## Modular build flow
-
-FINN accelerators targetting embedded parts, such as the [BNN-PYNQ](https://github.com/Xilinx/BNN-PYNQ) accelerators, have in the past implemented the
-entire acceleration functionality in a singe monolithic HLS C++ description.
-For large datacenter-class designs this approach is not feasible, as the HLS simulation and synthesis times become very large.
-
-Instead, here we identify the key computational pattern, the residual block, which we implement as a HLS C++ IP block by assembling multiple Matrix-Vector-Activation Units from the [FINN HLS Library](https://github.com/Xilinx/finn-hlslib). 
-We then construct the accelerator by instantiating and connecting multiple residual blocks together in a Vivado IPI block design, which are then synthesized in parallel and exported as a netlist IP.
-
-<img align="left" src="https://xilinx.github.io/finn/img/rn50-ipi.png" alt="drawing" style="margin-right: 20px" width="300"/>
-
-
-In our flow, this IP is linked by Vitis into an Alveo platform, but users are free to integrate the ResNet50 IP in their own Vivado-based flows and augment it with other HLS or RTL IP. See our build scripts and documentation for more information.
-
-## Pure Python host interface
-
-Using PYNQ for Alveo, users can interface directly with the ResNet50 accelerator in Python.
-To program the accelerator, an Overlay object is created from an XCLBin file produced by Vitis.
-
-```Python
-import pynq
-
-ol=pynq.Overlay("resnet50.xclbin")
-accelerator=ol.resnet50_1
-```
-
-Before using the accelerator, we must configure the weights of the fully-connected layer in DDR Bank 0.
-Assuming the weights are already loaded in the NumPy array `fcweights`, we allocate a buffer 
-of appropriate size, copy the weights into it, and flush it to the Alveo DDR Bank 0.
-
-```Python
-fcbuf = pynq.allocate((1000,2048), dtype=np.int8, target=ol.bank0)
-fcbuf[:] = fcweights
-fcbuf.sync_to_device()
-```
-
-To perform inference we first allocate input and output buffers for one image, and copy the contents of the NumPy array `img` into the input buffer.
-We then flush the input data to the Alveo DDR Bank 0, and call the accelerator providing as arguments
-the input and output buffers, the FC layer weights buffer, and the number of images to process, in this case just one.
-After the call finishes, we pull the output buffer data from the accelerator DDR to host memory and copy its 
-contents to user memory in a NumPy array.
-
-```Python
-inbuf = pynq.allocate((224,224,3), dtype=np.int8, target=ol.bank0)
-outbuf = pynq.allocate((5,), dtype=np.uint32, target=ol.bank0)
-
-inbuf[:] = img
-inbuf.sync_to_device()
-
-accelerator.call(inbuf, outbuf, fcbuf, 1)
-
-outbuf.sync_from_device()
-results = np.copy(outbuf)
-```
-
-It's that easy! See our Jupyter Notebook demo and application examples for more details.
diff --git a/docs/_posts/2020-03-27-brevitas-quartznet-release.md b/docs/_posts/2020-03-27-brevitas-quartznet-release.md
deleted file mode 100644
index 0940f754815c834662919404860b8a7b00d08e64..0000000000000000000000000000000000000000
--- a/docs/_posts/2020-03-27-brevitas-quartznet-release.md
+++ /dev/null
@@ -1,92 +0,0 @@
----
-layout: post
-title:  "Quantized QuartzNet with Brevitas for efficient speech recognition"
-author: "Giuseppe Franco"
----
-
-*Although not yet supported in FINN, we are excited to show you how Brevitas and quantized neural network training techniques can be applied to models beyond image classification.*
-
-We are pleased to announce the release of quantized pre-trained models of [QuartzNet](https://arxiv.org/abs/1904.03288) for efficient speech recognition.
-They can be found at the [following link](https://github.com/Xilinx/brevitas/tree/master/examples/speech_to_text), with a brief
-explanation on how to test them.
-The quantized version of QuartzNet has been trained using [Brevitas](https://github.com/Xilinx/brevitas), an experimental library for quantization-aware training.
-
-QuartzNet, whose structure can be seen in Fig. 1, is a convolution-based speech-to-text network, based on a similar structure as [Jasper](https://arxiv.org/abs/1904.03288).
-
-| <img src="https://xilinx.github.io/finn/img/QuartzNet.jpg" alt="QuartzNet Structure" title="QuartzNet Structure" width="450" height="500" align="center"/>|
-| :---:|
-| *Fig. 1 QuartzNet Model, [source](https://arxiv.org/abs/1910.10261)* |
-
-The starting point is the mel-spectrogram representation of the input audio file.
-Through repeated base building blocks of 1D Convolutions (1D-Conv), Batch-Normalizations (BN), and ReLU with residual connections,
-QuartzNet is able to reconstruct the underlying text.
-The main difference with respect to Jasper is the use of Depthwise and Pointwise 1D-Conv (Fig. 2a), instead of 'simple' 1D-Conv (Fig. 2b).
-Thanks to this structure, QuartzNet is able to achieve better performance in terms of Word Error Rate (WER) compared to Jasper,
-using *only* 19.9 M parameters, compared to 333M parameters of Jasper.
-
-Moreover, the authors proposed a grouped-pointwise convolution strategy that allows to greatly reduce the numbers of parameters,
-down to 8.7M, with a little degradation in accuracy.
-
-| <img src="https://xilinx.github.io/finn/img/quartzPic1.jpg" alt="QuartzNet block" title="QuartzNet block" width="130" height="220" align="center"/> | <img src="https://xilinx.github.io/finn/img/JasperVertical4.jpg" alt="Jasper block" title="Jasper block" width="130" height="220" align="center"/>|
-| :---:|:---:|
-| *Fig. 2a QuartzNet Block, [source](https://arxiv.org/abs/1910.10261)* | *Fig. 2b Jasper Block [source](https://arxiv.org/abs/1904.03288)*  |
-
-
-The authors of QuartzNet proposes different BxR configurations. Each B<sub>i</sub> block consist of the same base building block described above,
-repeated R times.
-Different BxR configurations have been trained on several different datasets (Wall Street Journal,
-LibriSpeech + Mozilla Common Voice, LibriSpeech only).
-
-For our quantization experiments, we focus on the 15x5 variant trained on LibriSpeech with spec-augmentation without grouped convolutions.
-More detail about this configuration can be found in the paper and on a [related discussion with the authors](https://github.com/NVIDIA/NeMo/issues/230).
-
-Started from the [official implementation](https://github.com/NVIDIA/NeMo/blob/master/examples/asr/quartznet.py),
-the first step was to implement a quantized version of the topology in Brevitas, using quantized convolutions and activations.
-
-After implementing the quantized version, the second step was to re-train the model, starting
-from the [pre-trained models](https://ngc.nvidia.com/catalog/models/nvidia:quartznet_15x5_ls_sp)
-kindly released by the authors.
-
-We focused on three main quantization configurations. Two configurations at 8 bit, with per-tensor and per-channel scaling,
-and one configuration at 4 bit, with per-channel scaling.
-
-We compare our results with the one achieved by the authors, not only in terms of pure WER, but also the parameter's memory footprint,
-and the number of operations performed. Note that the WER is always based on greedy decoding. The results can be seen in Fig. 3 and Fig. 4,
-and are summed up in Table 1.
-
-| Configuration | Word Error Rate (WER) | Memory Footprint (MegaByte) | Mega MACs |
-| :-----------: | :-------------------: | :-------------------------: | :-------: |
-| FP 300E, 1G   | 11.58%                | 37.69                       | 1658.54   |
-| FP 400E, 1G   | 11.08%                | 37.69                       | 1658.54   |
-| FP 1500E, 1G  | 10.78%                | 37.69                       | 1658.54   |
-| FP 300E, 2G   | 12.52%                | 24.06                       | 1058.75   |
-| FP 300E, 4G   | 13.48%                | 17.25                       |  758.86   |
-| 8 bit, 1G Per-Channel scaling| 10.98% | 18.58                       |  414.63   |
-| 8 bit, 1G Per-Tensor scaling | 11.03% | 18.58                       |  414.63   |
-| 4 bit, 1G Per-Channel scaling| 12.00% |  9.44                       |  104.18   |
-
-| <img src="https://xilinx.github.io/finn/img/WERMB.jpg" alt="WERvsMB" title="WERvsMB" width="500" height="300" align="center"/> |
-| :---:|
-| *Fig. 3 Memory footprint over WER on LibriSpeech dev-other* |
-
-| <img src="https://xilinx.github.io/finn/img/WERNops.jpg" alt="WERvsMACs" title="WERvsMACs" width="500" height="300" align="center"/> |
-| :---: |
-| *Fig. 4 Number of MACs Operations over WER on LibriSpeech dev-other*  |
-
-In evaluating the memory footprint, we consider half-precision (16 bit) Floating Point (FP) numbers for the original QuartzNet.
-As we can see on Fig. 3, the quantized implementations are able to achieve comparable accuracy compared to the corresponding floating-point verion,
-while greatly reducing the memory occupation. In the graph, the terms <em>E</em> stands for Epochs, while <em>G</em> for Groups, referring
-to the numbers of groups used for the grouped convolutions.
-In case of our 4 bit implementation, the first and last layer are left at 8 bit, but this is taken in account both in the computation
-of the memory occupation and of the number of operations.
-Notice how the 4 bit version is able to greatly reduce the memory footprint of the network compared to the grouped convolution variants, while still granting better accuracy.
-
-
-For comparing accuracy against the number of multiply-accumulate (MAC), we consider 16 bit floating-point multiplications as 16 bit integer multiplications.
-This means that we are greatly underestimating the complexity of operations performed in the original floating-point QuartzNet model.
-Assuming a n^2 growth in the cost of integer multiplication, we consider a 4 bit MAC 16x less expensive than a 16 bit one.
-The number of MACs in the Fig. 2b is normalized with respect to 16 bit.
-Also in this case, it is clear to see that the quantized versions are able to greatly reduce the amount of operations required,
-with little-to-none degradation in accuracy. In particular, the 8 bit versions are already able to have a better WER and lower amount
-of MACs compared to the grouped convolutions, and this is confirmed also by the 4 bit version, with a little degradation in terms of
-WER.
diff --git a/docs/_posts/2020-05-08-finn-v03b-beta-is-released.md b/docs/_posts/2020-05-08-finn-v03b-beta-is-released.md
deleted file mode 100644
index 8119b7338a7e14e5b9093a4a177f43e68267a911..0000000000000000000000000000000000000000
--- a/docs/_posts/2020-05-08-finn-v03b-beta-is-released.md
+++ /dev/null
@@ -1,46 +0,0 @@
----
-layout: post
-title:  "FINN v0.3b (beta) is released"
-author: "Yaman Umuroglu"
----
-
-We're happy to announce the v0.3b (beta) release of the FINN compiler.
-The full changelog is quite large as we've been working on a lot of exciting
-new features, but here is a summary:
-
-<img src="https://xilinx.github.io/finn/img/cnv-mp-fc.png" width="800" align="center"/>
-
-
-**Initial support for ConvNets and end-to-end notebook example.** The
-preliminary support for convolutions is now in place. Head over to the new
-<a href="https://github.com/Xilinx/finn/blob/staging/v0.3b/notebooks/end2end_example/cnv_end2end_example.ipynb">
-end-to-end notebook</a> to try out the end-to-end flow for convolutions
-and build the demonstrator for a simple binarized CNN on CIFAR-10.
-
-<img src="https://xilinx.github.io/finn/img/parallel-speedup.png" width="500" align="center"/>
-
-**Parallel transformations.** When working with larger designs, HLS synthesis
-and simulation compile times can be quite long. Thanks to a contribution by
-@HenniOVP we now support multi-process parallelization several FINN transformations.
-You can read more about those <a href="https://github.com/Xilinx/finn/blob/staging/v0.3b/notebooks/advanced/1_custom_transformation_pass.ipynb">here</a>.
-
-<img src="https://xilinx.github.io/finn/finn/img/mem_mode.png" width="600" align="center"/>
-
-**Decoupled memory mode for MVAUs.** To have more control over how the weight
-memories are implemented, you can now specify the `mem_mode` and `ram_style`
-attributes when instantiating compute engines. Read more <a href="https://finn.readthedocs.io/en/latest/internals.html#streamingfclayer-mem-mode">here.</a>
-
-**Throughput testing and optimizations.** To do a quick assessment of the
-customized accelerators you build, we now support a throughput test mode that
-lets you benchmark the accelerator with a configurable number of samples.
-To get better utilization from the heterogeneous streaming architectures FINN
-builds, we have also introduced a FIFO insertion transformation.
-You can see these in action in the updated <a href="https://github.com/Xilinx/finn/blob/staging/v0.3b/notebooks/end2end_example/tfc_end2end_example.ipynb">
-TFC-w1a1 end2end notebook.</a>
-
-We have a slew of other smaller features, bugfixes and various other improvements.
-The release (tagged 0.3b) is now available on GitHub.
-We're continuously working to improve FINN in terms of layer, network and
-infrastructure.
-If you'd like to help out, please check out the <a href="https://github.com/Xilinx/finn/blob/staging/v0.3b/CONTRIBUTING.md">contribution guidelines</a> and
-share your ideas on the <a href="https://gitter.im/xilinx-finn/community">FINN Gitter channel</a>!
diff --git a/docs/_posts/2020-09-21-finn-v04b-beta-is-released.md b/docs/_posts/2020-09-21-finn-v04b-beta-is-released.md
deleted file mode 100644
index b6eb7003fa34840c1d018fc92d4bab0cbc7db623..0000000000000000000000000000000000000000
--- a/docs/_posts/2020-09-21-finn-v04b-beta-is-released.md
+++ /dev/null
@@ -1,56 +0,0 @@
----
-layout: post
-title:  "FINN v0.4b (beta) is released"
-author: "Yaman Umuroglu"
----
-
-Version v0.4b (beta) of the FINN compiler is now available. As with the previous
-release there's a whole lot of new features and bugfixes that have gone in,
-but here are some highlights:
-
-<img src="https://mma.prnewswire.com/media/752936/ALVEO_PRESS.jpg" width="300" align="center"/>
-
-**Build support for Alveo/Vitis + more Zynq variants.** We now have a
-`VitisBuild` transformation to provide a FINN flow that goes all the way to
-bitfiles targeting Xilinx Alveo platforms. This transformation takes care of
-FIFO, datawidth converter and DMA engine insertion so you can simply give it a
-FINN model with HLS layers and let it run.
-Similarly, we've simplified the Zynq build flow with `ZynqBuild` to provide a
-similar experience, which should now be able to support most Zynq and Zynq
-UltraScale+ platforms.
-You can read more about the new hardware build transformations
-<a href="https://finn.readthedocs.io/en/latest/hw_build.html">here</a>.
-
-<img src="https://xilinx.github.io/finn/img/finn-dashboard.png" width="450" align="center"/>
-
-**Fully-accelerated end-to-end examples + dashboard.** The FINN end-to-end example networks
-are now fully accelerated on the FPGA, allowing raw images to be directly fed in
-and top-1 indices to be retrieved.
-We now also have a <a href="https://bit.ly/finn-end2end-dashboard">dashboard</a>
-which gets automatically updated with the latest build results from end-to-end
-examples, including FPGA resources and performance.
-This also enables running full-performance accuracy validation on hardware,
-which is now incorporated into the <a href="https://github.com/Xilinx/finn/blob/master/notebooks/end2end_example/tfc_end2end_example.ipynb#validation">end-to-end notebooks</a>.
-
-<img src="https://xilinx.github.io/finn/img/finn-brevitas-debug.png" width="450" align="center"/>
-
-**Brevitas-FINN co-debug support.** We can now export graphs from Brevitas with special DebugMarker nodes (like above) and PyTorch forward hooks to compare intermediate activations between the Brevitas version and FINN-ONNX exported version. This is handy for debugging especially larger networks when they don't export correctly. <a href="https://github.com/Xilinx/finn/blob/dev/tests/brevitas/test_brevitas_debug.py">Here</a> is an example of how to use this.
-
-<img src="https://xilinx.github.io/finn/img/accumulator-minimization.png" width="450" align="center"/>
-
-**Accumulator minimization.** When converting to HLS layers, FINN will now automatically try to pick a minimal bitwidth for each accumulator, based on the precision and size of the dot product it accumulates over. While prior accumulators were at a fixed bitwidth like
-32-bits, the new approach can significantly save on resources by picking e.g. 10-bit accumulators (as per above) where possible. We've also expanded the range of DataTypes available in FINN to cover everything between 1-32 bits to provide more flexibility.
-
-<img src="https://xilinx.github.io/finn/img/finn-cycle-estimate.png" width="450" align="center"/>
-
-**New layers and cycle estimation.** We've been working on supporting more of the finn-hlslib layers in FINN and
-<a href="https://github.com/Xilinx/finn/tree/dev/src/finn/custom_op/fpgadataflow">the list</a>  has expanded significantly.
-Many of these layers (and their accompanying conversion transformations) will be utilized for new FINN end-to-end example networks,
-like MobileNet-v1, ResNet-50 and a QuartzNet, over the course of the next few releases. These layers also support <a href="https://github.com/Xilinx/finn/blob/dev/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py">clock cycle estimation</a>
-based on workload and parallelization parameters, allowing the user to estimate performance without having to go to synthesis.
-
-The release (tagged 0.4b) is now available on GitHub.
-We're continuously working to improve FINN in terms of layer, network and
-infrastructure.
-If you'd like to help out, please check out the <a href="https://github.com/Xilinx/finn/blob/master/CONTRIBUTING.md">contribution guidelines</a> and
-share your ideas on the <a href="https://gitter.im/xilinx-finn/community">FINN Gitter channel</a>!
diff --git a/docs/about.md b/docs/about.md
deleted file mode 100644
index 8a9029e22b66e330c815c20b902ac235887aa661..0000000000000000000000000000000000000000
--- a/docs/about.md
+++ /dev/null
@@ -1,39 +0,0 @@
-## What is FINN?
-
-<img align="left" src="img/finn-example.png" alt="drawing" width="300"/>
-
-FINN is an
-experimental framework from Xilinx Research Labs to explore deep neural network
-inference on FPGAs.
-It specifically targets <a href="https://github.com/maltanar/qnn-inference-examples" target="_blank">quantized neural
-networks</a>, with emphasis on
-generating dataflow-style architectures customized for each network.
-It is not
-intended to be a generic DNN accelerator like xDNN, but rather a tool for
-exploring the design space of DNN inference accelerators on FPGAs.
-<br>
-## Features
-
-* **Templated Vivado HLS library of streaming components:** FINN comes with an
-HLS hardware library that implements convolutional, fully-connected, pooling and
-LSTM layer types as streaming components. The library uses C++ templates to
-support a wide range of precisions.
-* **Ultra low-latency and high performance
-with dataflow:** By composing streaming components for each layer, FINN can
-generate accelerators that can classify images at sub-microsecond latency.
-* **Many end-to-end example designs:** We provide examples that start from training a
-quantized neural network, all the way down to an accelerated design running on
-hardware. The examples span a range of datasets and network topologies.
-* **Toolflow for rapid design generation:** The FINN toolflow supports allocating
-separate compute resources per layer, either automatically or manually, and
-generating the full design for synthesis. This enables rapid exploration of the
-design space.
-
-## Who are we?
-
-The FINN team is part of Xilinx's CTO group under Ivo Bolsens (CTO) and Kees Vissers (Fellow) and working very closely with the Pynq team and Kristof Denolf and Jack Lo for integration with video processing.
-
-<img src="img/finn-team.jpg" alt="The FINN Team" width="400"/>
-
-From left to right: Lucian Petrica, Giulio Gambardella,
-Alessandro Pappalardo, Ken O’Brien, Michaela Blott, Nick Fraser, Yaman Umuroglu
diff --git a/docs/blog.md b/docs/blog.md
deleted file mode 100644
index b75366b30945d691d65b0d62ab4330b0b9b66e04..0000000000000000000000000000000000000000
--- a/docs/blog.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Blog
-
-<ul>
-  {% for post in site.posts %}
-    <li>
-      <a href="https://xilinx.github.io/finn/{{ post.url }}">{{ post.title }}</a>
-    </li>
-  {% endfor %}
-</ul>
diff --git a/docs/community.md b/docs/community.md
deleted file mode 100644
index 46f0f7b50d514b04e5e02ec6a4e95f5511c84f39..0000000000000000000000000000000000000000
--- a/docs/community.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Community
-
-We have a Gitter channel for all things FINN-related here:
-[![Gitter](https://badges.gitter.im/xilinx-finn/community.svg)](https://gitter.im/xilinx-finn/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
-
-
-### External Collaborators
-* NTNU, Norway: Magnus Jahre, Magnus Sjalander
-* University of Sydney, Australia: Julian Faraone, Philip Leong
-* ETH Zurich, Switzerland: Kaan Kara, Ce Zhang, Lois Orosa, Onur Mutlu
-* University of Kaiserslautern, Germany: Vladimir Rybalkin, Mohsin Ghaffar, Nobert Wehn
-* Imperial College, UK: Alex (Jiang) Su and Peter Cheung
-* Northeastern University, USA: Miriam Leeser
-* Trinity College Dublin, Ireland: Linda Doyle
-* Missing Link Electronics, Germany
diff --git a/docs/demos.md b/docs/demos.md
deleted file mode 100644
index eef4b91d7a7e8c1630a84c330fd403c30a18383a..0000000000000000000000000000000000000000
--- a/docs/demos.md
+++ /dev/null
@@ -1,56 +0,0 @@
-## Neural Network Demos
-
-Multiple Jupyter notebooks examples are provided, with different datasets and two architectures:
-
-* **Feed-forward Dataflow**: all layers of the network are
-implemented in the hardware, the output of one layer is the input of the
-following one that starts processing as soon as data is available. The network
-parameters for all layers are cached in the on-chip memory. For each network
-topology, a customized hardware implementation is generated that provides low
-latency and high throughput.
-
-* **Dataflow with loopback**: a fixed hardware
-architecture is implemented, being able to compute multiple layers in a single
-call. The complete network is executed in multiple calls, which are scheduled on
-the same hardware architecture. Changing the network topology implies changing
-the runtime scheduling, but not the hardware architecture. This provides a
-flexible implementation but features slightly higher latency.
-
-Our design
-examples are mostly for the <a href="http://www.pynq.io/" target="_blank">PYNQ</a> Z1 and Z2 boards, and a
-few for the Ultra96. Future support for AWS F1 and other Xilinx platforms is
-also planned.
-
-### Demos with Dataflow Architecture
-
-| Thumbnail | Dataset | Neural Network | Task | Link |
-|-----------|---------|-------------|------|--------|
-|<img src="img/cifar-10.png" alt="drawing" width="200"/>|<a href="https://www.cs.toronto.edu/~kriz/cifar.html" target="_blank">CIFAR-10</a>|6 convolutional, 3 max pool and 3 fully connected layers|Image classification (animals and vehicles)|<a href="https://github.com/Xilinx/BNN-PYNQ/blob/master/notebooks/CNV-BNN_Cifar10.ipynb" target="_blank">Cifar10</a>|
-|<img src="img/svhn.png" alt="drawing" width="200"/>|<a href="http://ufldl.stanford.edu/housenumbers/" target="_blank">Street View House Numbers</a>|6 convolutional, 3 max pool and 3 fully connected layers|Image classification (house numbers)|<a href="https://github.com/Xilinx/BNN-PYNQ/blob/master/notebooks/CNV-BNN_SVHN.ipynb" target="_blank">SVHN</a>|
-|<img src="img/gtsrb.png" alt="drawing" width="200"/>|<a href="http://benchmark.ini.rub.de/?section=gtsdb&subsection=dataset" target="_blank">German Road Signs</a>|6 convolutional, 3 max pool and 3 fully connected layers|Image classification (road signs)|<a href="https://github.com/Xilinx/BNN-PYNQ/blob/master/notebooks/CNV-BNN_Road-Signs.ipynb" target="_blank">GTRSB</a>|
-|<img src="img/mnist.jpg" alt="drawing" width="200"/>|<a href="http://yann.lecun.com/exdb/mnist/" target="_blank">MNIST</a>|3 fully connected layers|Image classification (handwritten digits)|<a href="https://github.com/Xilinx/BNN-PYNQ/blob/master/notebooks/LFC-BNN_MNIST_Webcam.ipynb" target="_blank">MNIST</a>|
-|<img src="img/fraktur.png" alt="drawing" width="200"/>|Fraktur|Bi-LSTM|Optical Character Recognition|<a href="https://github.com/Xilinx/LSTM-PYNQ/blob/master/notebooks/Fraktur_OCR.ipynb" target="_blank">Fraktur</a>|
-
-### Demos with Loopback Architecture
-
-* <a href="https://github.com/Xilinx/QNN-MO-PYNQ/blob/master/notebooks/dorefanet-classification.ipynb" target="_blank">ImageNet Classification</a>: shows an example
-on how to classify a non-labelled image (e.g., downloaded from the web, your
-phone etc) in one of the 1000 classes available on the <a href="http://image-
-net.org/challenges/LSVRC/2014/browse-synsets" target="_blank"> ImageNet </a>
-dataset.
-
-* <a href="https://github.com/Xilinx/QNN-MO-PYNQ/blob/master/notebooks/dorefanet-imagenet-samples.ipynb" target="_blank">ImageNet - Dataset validation</a>: shows an example classifying labelled image (i.e.,  extracted
-from the dataset) in one of the 1000 classes available on the <a href="http
-://image-net.org/challenges/LSVRC/2014/browse-synsets" target="_blank"> ImageNet
-</a> dataset.
-
-* <a href="https://github.com/Xilinx/QNN-MO-PYNQ/blob/master/notebooks/dorefanet-imagenet-loop.ipynb" target="_blank">ImageNet - Dataset validation in a loop</a>: shows an example classifying labelled image
-(i.e.,  extracted from the dataset) in one of the 1000 classes available on the
-<a href="http://image-net.org/challenges/LSVRC/2014/browse-synsets"
-target="_blank"> ImageNet </a> dataset in a loop.
-
-* <a href="https://github.com/Xilinx/QNN-MO-PYNQ/blob/master/notebooks/tiny-yolo-image.ipynb" target="_blank">Object Detection - from image</a>: shows object detection in a image
-(e.g., downloaded from the web, your phone etc), being able to identify objects
-in a scene and drawing bounding boxes around them. The objects can be one of the
-20 available in the  <a href="http://host.robots.ox.ac.uk/pascal/VOC/"
-target="_blank"> PASCAL VOC </a> dataset
diff --git a/docs/example-networks.md b/docs/example-networks.md
deleted file mode 100644
index 060836ff95d89a752cc70426674cd429e74fdd06..0000000000000000000000000000000000000000
--- a/docs/example-networks.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Status for FINN example networks
-
-This page has moved to:
-
-https://finn-dev.readthedocs.io/en/latest/example_networks.html
diff --git a/docs/finn-sheduling-and-folding.pptx b/docs/finn-sheduling-and-folding.pptx
deleted file mode 100644
index 30bbe4d55b1cda9df25a791227983dc7cb750e58..0000000000000000000000000000000000000000
Binary files a/docs/finn-sheduling-and-folding.pptx and /dev/null differ
diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst
index 83684ae092609ef0f83a5525508febf4676b2d7a..65f6ab6b3053d9f11239b3c048143b3d2f346808 100644
--- a/docs/finn/brevitas_export.rst
+++ b/docs/finn/brevitas_export.rst
@@ -4,9 +4,7 @@
 Brevitas Export
 ***************
 
-.. note:: **This website is currently under construction.**
-
-.. image:: /img/brevitas-export.png
+.. image:: img/brevitas-export.png
    :scale: 70%
    :align: center
 
diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ccb891a0ab42eebdd85f10c14384aaa217e8ed8b
--- /dev/null
+++ b/docs/finn/command_line.rst
@@ -0,0 +1,205 @@
+.. _command_line:
+
+*******************
+Command Line Entry
+*******************
+
+Although FINN is primarily *compiler infrastructure* that provides the capabilities
+researchers can use to explore custom QNN inference, we also provide
+two command line entry points for productivity and ease-of-use:
+
+* *Simple dataflow build mode:* Best-effort dataflow build driven by a JSON build config file to convert your ONNX model.
+* *Advanced build mode:* Provide your own build script with full flexibility.
+
+.. note:: **When setting up builds using either build mode, you should keep all required data (model, config files etc.) inside the build folder and not use symlinks.**
+
+.. warning::
+  If you are using a neural network with a topology that is substantially
+  different to the FINN end-to-end examples, the simple dataflow build mode below
+  is likely to fail. For those cases, we recommend making a copy of the end-to-end
+  Jupyter notebook as a starting point, visualizing the model at intermediate
+  steps and adding calls to new transformations as needed.
+  Once you have a working flow, you can implement a command line entry for this
+  by using the "advanced mode" described here.
+
+
+Simple dataflow build mode
+--------------------------
+
+This mode is intended for simpler networks whose topologies resemble the
+FINN end-to-end examples.
+It runs a fixed build flow spanning tidy-up, streamlining, HLS conversion
+and hardware synthesis.
+It can be configured to produce different outputs, including stitched IP for
+integration in Vivado IPI as well as bitfiles.
+
+To use it, first create a folder with the necessary configuration and model files:
+
+1. Create a new folder for the dataflow build. It's best to keep this folder
+   outside the FINN repo folder for cleaner separation. Let's call this folder
+   ``dataflow_build_dir``.
+2. Put your ONNX model to be converted under ``dataflow_build_dir/model.onnx``.
+   The filename is important and must be exactly ``model.onnx``.
+3. Create a JSON file with the build configuration. It must be named ``dataflow_build_dir/dataflow_build_config.json``.
+   Read more about the build configuration options in :py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`.
+   You can find an example .json file under ``src/finn/qnn-data/build_dataflow/dataflow_build_config.json``,
+   and an illustrative configuration is sketched below.
+4. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``.
+   You can find an example .json file under ``src/finn/qnn-data/build_dataflow/folding_config.json``.
+   Instead of specifying the folding configuration, you can use the ``target_fps`` option in the build configuration
+   to control the degree of parallelization for your network.
+
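+As an illustration (the field names below are assumed to mirror
+:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`; consult the
+example .json file mentioned above for the authoritative set of options), a
+minimal build configuration could be generated from Python like this:
+
+.. code-block:: python
+
+  # illustrative sketch only -- see the shipped example .json for a real configuration
+  import json
+
+  cfg = {
+      "output_dir": "output_tfc_w1a1_Pynq-Z1",
+      "synth_clk_period_ns": 10.0,
+      "board": "Pynq-Z1",
+      "target_fps": 100000,
+      "generate_outputs": ["estimate_reports"],
+  }
+  with open("dataflow_build_config.json", "w") as f:
+      json.dump(cfg, f, indent=2)
+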
+Now you can invoke the simple dataflow build as follows:
+
+::
+
+  ./run-docker.sh build_dataflow <path/to/dataflow_build_dir/>
+
+Depending on the chosen output products, the dataflow build will run for a while
+as it goes through numerous steps:
+
+.. code-block:: none
+
+  Building dataflow accelerator from /home/maltanar/sandbox/build_dataflow/model.onnx
+  Outputs will be generated at output_tfc_w1a1_Pynq-Z1
+  Build log is at output_tfc_w1a1_Pynq-Z1/build_dataflow.log
+  Running step: step_tidy_up [1/16]
+  Running step: step_streamline [2/16]
+  Running step: step_convert_to_hls [3/16]
+  Running step: step_create_dataflow_partition [4/16]
+  Running step: step_target_fps_parallelization [5/16]
+  Running step: step_apply_folding_config [6/16]
+  Running step: step_generate_estimate_reports [7/16]
+  Running step: step_hls_codegen [8/16]
+  Running step: step_hls_ipgen [9/16]
+  Running step: step_set_fifo_depths [10/16]
+  Running step: step_create_stitched_ip [11/16]
+  Running step: step_measure_rtlsim_performance [12/16]
+  Running step: step_make_pynq_driver [13/16]
+  Running step: step_out_of_context_synthesis [14/16]
+  Running step: step_synthesize_bitfile [15/16]
+  Running step: step_deployment_package [16/16]
+
+
+You can read a brief description of what each step does in
+:py:mod:`finn.builder.build_dataflow_steps`. Note that a step whose output
+products are not enabled will still run, but will do nothing.
+
+
+Generated outputs
+-----------------
+
+.. note:: **All reports mentioned below are Python dictionaries exported as JSON.**
+
+You will find the generated outputs under the subfolder you specified in the
+build configuration, which can include a variety of folders and files
+depending on the chosen output products.
+
+The following outputs will be generated regardless of which particular outputs are selected:
+
+* ``build_dataflow.log`` is the build logfile that will contain any warnings/errors
+* ``time_per_step.json`` will report the time (in seconds) each build step took
+* ``final_hw_config.json`` will contain the final (after parallelization, FIFO sizing etc) hardware configuration for the build
+* ``intermediate_models/`` will contain the ONNX file(s) produced after each build step
+
+
+The other output products are controlled by the ``generate_outputs`` field in the
+build configuration, and are detailed below.
+
+* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.ESTIMATE_REPORTS` produces a variety of reports to estimate resource usage and performance *without* running any synthesis. This can be useful for setting up the parallelization and other hardware configuration:
+
+  * ``report/estimate_layer_cycles.json`` -- cycles per layer estimation from analytical model
+  * ``report/estimate_layer_resources.json`` -- resources per layer estimation from analytical model
+  * ``report/estimate_layer_config_alternatives.json`` -- resources per layer estimation from analytical model, including what other config alternatives would have yielded
+  * ``report/estimate_network_performance.json`` -- whole-network performance estimation from analytical model
+  * ``report/op_and_param_counts.json`` -- per-layer and total number of operations and parameters (independent of parallelization)
+
+* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.STITCHED_IP`: produces a stitched Vivado IP block design that can be integrated with other FPGA designs in Vivado IPI:
+
+  * ``stitched_ip/finn_vivado_stitch_proj.xpr`` -- Vivado project (including Vivado IP Integrator block design) to generate the stitched IP
+  * ``stitched_ip/ip`` -- exported Vivado IP for the stitched design
+
+* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.RTLSIM_PERFORMANCE`: measures latency and performance for the stitched IP in RTL simulation, using PyVerilator:
+
+  * ``report/rtlsim_performance.json`` -- accelerator throughput and latency from RTL simulation
+
+* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.OOC_SYNTH` runs out-of-context synthesis for the stitched IP. This is useful for getting post-synthesis resource counts and achievable clock frequency without having to produce a full bitfile with DMA engines:
+
+  * ``report/ooc_synth_and_timing.json`` -- resources and achievable clock frequency from out-of-context synthesis
+
+* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.BITFILE` will run Vivado and/or Vitis to insert the FINN accelerator inside a shell, with DMA engines instantiated to move data to/from main memory:
+
+  * ``bitfile/finn-accel.(bit|xclbin)`` -- generated bitfile depending on platform
+  * ``report/post_synth_resources.xml`` -- FPGA resource utilization after synthesis
+  * ``report/post_route_timing.rpt`` -- post-route timing report
+
+
+* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.PYNQ_DRIVER` will generate a PYNQ Python driver that can be used to interface with the generated accelerator:
+
+  * ``driver/driver.py`` -- Python driver that can be used on PYNQ on Zynq or Alveo platforms to launch the accelerator
+
+* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.DEPLOYMENT_PACKAGE`:
+
+  * ``deploy/`` -- deployment package folder with a bitfile and driver, ready to be copied to target hardware platform
+
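+Since all reports are plain JSON, they can be inspected programmatically once a
+build has finished. A minimal sketch, assuming the default report locations
+listed above (the keys inside each report may differ between FINN versions):
+
+.. code-block:: python
+
+  # illustrative only -- print the contents of one of the generated JSON reports
+  import json
+
+  with open("output_tfc_w1a1_Pynq-Z1/report/estimate_network_performance.json") as f:
+      perf = json.load(f)
+  for key, value in perf.items():
+      print(key, ":", value)
+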
+Verification of intermediate steps
+----------------------------------
+
+FINN dataflow builds go through many steps before the bitfile is generated,
+and the flow may produce erroneous models due to bugs or unsupported features.
+When running new models through this process it's a good idea to enable the
+verification features of the dataflow build. In this way, FINN will run the
+input you provide through the intermediate models, produce some output
+and compare it against the expected output that you provide.
+
+This is achieved by setting up the following members of the build configuration:
+
+* Set ``verify_steps`` to be a list of :py:mod:`finn.builder.build_dataflow.VerificationStepType`
+  where each element in the list indicates the output of a particular step
+  that will be verified. See the documentation of the ``VerificationStepType``
+  for more information.
+* Set ``verify_input_npy`` to the .npy filename to use as the test input to the
+  verification process. We recommend using a single input example as the
+  verification execution time can be lengthy for rtlsim, especially for larger
+  networks. The shape of the numpy array must match the shape expected by
+  the model.
+* Set ``verify_expected_output_npy`` to the .npy filename to use as the "golden"
+  output that the generated outputs will be compared against. The shape of the
+  numpy array must match the produced output shape of the model. A sketch for
+  preparing these files is given below.
+
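+A minimal sketch for preparing these files with NumPy (the shapes below are
+placeholders and must be replaced with the shapes of your own model):
+
+.. code-block:: python
+
+  # illustrative only -- save a single test input and its expected ("golden") output
+  import numpy as np
+
+  test_input = np.zeros((1, 784), dtype=np.float32)      # shape expected by the model
+  expected_output = np.zeros((1, 10), dtype=np.float32)  # shape produced by the model
+  np.save("input.npy", test_input)
+  np.save("expected_output.npy", expected_output)
+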
+The output of the verification is twofold:
+
+* A message like ``Verification for folded_hls_cppsim : SUCCESS`` will appear in
+  the build logfile.
+* The output generated by the model at each verified step will be saved as a
+  .npy file under ``verification_output/``, where each filename indicates
+  the verification step and the result of the verification (FAIL/SUCCESS).
+
+Advanced mode
+--------------
+
+In other cases, you may want to have more control over the build process to
+implement your own FINN flow with a different combination of compilation steps,
+applying preprocessing to the model, calling custom transformations and so on.
+This is possible by using the ``build_custom`` entry as follows:
+
+1. Create a new folder for the custom build. It's best to keep this folder
+outside the FINN repo folder for cleaner separation. Let's call this folder
+``custom_build_dir``.
+
+2. Create a ``custom_build_dir/build.py`` file that will perform the build when
+executed. You should also put any ONNX model(s) or other Python modules you
+may want to include in your build flow in this folder (so that they get mounted
+into the Docker container while building). Besides the filename and data placement,
+you have complete freedom in how you implement the build flow here, including
+calling the steps from the simple dataflow build mode above,
+making calls to FINN library functions, preprocessing and altering models, building several variants etc.
+You can find a basic example of build.py under ``src/finn/qnn-data/build_dataflow/build.py``.
+
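+As a rough sketch of what such a ``build.py`` could look like (the configuration
+fields and entry-point function below are assumed from the builder API referenced
+earlier; the shipped example under ``src/finn/qnn-data/build_dataflow/build.py``
+is the authoritative reference):
+
+.. code-block:: python
+
+  # illustrative build.py sketch, not a drop-in script
+  import finn.builder.build_dataflow as build
+  import finn.builder.build_dataflow_config as build_cfg
+
+  cfg = build_cfg.DataflowBuildConfig(
+      output_dir="output_my_network",
+      synth_clk_period_ns=10.0,
+      board="Pynq-Z1",
+      target_fps=100000,
+      generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
+  )
+  build.build_dataflow_cfg("model.onnx", cfg)
+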
+You can launch the custom build flow using:
+
+::
+
+ ./run-docker.sh build_custom <path/to/custom_build_dir/>
+
+This will mount the specified folder into the FINN Docker container and launch
+your ``build.py``.
diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6e7fa0d920a943e468fd70464b050ab74cf8ec7d
--- /dev/null
+++ b/docs/finn/developers.rst
@@ -0,0 +1,195 @@
+***********************
+Developer documentation
+***********************
+
+.. note:: **This page is under construction.**
+
+This page is intended to serve as a starting point for new FINN developers.
+Power users may also find this information useful.
+
+Getting started
+================
+
+Before starting to do development on FINN it's a good idea to first
+understand the basics as a user. Going through all of the
+:ref:`tutorials` is strongly recommended if you haven't already done so.
+Additionally, please review the documentation available on :ref:`internals`.
+
+Repository structure
+=====================
+
+.. image:: img/repo-structure.png
+   :scale: 70%
+   :align: center
+
+The figure above gives a description of the repositories used by the
+FINN project, and how they are interrelated.
+
+Branching model
+===============
+
+All of the FINN repositories mentioned above use a variant of the
+GitHub flow from https://guides.github.com/introduction/flow as
+further detailed below:
+
+* The `master` or `main` branch contains the latest released
+  version, with a version tag.
+
+* The `dev` branch is where new feature branches get merged after
+  testing. `dev` is "almost ready to release" at any time, and is
+  tested with nightly Jenkins builds -- including all unit tests
+  and end-to-end tests.
+
+* New features or fixes are developed in branches that split from
+  `dev` and are named similarly to `feature/name_of_feature`.
+  Single-commit fixes may be made without feature branches.
+
+* New features must come with unit tests and docstrings. If
+  applicable, they must also be tested as part of an end-to-end flow,
+  preferably with a new standalone test. Make sure the existing
+  test suite (including end-to-end tests) still passes.
+  When in doubt, consult with the FINN maintainers.
+
+* When a new feature is ready, a pull request (PR) can be opened
+  targeting the `dev` branch, with a brief description of what the
+  PR changes or introduces.
+
+* Larger features must be broken down into several smaller PRs. If
+  your PRs have dependencies on each other, please state in which order
+  they should be reviewed and merged.
+
+Docker images
+===============
+
+If you want to add new dependencies (packages, repos) to FINN it's
+important to understand how we handle this in Docker.
+There are currently two Docker images used in FINN:
+
+* The finn.dev image, used for deploying and developing the FINN compiler. Details are described below.
+* The finn.ci image, which is used for continuous integration testing. It is almost identical to the finn.dev image; the key differences are that there is no user setup and fewer packages are installed (e.g. no Jupyter).
+
+The finn.dev image is built and launched as follows:
+
+1. run-docker.sh launches the build of the Docker image with `docker build`
+
+2. Docker image is built from docker/Dockerfile.finn_dev using the following steps:
+
+  * Base: PyTorch dev image
+  * Set up apt dependencies: apt-get installs a few required packages (e.g. for verilator).
+  * Set up pip dependencies: Python packages FINN depends on are listed in requirements.txt, which is copied into the container and pip-installed. Some additional packages (such as Jupyter and Netron) are also installed.
+  * Do user setup: Switch to the same user running the container to avoid running as root.
+  * Clone dependency repos: These include Brevitas, finn-hlslib, finn-base, pyverilator and oh-my-xilinx. The correct commit version will be checked out by the entrypoint script.
+  * Install XRT deps, if needed: For Vitis builds we need to install the extra dependencies for XRT. This is only triggered if the image is built with the INSTALL_XRT_DEPS=1 argument.
+
+3. Once the Docker image is ready, run-docker.sh can launch a container from this image with `docker run`. It sets up certain environment variables and volume mounts:
+
+  * Vivado/Vitis is mounted from the host into the container (on the same path).
+  * The finn root folder is mounted under /workspace/finn. This allows modifying the source code on the host and testing inside the container.
+  * The build folder is mounted under /tmp/finn_dev_username (can be overridden by defining FINN_HOST_BUILD_DIR). This will be used for generated files. Mounting on the host allows easy examination of the generated files, and keeping the generated files after the container exits.
+  * Various environment variables are set up for use inside the container. See the run-docker.sh script for a complete list.
+
+4. The entrypoint script (docker/finn_entrypoint.sh) performs the following upon launching the container:
+
+  * Update and checkout the dependency repos at specified commits.
+  * Source Vivado settings64.sh from the specified path to make vivado and vivado_hls available.
+  * Download PYNQ board files into the finn root directory, unless they already exist.
+  * Source Vitis settings64.sh if Vitis is mounted.
+
+5. Depending on the arguments to run-docker.sh a different application is launched. run-docker.sh notebook launches a Jupyter server for the tutorials, whereas run-docker.sh build_custom and run-docker.sh build_dataflow trigger a dataflow build (see documentation). Running without arguments yields an interactive shell. See run-docker.sh for other options.
+
+Linting
+=======
+
+We use a pre-commit hook to auto-format Python code and check for issues.
+See https://pre-commit.com/ for installation. Once you have pre-commit, you can install
+the hooks into your local clone of the FINN repo.
+It's recommended to do this **on the host** and not inside the Docker container:
+
+::
+
+  pre-commit install
+
+
+Every time you commit some code, the pre-commit hooks will first run, performing various
+checks and fixes. In some cases pre-commit won't be able to fix the issues and
+you may have to fix them manually, then run `git commit` once again.
+The checks are configured in .pre-commit-config.yaml under the repo root.
+
+Testing
+=======
+
+Tests are vital to keep FINN running.  All the FINN tests can be found at https://github.com/Xilinx/finn/tree/master/tests.
+These tests can be roughly grouped into three categories:
+
+ * Unit tests: targeting unit functionality, e.g. a single transformation. Example: https://github.com/Xilinx/finn/blob/master/tests/transformation/streamline/test_sign_to_thres.py tests the expected behavior of the `ConvertSignToThres` transformation pass.
+
+ * Small-scale integration tests: targeting a group of related classes or functions to test how they behave together. Example: https://github.com/Xilinx/finn/blob/master/tests/fpgadataflow/test_convert_to_hls_conv_layer.py sets up variants of ONNX Conv nodes that are first lowered and then converted to FINN HLS layers.
+
+ * End-to-end tests: testing a typical 'end-to-end' compilation flow in FINN, where one end is a trained QNN and the other end is a hardware implementation. These tests can be quite large and are typically broken into several steps that depend on prior ones. Examples: https://github.com/Xilinx/finn/tree/master/tests/end2end
+
+Additionally, finn-base, brevitas and finn-hlslib include their own test suites.
+The full FINN compiler test suite
+(which will take several hours to run and requires a PYNQ board) can be executed
+by:
+
+::
+
+  bash ./run-docker.sh test
+
+There is a quicker variant of the test suite that skips the tests marked as
+requiring Vivado or as slow-running tests:
+
+::
+
+  bash ./run-docker.sh quicktest
+
+When developing a new feature it's useful to be able to run just a single test,
+or a group of tests that e.g. share the same prefix.
+You can do this inside the Docker container
+from the FINN root directory as follows:
+
+::
+
+  python setup.py test --addopts "-k test_brevitas_debug --pdb"
+
+
+If you want to run tests in parallel (e.g. to take advantage of a multi-core CPU)
+you can use:
+
+* pytest-parallel for any rtlsim tests, e.g. `python setup.py test --addopts "-k rtlsim --workers auto"`
+* pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `python setup.py test --addopts "-k mytest -n auto --dist=loadfile"`
+
+Finally, the full test suite with appropriate parallelization can be run inside the container by:
+
+::
+
+  quicktest.sh full
+
+See more options on pytest at https://docs.pytest.org/en/stable/usage.html.
+
+Documentation
+==============
+
+FINN provides two types of documentation:
+
+* manually written documentation, like this page
+* autogenerated API docs from Sphinx
+
+Everything is built using Sphinx, which is installed into the finn.dev
+Docker image. You can build the documentation locally by running the following
+inside the container:
+
+::
+
+  python setup.py docs
+
+You can view the generated documentation at build/html/index.html.
+The documentation is also built online at readthedocs:
+
+  * finn.readthedocs.io contains the docs from the master branch
+  * finn-dev.readthedocs.io contains the docs from the dev branch
+
+When adding new features, please add docstrings to new functions and classes
+(at least the top-level ones intended to be called by power users or other devs).
+We recommend that contributors read the Google Python style guide section on docstrings:
+https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings
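+
+For instance, a Google-style docstring could look like this (function and
+argument names are purely illustrative):
+
+.. code-block:: python
+
+  def fold_layer(model, layer_name, pe, simd):
+      """Sets the parallelization attributes of a single layer (illustrative example).
+
+      Args:
+          model: ModelWrapper instance containing the layer.
+          layer_name: Name of the node whose attributes will be set.
+          pe: Number of processing elements to use.
+          simd: Input lane parallelism to use.
+
+      Returns:
+          The modified model.
+      """
+      # ... implementation goes here ...
+      return model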
diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst
index 529df57a5efe636cdd508efc3297bd05013a6b71..a51d56d771384fddbc51271a074748e23ec8295c 100644
--- a/docs/finn/end_to_end_flow.rst
+++ b/docs/finn/end_to_end_flow.rst
@@ -2,12 +2,10 @@
 End-to-End Flow
 ***************
 
-.. note:: **This website is currently under construction.**
-
 The following image shows an example end-to-end flow in FINN, starting from a trained PyTorch/Brevitas network and going all the way to a running FPGA accelerator.
 As you can see in the picture, FINN has a high modularity and has the property that the flow can be stopped at any point and the intermediate result can be used for further processing or other purposes. This enables a wide range of users to benefit from FINN, even if they do not use the whole flow.
 
-.. image:: ../../notebooks/end2end_example/finn-design-flow-example.svg
+.. image:: ../../notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg
    :scale: 50%
    :align: center
 
diff --git a/docs/finn/example_networks.rst b/docs/finn/example_networks.rst
index 9016a8d2a65703cc1231e2468440a5ff74863d3c..3f1ae0d603b18e8467477ea6e44863a02dee467b 100644
--- a/docs/finn/example_networks.rst
+++ b/docs/finn/example_networks.rst
@@ -4,8 +4,17 @@
 Example Networks
 ****************
 
-FINN uses `several pre-trained QNNs <https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq>`_
-that serve as examples and testcases.
+Please visit the `finn-examples <https://github.com/Xilinx/finn-examples>`_
+repository. This repo includes prebuilt bitfiles for various PYNQ and Alveo
+platforms, as well as the scripts to rebuild these examples using the FINN
+compiler.
+
+
+End-to-end Integration tests
+============================
+
+The FINN compiler uses `several pre-trained QNNs <https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq>`_
+that serve as both examples and testcases.
 
 * TFC, SFC, LFC... are fully-connected networks trained on the MNIST dataset
 * CNV is a convolutional network trained on the CIFAR-10 dataset
@@ -16,10 +25,10 @@ and the key performance indicators (FPGA resource, frames per second...) are
 automatically posted to the dashboard below.
 To implement a new network, you can use the `integration test code <https://github.com/Xilinx/finn/blob/dev/tests/end2end/test_end2end_bnn_pynq.py>`_
 as a starting point, as well as the `relevant Jupyter notebooks
-<https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example>`_.
+<https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/bnn-pynq>`_.
 
 .. image:: https://firebasestorage.googleapis.com/v0/b/drive-assets.google.com.a.appspot.com/o/Asset%20-%20Drive%20Icon512.png?alt=media
   :width: 50px
   :align: left
 
-`FINN end-to-end dashboard on Google Drive <https://bit.ly/finn-end2end-dashboard>`_
+`FINN end-to-end integration tests dashboard on Google Drive <https://bit.ly/finn-end2end-dashboard>`_
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 418b665e1317c23c22daf2e3fd0ebcd40a1e2151..3b475303d7dde91e8b6a21856eb4d66417f164d7 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -4,22 +4,68 @@
 Getting Started
 ***************
 
-.. note:: **This website is currently under construction.**
-
 How to use the FINN compiler
 ============================
-The FINN compiler should not be thought of a single pushbutton tool that does everything for you, but rather as a collection of scripts/tools that will help you convert a QNN into a custom FPGA accelerator that performs high-performance inference. We do provide several examples of taking trained networks all the way down to FPGA bitfiles, but if you are trying to do this for custom networks you will have to write your own Python scripts that call the appropriate FINN Compiler functions that process your design correctly, or adding new functions as required.
+Currently, it's best to think of the FINN compiler as *compiler infrastructure*
+instead of a full *compiler* like `gcc` (although the aim is to get there).
+Although we provide a :ref:`command_line` entry for building dataflow
+accelerators, this only exposes a basic flow that works for simpler networks.
+A better way of looking at the FINN compiler is as a collection of scripts/tools that will help
+you convert a QNN into a custom FPGA accelerator that performs high-performance inference.
+
+**So where do I get started?** The best way of getting started with the FINN
+compiler is to follow the existing
+`Jupyter notebooks <tutorials>`_ and check out the prebuilt
+`examples <https://github.com/Xilinx/finn-examples>`_.
+
+**How do I compile my custom network?**
+This depends on how similar your custom network is to the examples we provide.
+If there are substantial differences, you will most likely have to write your own
+Python scripts that call the appropriate FINN compiler
+functions to process your design correctly, or add new functions (including
+Vivado HLS layers)
+as required.
+For custom networks, we recommend making a copy of the end-to-end
+Jupyter notebook as a starting point, visualizing the model at intermediate
+steps and adding calls to new transformations as needed.
+Once you have a working flow, you can implement a command line entry for this
+by using the "advanced mode" described in the :ref:`command_line` section.
+
 
-Requirements
-============
+
+
+System Requirements
+====================
 
 * Ubuntu 18.04 with ``bash`` installed
-* Docker
+* Docker `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_
 * A working Vivado 2019.1 or 2020.1 installation
 * A ``VIVADO_PATH`` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
-* (optional) A PYNQ board with a network connection
+* *(optional)* A PYNQ board with a network connection
    * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip3 install bitstring``
-* (optional) An Alveo board, and a working Vitis 2020.1 installation if you want to use Vitis and Alveo (see `Alveo first-time setup`_ below)
+* *(optional)* An Alveo board, and a working Vitis 2020.1 installation if you want to use Vitis and Alveo (see `Alveo first-time setup`_ below)
+
+We also recommend running the FINN compiler on a system with sufficiently
+strong hardware:
+
+* **RAM.** Depending on your target FPGA platform, your system must have sufficient RAM to be
+  able to run Vivado/Vitis synthesis for that part. See `this page <https://www.xilinx.com/products/design-tools/vivado/memory.html>`_
+  for more information. For targeting Zynq and Zynq UltraScale+ parts, at least 8 GB is recommended. Larger parts may require up to 16 GB.
+  For targeting Alveo parts with Vitis, at least 64 GB RAM is recommended.
+
+* **CPU.** FINN can parallelize HLS synthesis and several other operations for different
+  layers, so using a multi-core CPU is recommended. However, this should be balanced
+  against the memory usage as a high degree of parallelization will require more
+  memory. See the ``NUM_DEFAULT_WORKERS`` environment variable below for more on
+  how to control the degree of parallelization.
+
+* **Storage.** While going through the build steps, FINN will generate many files as part of
+  the process. For larger networks, you may need 10s of GB of space for the temporary
+  files generated during the build.
+  By default, these generated files will be placed under ``/tmp/finn_dev_<username>``.
+  You can override this location by using the ``FINN_HOST_BUILD_DIR`` environment
+  variable.
+  Mapping the generated file dir to a fast SSD will result in quicker builds.
 
 
 Running FINN in Docker
@@ -28,11 +74,11 @@ We use Docker extensively for developing and deploying FINN. If you are not fami
 
 Getting an interactive shell for development or experimentation
 ***************************************************************
-.. note:: **run-docker.sh requires bash to execute correctly.**
+.. warning:: Do not use ``sudo`` to launch the FINN Docker container. Instead, set up Docker to run `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_.
 
 ::
 
-  ./run_docker.sh
+  bash ./run-docker.sh
 
 Simply running sh run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container and give you a terminal which you can use for development and experimentation.
 If you want a new terminal on an already-running container, you can do this with `docker exec -it finn_dev_<username> bash`.
@@ -41,54 +87,30 @@ If you want a new terminal on an already-running container, you can do this with
 
 .. note:: **Develop from host, run inside container:** The FINN repository directory will be mounted from the host, so that you can use a text editor on your host computer to develop and the changes will be reflected directly inside the container.
 
-Running the Jupyter notebooks
-*****************************
-::
-
-  ./run-docker.sh notebook
-
-This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones.
-.. note:: The link will look something like this (the token you get will be different):
-http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc
-
-The run-docker.sh script forwards ports 8888 for Jupyter and 8081 for Netron, and launches the notebook server with appropriate arguments.
-
-Running the test suite directly
-*******************************
-FINN comes with a set of tests to check for regressions. The full test suite
-(which will take several hours to run and require a PYNQ board) can be executed
-by:
-
-::
-
-  ./run-docker.sh test
-
-There is a quicker variant of the test suite that skips the tests marked as
-requiring Vivado or as slow-running tests:
+Command Line Entry
+*******************
+FINN is currently more compiler infrastructure than compiler, but we do offer
+a :ref:`command_line` entry for certain use-cases. These run a predefined flow
+or a user-defined flow from the command line as follows:
 
 ::
 
-  ./run-docker.sh quicktest
+  bash ./run-docker.sh build_dataflow <path/to/dataflow_build_dir/>
+  bash ./run-docker.sh build_custom <path/to/custom_build_dir/>
 
-If you want to run individual tests, you can do this *inside the Docker container
-from the FINN root directory* as follows:
 
+Running the Jupyter notebooks
+*****************************
 ::
 
-  python setup.py test --addopts "-k test_brevitas_debug"
-
-If you want to run tests in parallel (e.g. to take advantage of a multi-core CPU)
-you can use:
- * pytest-parallel for any rtlsim tests, e.g. `python setup.py test --addopts "-k rtlsim --workers auto"`
- * pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `python setup.py test --addopts "-k mytest -n auto --dist=loadfile"`
-
-Please see the pytest documentation for more about picking tests by marks or by name.
+  bash ./run-docker.sh notebook
 
-Finally, the full test suite with appropriate parallelization can be run inside the container by:
+This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones.
+
+.. note:: The link will look something like this (the token you get will be different):
+  http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc
 
-::
+The ``run-docker.sh`` script forwards ports 8888 for Jupyter and 8081 for Netron, and launches the notebook server with appropriate arguments.
 
-  quicktest.sh full
 
 Environment variables
 **********************
@@ -106,11 +128,12 @@ These are summarized below:
 * ``PYNQ_USERNAME`` and ``PYNQ_PASSWORD`` (or ``ALVEO_USERNAME`` and ``ALVEO_PASSWORD``) specify the PYNQ board / Alveo host access credentials for the test suite. For PYNQ, password is always needed to run as sudo. For Alveo, you can leave the password empty and place your ssh private key in the ``finn/ssh_keys`` folder to use keypair authentication.
 * ``PYNQ_TARGET_DIR`` (or ``ALVEO_TARGET_DIR``) specifies the target dir on the PYNQ board / Alveo host for the test suite
 * (optional) ``FINN_HOST_BUILD_DIR`` specifies which directory on the host will be used as the build directory. Defaults to ``/tmp/finn_dev_<username>``
+* (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests.
 
 Supported Hardware
 ===================
-**End-to-end support including driver:** For quick deployment, FINN targets boards supported by  `PYNQ <https://pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards.
-As of FINN v0.4b we also have preliminary support for `Xilinx Alveo boards <>`_ using PYNQ and Vitis, see instructions below for Alveo setup.
+**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by  `PYNQ <https://pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards.
+As of FINN v0.4b we also have preliminary support for `Xilinx Alveo boards <https://www.xilinx.com/products/boards-and-kits/alveo.html>`_ using PYNQ and Vitis, see instructions below for Alveo setup.
 
 **Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator.
 
@@ -123,9 +146,9 @@ On the target side:
 1. Install Xilinx XRT and set up the ``XILINX_XRT`` environment variable to point to your installation, for instance ``/opt/xilinx/xrt``.
 2. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation, for instance ``/opt/xilinx/platforms``.
 3. Create a conda environment named *finn-pynq-alveo* by following this guide `to set up PYNQ for Alveo <https://pynq.readthedocs.io/en/latest/getting_started/alveo_getting_started.html>`_. It's best to follow the recommended environment.yml (set of package versions) in this guide.
-4. Activate the environment with `conda activate finn-pynq-alveo` and install the bitstring package with ``pip install bitstring``
+4. Activate the environment with `conda activate finn-pynq-alveo` and install the bitstring package with ``pip install bitstring``.
 5. Done! You should now be able to e.g. ``import pynq`` in Python scripts.
-6 (optional) If you don't want to specify the ``ALVEO_PASSWORD`` environment variable, you can `set up public key authentication <https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server>`_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution.
+6. (optional) If you don't want to specify the ``ALVEO_PASSWORD`` environment variable, you can `set up public key authentication <https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server>`_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution.
 
 
 On the host side:
diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst
index 9f292a91c352d3c0d4f7adb340c85f15c1b52a53..d03fc400bde90da905c45d408c95badc85b7d6ec 100644
--- a/docs/finn/hw_build.rst
+++ b/docs/finn/hw_build.rst
@@ -4,7 +4,7 @@
 Hardware Build and Deployment
 *****************************
 
-.. image:: /img/finn-hw-build.png
+.. image:: img/finn-hw-build.png
    :scale: 70%
    :align: center
 
@@ -66,7 +66,7 @@ block. **
 
 
 FIFO Insertion and IP Generation
--------------------------------
+---------------------------------
 
 FINN will descend into each partition and insert FIFO nodes between streaming nodes,
 with FIFO depths dictated by the node attributes, using the :py:mod:`finn.transformation.fpgadataflow.insert_fifo.InsertFIFO`
diff --git a/docs/img/finn-stack.png b/docs/finn/img/finn-stack.png
similarity index 100%
rename from docs/img/finn-stack.png
rename to docs/finn/img/finn-stack.png
diff --git a/docs/finn/img/repo-structure.png b/docs/finn/img/repo-structure.png
new file mode 100644
index 0000000000000000000000000000000000000000..05031ff9a5500c3302a36ea88309b3707bc5d108
Binary files /dev/null and b/docs/finn/img/repo-structure.png differ
diff --git a/docs/finn/index.rst b/docs/finn/index.rst
index 94b9e7d37d7e5c5d5f9ca1a788b554340e46a507..fa7ed30205da5b9c63c469ca600211e7865a9730 100644
--- a/docs/finn/index.rst
+++ b/docs/finn/index.rst
@@ -1,30 +1,38 @@
-.. finn documentation master file, created by
-   sphinx-quickstart on Mon Feb 24 14:55:45 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
 ****
 FINN
 ****
-.. note:: **This website is currently under construction.**
 
-Welcome to the FINN Read the Docs website. This website is about the new, more modular version of FINN, which is currently under development on GitHub, and we welcome contributions from the community! Stay tuned for more updates.
+Welcome to the FINN Read the Docs website!
 
 What is FINN?
 =============
+.. image:: img/finn-stack.png
+   :scale: 40%
+   :align: center
+
 'FINN' is colloquially used to refer to two separate but highly related things:
 
-* The FINN project, which is an experimental framework from Xilinx Research Labs to explore deep neural network inference on FPGAs. It specifically targets quantized neural networks, with emphasis on generating dataflow-style architectures customized for each network. It includes tools for training quantized neural networks such as Brevitas, the FINN compiler, and the finn-hlslib Vivado HLS library of FPGA components for QNNs. An overview of the project can be taken from the following graphic and details can be seen on the `FINN project homepage <https://xilinx.github.io/finn/>`_.
+* The FINN **project**, which is an experimental framework from Xilinx Research Labs
+  to explore deep neural network inference on FPGAs. It specifically targets
+  quantized neural networks (QNNs), with emphasis on generating dataflow-style
+  architectures customized for each network.
+  The key components are illustrated in the figure above,
+  including tools for training
+  quantized neural networks (Brevitas), the FINN compiler, and the finn-hlslib
+  Vivado HLS library of FPGA components for QNNs.
+  Read more on the `FINN project homepage <https://xilinx.github.io/finn/>`_.
 
-.. image:: ../img/finn-stack.png
-   :scale: 40%
-   :align: center
+* The FINN **compiler**, which is what this Read the Docs website documents.
+  The compiler is a central part of the FINN project (above) that maps QNNs to
+  dataflow-style FPGA architectures.
+  You can find the FINN compiler in this `GitHub repository <https://github.com/Xilinx/finn>`_.
 
-* The FINN compiler, which this Read the Docs website corresponds to and is the centerpiece of the FINN project. Details can be looked up directly in the `FINN GitHub repository <https://github.com/Xilinx/finn>`_. To learn more about the FINN compiler, use this website and for a hands-on experience the repository contains some Jupyter notebooks which can be found under this `link <https://github.com/Xilinx/finn/tree/dev/notebooks>`_.
 
 More FINN Resources
 ===================
 
+* `The FINN examples repository <https://github.com/Xilinx/finn-examples>`_
+
 * `List of publications <https://github.com/Xilinx/finn/blob/master/docs/publications.md>`_
 
 * `Roadmap <https://github.com/Xilinx/finn/projects/1>`_
@@ -36,7 +44,9 @@ More FINN Resources
    getting_started
    tutorials
    end_to_end_flow
+   command_line
    example_networks
    internals
+   developers
    source_code/finn
    genindex
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index 4cbf671235cbe61b7afcba9979c1259ecddf35a0..0fbc3cf72795005591994ddca0fa0d58b72622a8 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -1,9 +1,9 @@
+.. _internals:
+
 *********
 Internals
 *********
 
-.. note:: **This website is currently under construction.**
-
 Intermediate Representation: FINN-ONNX
 ======================================
 
diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst
index f9909d2befdff14b546c850b3cf56820785b2ffc..f5c64e76a4412e1b74ba321d5be1f3e29be1063e 100644
--- a/docs/finn/nw_prep.rst
+++ b/docs/finn/nw_prep.rst
@@ -4,9 +4,7 @@
 Network Preparation
 *******************
 
-.. note:: **This website is currently under construction.**
-
-.. image:: /img/nw-prep.png
+.. image:: img/nw-prep.png
    :scale: 70%
    :align: center
 
diff --git a/docs/finn/source_code/finn.analysis.fpgadataflow.rst b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
index 72f8fb811b97b34759d346289a0421f5af7f9a1c..b52e994ee6033d4c3c1aae6400e20e103455d7b6 100644
--- a/docs/finn/source_code/finn.analysis.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
@@ -4,6 +4,32 @@ Analysis - fpgadataflow
 Analysis Passes (fpgadataflow)
 ==============================
 
+finn.analysis.fpgadataflow.dataflow\_performance
+------------------------------------------------
+
+.. automodule:: finn.analysis.fpgadataflow.dataflow_performance
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+finn.analysis.fpgadataflow.exp\_cycles\_per\_layer
+---------------------------------------------------
+
+.. automodule:: finn.analysis.fpgadataflow.exp_cycles_per_layer
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+finn.analysis.fpgadataflow.floorplan\_params
+--------------------------------------------
+
+.. automodule:: finn.analysis.fpgadataflow.floorplan_params
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
 -------------------------------------------------------------
 
@@ -12,6 +38,14 @@ finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
    :undoc-members:
    :show-inheritance:
 
+finn.analysis.fpgadataflow.op\_and\_param\_counts
+--------------------------------------------------
+
+.. automodule:: finn.analysis.fpgadataflow.op_and_param_counts
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.analysis.fpgadataflow.post\_synth\_res
 --------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst
index 61946f8fe6d2be894dd9c3334a09c485e1ee1673..7312150657c86976638e73fdf2c0450160989a6a 100644
--- a/docs/finn/source_code/finn.analysis.rst
+++ b/docs/finn/source_code/finn.analysis.rst
@@ -15,6 +15,15 @@ Submodules
 Analysis Passes
 ===============
 
+finn.analysis.base
+-----------------------------
+
+.. automodule:: finn.analysis.base
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.analysis.topology
 -----------------------------
 
diff --git a/docs/finn/source_code/finn.builder.rst b/docs/finn/source_code/finn.builder.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2433cab83d1aa140010f4082ec8323bdaa8c6ff4
--- /dev/null
+++ b/docs/finn/source_code/finn.builder.rst
@@ -0,0 +1,31 @@
+*******
+Builder
+*******
+
+Modules
+=======
+
+finn.builder.build\_dataflow
+----------------------------
+
+.. automodule:: finn.builder.build_dataflow
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.builder.build\_dataflow\_config
+------------------------------------
+
+.. automodule:: finn.builder.build_dataflow_config
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+finn.builder.build\_dataflow\_steps
+------------------------------------
+
+.. automodule:: finn.builder.build_dataflow_steps
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst
index 3ac077c694aafe938d36162dae86e1aafd6913dd..86afd5a1063db37bb212f5ceb07cfa69bbbcbc0b 100644
--- a/docs/finn/source_code/finn.core.rst
+++ b/docs/finn/source_code/finn.core.rst
@@ -5,6 +5,14 @@ Core
 Modules
 =======
 
+finn.core.data\_layout
+-------------------------
+
+.. automodule:: finn.core.data_layout
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.core.datatype
 -------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index 5345fed596484c95c7204c16a4c5f57aa3101a81..7b4e7bfa05f895cd03aed2859576e07db28bd9f9 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -1,5 +1,5 @@
 ************************
-Custom Op - FPGADataFlow
+Custom Op - fpgadataflow
 ************************
 
 HLS Custom Op Nodes
diff --git a/docs/finn/source_code/finn.custom_op.general.rst b/docs/finn/source_code/finn.custom_op.general.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e86774a48e22b5af9e4d2995a4287a740b1c08e5
--- /dev/null
+++ b/docs/finn/source_code/finn.custom_op.general.rst
@@ -0,0 +1,62 @@
+************************
+Custom Op - General
+************************
+
+General Custom Ops
+===================
+
+finn.custom\_op.general.debugmarker
+-----------------------------------
+
+.. automodule:: finn.custom_op.general.debugmarker
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.general.im2col
+------------------------------
+
+.. automodule:: finn.custom_op.general.im2col
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.general.maxpoolnhwc
+------------------------------------
+
+.. automodule:: finn.custom_op.general.maxpoolnhwc
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.general.multithreshold
+---------------------------------------
+
+.. automodule:: finn.custom_op.general.multithreshold
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.general.quantavgpool2d
+--------------------------------------
+
+.. automodule:: finn.custom_op.general.quantavgpool2d
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+finn.custom\_op.general.streamingdataflowpartition
+---------------------------------------------------
+
+.. automodule:: finn.custom_op.general.streamingdataflowpartition
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.general.xnorpopcount
+-------------------------------------
+
+.. automodule:: finn.custom_op.general.xnorpopcount
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst
index 8c43ddb424b5f690a0c266c4f31ab95dfa77e480..1ee3e1dce1898b06605c89202ee841489b817942 100644
--- a/docs/finn/source_code/finn.custom_op.rst
+++ b/docs/finn/source_code/finn.custom_op.rst
@@ -9,6 +9,7 @@ Submodules
    :maxdepth: 2
 
    finn.custom_op.fpgadataflow
+   finn.custom_op.general
 
 Custom Op Nodes
 ===============
@@ -16,55 +17,15 @@ Custom Op Nodes
 Base Class
 ----------
 
-.. automodule:: finn.custom_op
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-finn.custom\_op.im2col
------------------------------
-
-.. automodule:: finn.custom_op.general.im2col
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-finn.custom\_op.maxpoolnhwc
-----------------------------------
-
-.. automodule:: finn.custom_op.general.maxpoolnhwc
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-finn.custom\_op.multithreshold
--------------------------------------
-
-.. automodule:: finn.custom_op.general.multithreshold
+.. automodule:: finn.custom_op.base
    :members:
    :undoc-members:
    :show-inheritance:
 
 finn.custom\_op.registry
--------------------------------
+-------------------------
 
 .. automodule:: finn.custom_op.registry
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-finn.custom\_op.streamingdataflowpartition
--------------------------------------------------
-
-.. automodule:: finn.custom_op.general.streamingdataflowpartition
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-finn.custom\_op.xnorpopcount
------------------------------------
-
-.. automodule:: finn.custom_op.general.xnorpopcount
-   :members:
-   :undoc-members:
-   :show-inheritance:
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/finn/source_code/finn.rst b/docs/finn/source_code/finn.rst
index 1197c50a035b77ec24f51f9e95f6208db162db8e..607ac636a43d88150493eebb86b1e568b38b681a 100644
--- a/docs/finn/source_code/finn.rst
+++ b/docs/finn/source_code/finn.rst
@@ -3,6 +3,8 @@ FINN API
 ********
 The FINN sources are divided into different modules. They are listed below.
 
+.. note:: **Some of these functions and modules are located in the `finn-base` repository.**
+
 Modules
 =======
 
@@ -10,6 +12,7 @@ Modules
    :maxdepth: 1
 
    finn.analysis
+   finn.builder
    finn.core
    finn.custom_op
    finn.transformation
diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
index cd6e443675974b8b59e92ec2142cf848e62ba3c6..42bc7fb5315756b924e0d1cce58ca4e110bda824 100644
--- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
@@ -5,6 +5,15 @@ Transformation - fpgadataflow
 Transformations (fpgadataflow)
 ==============================
 
+finn.transformation.fpgadataflow.annotate\_cycles
+-----------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.annotate_cycles
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.transformation.fpgadataflow.annotate\_resources
 -----------------------------------------------------------
 
@@ -53,6 +62,15 @@ finn.transformation.fpgadataflow.create\_stitched\_ip
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.fpgadataflow.floorplan
+----------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.floorplan
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.transformation.fpgadataflow.hlssynth\_ip
 ----------------------------------------------------
 
@@ -77,6 +95,15 @@ finn.transformation.fpgadataflow.insert\_fifo
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.fpgadataflow.insert\_iodma
+----------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.insert_iodma
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.transformation.fpgadataflow.insert\_tlastmarker
 -----------------------------------------------------------
 
@@ -101,13 +128,23 @@ finn.transformation.fpgadataflow.make\_pynq\_driver
    :undoc-members:
    :show-inheritance:
 
-finn.transformation.fpgadataflow.make\_pynq\_proj
---------------------------------------------------------
+finn.transformation.fpgadataflow.make\_zynq\_proj
+----------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.make_zynq_proj
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
+finn.transformation.fpgadataflow.minimize\_accumulator\_width
+--------------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.minimize_accumulator_width
+  :members:
+  :undoc-members:
+  :show-inheritance:
 
-.. automodule:: finn.transformation.fpgadataflow.make_pynq_proj
-   :members:
-   :undoc-members:
-   :show-inheritance:
 
 finn.transformation.fpgadataflow.prepare\_cppsim
 -------------------------------------------------------
@@ -149,10 +186,34 @@ finn.transformation.fpgadataflow.set\_exec\_mode
    :undoc-members:
    :show-inheritance:
 
-finn.transformation.fpgadataflow.synth\_pynq\_proj
----------------------------------------------------------
+finn.transformation.fpgadataflow.set\_fifo\_depths
+-------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.set_fifo_depths
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+finn.transformation.fpgadataflow.set\_folding
+-------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.set_folding
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+finn.transformation.fpgadataflow.synth\_ooc
+-------------------------------------------------------
 
-.. automodule:: finn.transformation.fpgadataflow.synth_pynq_proj
+.. automodule:: finn.transformation.fpgadataflow.synth_ooc
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+finn.transformation.fpgadataflow.template\_driver
+-------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.template_driver
    :members:
    :undoc-members:
    :show-inheritance:
@@ -161,6 +222,14 @@ finn.transformation.fpgadataflow.templates
 -------------------------------------------------
 
 .. automodule:: finn.transformation.fpgadataflow.templates
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+finn.transformation.fpgadataflow.vitis\_build
+-------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.vitis_build
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst
index 4378cbf8e3bae09d49f20c4fe460e822a2a03993..aeb0d7614222740315633f7658cab9cc7e75490b 100644
--- a/docs/finn/source_code/finn.transformation.rst
+++ b/docs/finn/source_code/finn.transformation.rst
@@ -40,6 +40,15 @@ finn.transformation.bipolar\_to\_xnor
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.change\_datalayout
+--------------------------------------------
+
+.. automodule:: finn.transformation.change_datalayout
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.transformation.double\_to\_single\_float
 ----------------------------------------------------
 
@@ -64,6 +73,14 @@ finn.transformation.general
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.infer\_data\_layouts
+-------------------------------------------
+
+.. automodule:: finn.transformation.infer_data_layouts
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
 finn.transformation.infer\_datatypes
 -------------------------------------------
 
@@ -96,6 +113,16 @@ finn.transformation.lower\_convs\_to\_matmul
    :undoc-members:
    :show-inheritance:
 
+
+finn.transformation.merge\_onnx\_models
+----------------------------------------
+
+.. automodule:: finn.transformation.merge_onnx_models
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.transformation.move\_reshape
 ----------------------------------------
 
diff --git a/docs/finn/source_code/finn.transformation.streamline.rst b/docs/finn/source_code/finn.transformation.streamline.rst
index 017622f04a313afa51fa5a20ac8a10317e9658bb..f43d6d12314d3bad38f189d2831e21447f10cf10 100644
--- a/docs/finn/source_code/finn.transformation.streamline.rst
+++ b/docs/finn/source_code/finn.transformation.streamline.rst
@@ -26,6 +26,14 @@ finn.transformation.streamline.collapse\_repeated
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.streamline.remove
+-------------------------------------
+
+.. automodule:: finn.transformation.streamline.remove
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
 finn.transformation.streamline.reorder
 ---------------------------------------------
 
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index d5bbd62357a8d807375d44e91e1cf95ca642ab9d..82e4bf3261582c9be622cbe3f15af38ba5e3fa41 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -13,6 +13,24 @@ finn.util.basic
    :undoc-members:
    :show-inheritance:
 
+finn.util.config
+----------------
+
+.. automodule:: finn.util.config
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+finn.util.create
+----------------
+
+.. automodule:: finn.util.create
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
+
 finn.util.data\_packing
 ------------------------------
 
@@ -29,6 +47,23 @@ finn.util.fpgadataflow
    :undoc-members:
    :show-inheritance:
 
+finn.util.gdrive
+-----------------------------
+
+.. automodule:: finn.util.gdrive
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+finn.util.imagenet
+-----------------------------
+
+.. automodule:: finn.util.imagenet
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.util.onnx
 ---------------------
 
@@ -37,6 +72,24 @@ finn.util.onnx
    :undoc-members:
    :show-inheritance:
 
+finn.util.pytorch
+------------------
+
+.. automodule:: finn.util.pytorch
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
+finn.util.pyverilator
+---------------------
+
+.. automodule:: finn.util.pyverilator
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.util.test
 ---------------------
 
@@ -45,6 +98,14 @@ finn.util.test
    :undoc-members:
    :show-inheritance:
 
+finn.util.vcd
+------------------------------
+
+.. automodule:: finn.util.vcd
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
 finn.util.visualization
 ------------------------------
 
@@ -52,3 +113,11 @@ finn.util.visualization
    :members:
    :undoc-members:
    :show-inheritance:
+
+finn.util.vivado
+------------------------------
+
+.. automodule:: finn.util.vivado
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst
index cda9d1b7cfccde9b77c09b6ac2776dfaaaa9daff..4e3e8d24b2984b4473504030dc0f6a4001b0e0c8 100644
--- a/docs/finn/tutorials.rst
+++ b/docs/finn/tutorials.rst
@@ -4,9 +4,8 @@
 Tutorials
 *********
 
-.. note:: **This website is currently under construction.**
-
-FINN provides several Jupyter notebooks that can help to get familiar with the basics, the internals and the end-to-end flow in FINN. All Jupyter notebooks can be found in the repo in the `notebook folder <https://github.com/Xilinx/finn/tree/master/notebooks>`_.
+FINN provides several Jupyter notebooks that can help you get familiar with the basics, the internals and the end-to-end flow of FINN.
+All Jupyter notebooks can be found in the repo in the `notebook folder <https://github.com/Xilinx/finn/tree/master/notebooks>`_.
 
 Basics
 ======
@@ -19,7 +18,17 @@ The notebooks in this folder should give a basic insight into FINN, how to get s
 
 * 1_brevitas_network_import
 
-  * This notebook shows how to import a brevitas network and prepare it for the FINN flow.
+  * This notebook shows how to import a Brevitas network and prepare it for the FINN flow.
+
+End-to-End Flow
+===============
+
+There are two groups of notebooks currently available under `the end2end_example directory <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example>`_:
+
+* ``cybersecurity`` shows how to train a quantized MLP with Brevitas and deploy it with FINN using the :ref:`command_line` build system.
+
+* ``bnn-pynq`` shows the internal compiler steps that take pretrained Brevitas QNNs on MNIST and CIFAR-10 and generate the FPGA accelerator.
+
 
 Advanced
 ========
@@ -33,20 +42,3 @@ The notebooks in this folder are more developer oriented. They should help you t
 * 1_custom_transformation_pass
 
   * This notebook explains what a transformation pass is and how to write one for FINN.
-
-End-to-End Flow
-===============
-
-This notebook shows the FINN end-to-end flow step by step using an example of a simple, binarized, fully-connected network trained on the MNIST data set. Starting with the brevitas export and taking this particular network all the way down to hardware by using a specific sequence of transformations.
-
-* cnv_end2end_example
-
-  * This notebook takes a simple convolutional model step-by-step from a trained Brevitas net to a running FPGA bitfile.
-
-* tfc_end2end_example
-
-  * This notebook takes a simple fully-connected  model step-by-step from a trained Brevitas net to a running FPGA bitfile.
-
-* tfc_end2end_verification
-
-  * This notebook runs parellel to the tfc_end2end_example notebook above, and shows how the output of each step can be verified.
diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst
index c52c0840aa40566d930164490b1fd249d7c07757..7c636941ad5b8d3d95a152f78e883f6f4782a2f0 100644
--- a/docs/finn/verification.rst
+++ b/docs/finn/verification.rst
@@ -4,13 +4,11 @@
 Functional Verification
 ***********************
 
-.. note:: **This website is currently under construction.**
-
-.. image:: ../../notebooks/end2end_example/verification.png
+.. image:: ../../notebooks/end2end_example/bnn-pynq/verification.png
    :scale: 70%
    :align: center
 
-This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/tfc_end2end_verification.ipynb>`_.
+This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find as `tfc_end2end_verification.ipynb <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb>`_ in the end2end notebook folder.
 
 When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS custom nodes. A single node can be executed using one or more of the following methods:
 
diff --git a/docs/finn/vivado_synth.rst b/docs/finn/vivado_synth.rst
index 26a1f7a85284f2a438efcc05f593533da4cc8641..ca8b8ad655df7b227441f020aca6d629ce1b6afc 100644
--- a/docs/finn/vivado_synth.rst
+++ b/docs/finn/vivado_synth.rst
@@ -1,12 +1,10 @@
 .. _vivado_synth:
 
 *************************
-Vivado HLS and Vivado IPI 
+Vivado HLS and Vivado IPI
 *************************
 
-.. note:: **This website is currently under construction.**
-
-.. image:: /img/vivado-synth.png
+.. image:: img/vivado-synth.png
    :scale: 70%
    :align: center
 
diff --git a/docs/img/JasperVertical4.jpg b/docs/img/JasperVertical4.jpg
deleted file mode 100644
index d7364ec8a99f51e77b421c85a8da4eebe2883751..0000000000000000000000000000000000000000
Binary files a/docs/img/JasperVertical4.jpg and /dev/null differ
diff --git a/docs/img/QuartzNet.jpg b/docs/img/QuartzNet.jpg
deleted file mode 100644
index ce258fcd5f458caae606af0973c2eb14aea0af27..0000000000000000000000000000000000000000
Binary files a/docs/img/QuartzNet.jpg and /dev/null differ
diff --git a/docs/img/WERMB.jpg b/docs/img/WERMB.jpg
deleted file mode 100644
index 3c1ce7d6bc3e378f6e75c204a01538f02a9cb007..0000000000000000000000000000000000000000
Binary files a/docs/img/WERMB.jpg and /dev/null differ
diff --git a/docs/img/WERNops.jpg b/docs/img/WERNops.jpg
deleted file mode 100644
index e539bb26077fb98f9a0f7b554ed63a18d57207a1..0000000000000000000000000000000000000000
Binary files a/docs/img/WERNops.jpg and /dev/null differ
diff --git a/docs/img/accumulator-minimization.png b/docs/img/accumulator-minimization.png
deleted file mode 100644
index 76f81bf9192f0fbc894f89c6c4673c9542f65817..0000000000000000000000000000000000000000
Binary files a/docs/img/accumulator-minimization.png and /dev/null differ
diff --git a/docs/img/cifar-10.png b/docs/img/cifar-10.png
deleted file mode 100644
index b1f7538d689d2a92c627fff561a4ad7fa1e94ccc..0000000000000000000000000000000000000000
Binary files a/docs/img/cifar-10.png and /dev/null differ
diff --git a/docs/img/finn-brevitas-debug.png b/docs/img/finn-brevitas-debug.png
deleted file mode 100644
index fdc260412fb5066e3993abd94c6160456b3c5f1b..0000000000000000000000000000000000000000
Binary files a/docs/img/finn-brevitas-debug.png and /dev/null differ
diff --git a/docs/img/finn-cycle-estimate.png b/docs/img/finn-cycle-estimate.png
deleted file mode 100644
index 1a4faa210975b80bad35b49d13648678deac8da3..0000000000000000000000000000000000000000
Binary files a/docs/img/finn-cycle-estimate.png and /dev/null differ
diff --git a/docs/img/finn-dashboard.png b/docs/img/finn-dashboard.png
deleted file mode 100644
index 33ef9726a7fd80d4de590f3d7ab60618173f52d0..0000000000000000000000000000000000000000
Binary files a/docs/img/finn-dashboard.png and /dev/null differ
diff --git a/docs/img/finn-example.png b/docs/img/finn-example.png
deleted file mode 100644
index b850adcf3bf08431875f2af46787b56ef2c6e57c..0000000000000000000000000000000000000000
Binary files a/docs/img/finn-example.png and /dev/null differ
diff --git a/docs/img/finn-examples-header.png b/docs/img/finn-examples-header.png
new file mode 100644
index 0000000000000000000000000000000000000000..50f8fa7761e10a958ed3567f268ef675cf1814f7
Binary files /dev/null and b/docs/img/finn-examples-header.png differ
diff --git a/docs/img/finn-flow.png b/docs/img/finn-flow.png
deleted file mode 100644
index 47aef287095c46b2f38030bca0a8d2e5d7cae2a9..0000000000000000000000000000000000000000
Binary files a/docs/img/finn-flow.png and /dev/null differ
diff --git a/docs/img/finn-logo.png b/docs/img/finn-logo.png
deleted file mode 100644
index 02f073b9a351fe3e4d27ef3b077059e16954a644..0000000000000000000000000000000000000000
Binary files a/docs/img/finn-logo.png and /dev/null differ
diff --git a/docs/img/finn-logo.svg b/docs/img/finn-logo.svg
deleted file mode 100644
index 0d9e2ef783e71bb5cd082ac147dcc8be4e1d3bb6..0000000000000000000000000000000000000000
--- a/docs/img/finn-logo.svg
+++ /dev/null
@@ -1,136 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:xlink="http://www.w3.org/1999/xlink"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   version="1.1"
-   id="svg2"
-   xml:space="preserve"
-   width="176.34172"
-   height="53.759998"
-   viewBox="0 0 176.34172 53.759998"
-   sodipodi:docname="finn-logo.svg"
-   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
-     id="metadata8"><rdf:RDF><cc:Work
-         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
-     id="defs6"><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath18"><path
-         d="M 0,1.2207e-4 H 960 V 540.00012 H 0 Z"
-         id="path16"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath28"><path
-         d="M 5.66e-7,1.2207e-4 H 960 V 540.00012 H 5.66e-7 Z"
-         id="path26"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath40"><path
-         d="M -7.3641e-5,1.8311e-4 H 959.99993 V 540.00018 H -7.3641e-5 Z"
-         id="path38"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath52"><path
-         d="M 1.4305e-5,0 H 960.00001 V 540 H 1.4305e-5 Z"
-         id="path50"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath64"><path
-         d="M 1.4305e-5,0 H 960.00001 V 540 H 1.4305e-5 Z"
-         id="path62"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath76"><path
-         d="M 1.4305e-5,0 H 960.00001 V 540 H 1.4305e-5 Z"
-         id="path74"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath88"><path
-         d="M 1.4305e-5,0 H 960.00001 V 540 H 1.4305e-5 Z"
-         id="path86"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><clipPath
-       clipPathUnits="userSpaceOnUse"
-       id="clipPath100"><path
-         d="M -2.287e-6,1.2207e-4 H 960 V 540.00012 H -2.287e-6 Z"
-         id="path98"
-         inkscape:connector-curvature="0"
-         style="clip-rule:evenodd" /></clipPath><mask
-       maskUnits="userSpaceOnUse"
-       x="0"
-       y="0"
-       width="1"
-       height="1"
-       id="mask104"><image
-         width="1"
-         height="1"
-         style="image-rendering:optimizeSpeed"
-         preserveAspectRatio="none"
-         xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFwAAABUCAAAAAAq1w/pAAAAAXNCSVQI5gpbmQAABnlJREFUWIXtmV+oXFcVxr+19j7/pvfaiDWxEaQqCRFE5CIotS9FsBIfFKFKiSgqhUIfin0olCJCEVJBUhBaq8WUJsVUgoJtUagPIsGHIrXam9s7wVtDErQixiYxNzN39t5r+bDPzD3nzJmJd2bik+vtrDX7d9f9zlp7r80B/m//a6MbgtMbAK/AdNFwAggEhZZ0XhyaQExM77yTuEx5YXACiMiYpZ9dMUS0WEmIjE3zzk0vXurkqTVEC6QTGZsVyzcf1ZduvilPDRMBdkFsMBlr7Te/hlcSQAmKRcEJTNYkyZcfAX6fqLAoFgUnMBmTZHcdBv69YQOXr3MR8KhJmq38KAPOpMHT4kqRQMQ2yfY9nwH4Y6VS5ocTiNkm+ftOLAPA64ZpuLfMDR/mvfvZ9wLAoMuAqgI6P5xAZEyS7XriAADgjIcKoArMDSeAjEnT4sgd0fEGVEQWJQuzsWnxyMHy8bSKiOoiMo9FmOX33Vs63BkJogvJnMBkkzS/++GhZ6MXQgii8+/nMe8kv+vwyNWVEIKUqswDJ8RtduXxdOQ77b0XUcwLL5sn23d0eeRzXe9DEJ1X82Fj7vnJLdvOv/Scj+y5qoVAxCbJdp3YW/Gec95vSz4rvGyerHj6QNW95pwPQYePs8EJYGOSNH/8jpp/1QcfdG7NmdmmxUOfrzkvXHIuyLDKZ4QTmEyS5V+/v+7veleVfCY4gcnaNPvctxuBNeerks8Cjydmmt95pBlZdT5UJJ8BXjZmtvJk2ohcuOSdr0i+c/iwefY/tdwMdd2gJvmO4cOm3/3M3rHYWuz9bccO4aPGPPb+8eCq815EZs181JhPfGQ8eOGy89t7+c7hALNJ0vyxT7WEuq5e5TuEUzwx84e+1Bb9s/PeVyXfEXzYmIfubw13natLvj0r1u9hk9jWxnGzxS7+zfkQpMqImb/w8K00sml5mzS//cn2eDfu5eOl+KvvnD/xITbGME/Cx7EtzT7842ZjlvZGfJ9VV4Q/P7Bf+MMr91hrJ/FHJ+YzY41Z2qpzPtQkj3C6/DvYdOUH5x94d5rYNvww7z0/HG/MaC2SwwAAkf90CiSd2x/8wNm3o/C1y14cN9N811MrE9h47dTmZn/gQ0spvmAAAB37xVMvH8zSoTojdhw38yP1U61qLZKXcOr/3Mdnm3782Y37dpfq8OiOwCZJi0cPYqKteh8akg8zp+P9kau45dH17+9LE2sMlwrFxnzgK5PZF99yLrbnWCmqnupXflkU97z6i5E6TEwmSfNDD05mo+tcCEHqupQvlPg9H61e7Czf9tlvXH0zMBERs02S4jPfM1PgL69tXus731bngB6Txs87e7979rF9aWqttTYrPjZ2qjUy980qL+GqKt3zYws6xb2vnvxklqd5Uew/PpXdO+dckHqVl7KAQFx8Yny54Q/efWjrHBd7Tr5rGhunf3N1s79Vr/KKLHKy/TKd3Xb49W8dOH7rVDbWnfNBm1vqsFpU3nptwsJ8+au/3D+djdWB9752Cm3DAVV5+sqkpfl10Oi9WR8SG3CovNS5HmOidV3Zn3V3CVdV2XzRzwpfdy6MS17NXI5emxV+ulXybc0B+e2s7N5GeYlD+wuFqoZj/bF1/5V1XXyf7aUIACpybMrZP83WXfWC2AJXVV3/+2zwtbFZqwEHVOS5q7Owexs++LE6rMOhcjyZBX7BD6fbCdVynS1gqq17H2SMjEbmKs/9awb4XxuT88iq8wOxfcfZQSVEvNQG++dge52q2Mu//uk/rm62bIuVfVahsnlwT1F08tSyYTbWGmMvOmYmAoNBRABBASVRDeIG/c0rl97uq5IqNVOvbeIq8qd8aWmpk2fGGGZrE2PJxBGDiEg5CkkgVYiK81vXenFfaZkwq3AlFe8HW5Y1WMPEltkycTmCEYhKNgEKUkjwg/613lYQAGOJNzKHBD8wpN4ZZiJDTAzi8msEEUjjcEkAQKoibqvf6w+cSMsLbZxtKn5AEGctMapDI8WPEkoAKYE0Zq/q/aC/1fdemtNDS+YgQP2WZY7psYKgVJbHEK4gUgIADSEMnHNedJzePJU1AOKMISKCUtRRMao9Qik3gQAFNIiEEFzL1tKAKyDQYLiUI6IjZTTy6vAxekRURERberRZQERgIq6hGv9b+UdHK7W0qR06dBCmXLrQUhQKtKFbP59Rq7fGqq7WpnMqvOme8Xi60fYf2PhCGsbcteEAAAAASUVORK5CYII="
-         id="image106" /></mask></defs><sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="1849"
-     inkscape:window-height="850"
-     id="namedview4"
-     showgrid="false"
-     fit-margin-top="0"
-     fit-margin-left="0"
-     fit-margin-right="0"
-     fit-margin-bottom="0"
-     inkscape:zoom="6.6114484"
-     inkscape:cx="173.49449"
-     inkscape:cy="22.417885"
-     inkscape:window-x="67"
-     inkscape:window-y="27"
-     inkscape:window-maximized="1"
-     inkscape:current-layer="g10" /><g
-     id="g10"
-     inkscape:groupmode="layer"
-     inkscape:label="finn-logo"
-     transform="matrix(1.3333333,0,0,-1.3333333,-45.599999,559.99999)"><g
-       id="g82"><g
-         id="g84"
-         clip-path="url(#clipPath88)"><text
-   transform="matrix(1,0,0,-1,71.04,385.22)"
-   style="font-style:italic;font-variant:normal;font-weight:bold;font-size:39.95999908px;font-family:Arial;-inkscape-font-specification:Arial-BoldItalicMT;writing-mode:lr-tb;fill:#ff0000;fill-opacity:1;fill-rule:nonzero;stroke:none"
-   id="text92"><tspan
-     x="0 24.41556 35.524441 64.375557"
-     y="0"
-     sodipodi:role="line"
-     id="tspan90">FINN</tspan></text>
-
-</g></g><g
-       id="g94"><g
-         id="g96"
-         clip-path="url(#clipPath100)"><g
-           id="g102"
-           transform="matrix(44.16,0,0,40.32,34.2,379.68)"><image
-             width="1"
-             height="1"
-             style="image-rendering:optimizeSpeed"
-             preserveAspectRatio="none"
-             transform="matrix(1,0,0,-1,0,1)"
-             xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFwAAABUCAYAAAAPvFA1AAAABHNCSVQICAgIfAhkiAAAC0NJREFUeJztnFuMJFUdxn//c6qqL9Mzs7uzd4ddJEhiVjcGxBhCSJRoiISEGA0JQQxolgcwASM8GBNNfCK+6KtRUdgY9MGoRB70hRdJkEQ0sLvsBVyWi7LsXHp6unu6u6qOD3Xp6utU9/Q0Yae+3U7XnK7qrvmdf311vjrVI4Ah09SkPuwd2GnKgE9ZGfApKwM+ZWXAp6wM+JSVAZ+ACsANolOtmwHfoizgYXuGWso4kwHfok7YRY4om3eMn2r9DPgWdJ9V4CbtcM5vpd4mAz6m7tA5vmDlEBFez4Bvr25RNl+3C4gIAGd9N/W2GfARdUwsvmEXY9g1Y3g7pX9DBnwkLYriEWcGRwXYRIRLxhvpPTLgKbWA8IRTwlGKoLYFEM566e0EMuCpNIfwuFNiRukYdmQpo5wwIQO+qQrAo84M+7TVA7uF4Y0RTpiQAR8qCzhhz3BU2wnYwWsiwpuey2i4M+BD9YBV5LjldMGW0L1HtxPIgA/UV3Wezw+CHfm3lwGfiG7XDnfa+RhsP9iBf482JIQMeI9uVjb3JoJN8BzAJtH2hu/SGuOGhwx4QsfE4sEhsIMKDx7j2AlkwGMtiuIhpxinyCjY9MIOOuNMBnx8LSA8ZvcPNv1gtzBcyICPpwLwuFNil+6FLQnY0G67MMb4O9KOBm4RwO6XIrthRwUPcGnE6ydJ7WjgDw9KkdGjC3bbv5tjf+aOBf6AVRicIuNy7oUtIpx2x/Nv2KHA79I5brVyQ2GL9If9lutS38INxzsO+G3K4e7E9Fhq2OG/0+74dgI7DPhnlMU3nc1TZF/YE/Bv2EHAbxDNQ/ZMB9hBKbL9eifswL8z4JsqOReZnB4blCIHwb7oNrfk37ADgC8gPGLPpE6RSdhI53ud2WJ1w1UOPJgeSxdsorZwLYL/ne1btRO4ioFbwKN2icMdsIPXhqXIQbAn4d9wFQM/YRe53upOke3psUEpchDsi24z9R2yw3RVAr/PKnDTJsFmUIrsB1uA02NeHezWVQf8rugmy/DnUVJkD+zwgRLOT8BO4CoDfouyu1Jke/iXJkV2toXLIojSGfBuHROLbzkzm6TIzYNN0JaArRWXjceVMSaM+6kv8Gf2HOA7pXn2qY9Gf1ynLB5x0qTI4cEmaItXRrQCpTjVakxsX/sS/dtGjR/M7eHfB4/y8937uF5bE/vASesAikftYk+KDBbTBZtu2CKACmCjNae3G/ifNqo0jEGtLHOnUbywf5G/7j3M3YWZiX3wJLSA8JjTP0VKCtiRbwdtxB2E0nF1oxSnGvWJ7XNf4GXf5x+NDXByqFoVvbzEp5tNfjq/wJmDRzgxM8dcMi18CCoADw+4yXKcFBnD1gpJVPdl3+PKFqbUujXQpE/WK9RzDio8VKXRILe8zK5Khe/nS7x+6Fp+tmvvh2I3m91kOShFhqsOHGujQtg6gI1SnGpuTHTfBwJ/vl5FaQuxNEpUOJwSxHXJVdaQlSW+JhYv7F/kzwuH+GK+ONEdG6bNbrIclCKHBRuUCmwkBB37d6M20X0fCLwJPFev4jn5GHYEXimF+AZVXcdaWeLmlsuvd+3j9MEj3F+c3Va7ucfa/CbL1LChE3YCdGQtp+pTAg7w21qFDcfphC0qHk4ppYK/J7RRx15eYvf6Oj8uzvHqwaM8Ob/Aok73dei0ukPn+LI1/CbLkVKkCCjp8OwIuijFZa/FlS1MGPfTUOAvNjfYEBDbbld2d7WLare3WjhrZZyVZe7VNi8fOMLv9xyYiN10f1VvpOmxoSky9OwE9Hj8vTHZ6oYUSfN31Qobto1SEg+xIsBJb+/weWOw1teR5Svc5vk8tWsfrxy4hnuKJZwxdrL7q3rjTo8FbZ0pMq5qrXv9u14dY2+Ha1PgJ6truLYdHHoiIfj28yDwkd2Yeh1neYlDtRpPzsxz/tC1/HBuT2q76f6qXuS8k0qRPVaig4oXrTg3Yf+GFMAveC5vey2w7HZlJ2HHHaA6fV7a7SICzSa5cmA337ZzvHzgCM/s3s+Ndm7gZy8QfHvMTsAedXqsJ0VCX9AR5Aj6uvF5rzm5hBkp1cWSp6sV1rXVB2byudNuBOnbAWIMVqUCy0t8ycAfFw7x0v7FHrspAN91ShRVdCQElWkgFeyBKVLrASMSnRitaE7V1rcEdpCEFH+3cJ9S/PPgUXbV62AMBhPYRbipMQZjomUSy4nXh2wjuRzNQoGm0vyiusbT1TXut/Ic1TYKEBQ6BKq6go3qCDz9U2Q81o7A6k7Pjr1ct5d/9e4lnv/g/a3R7aNUwAGe23uI25SFbgXDpB6Y9IIH078DMP07xbJpFvLMFIq81Whwrr7BiuehQ4haBBWOsaUH9gjBJnGijDy7Y1lpvnfmVS5ug4enzuW/rK7xqV172W8s8H0kWaEIBhM8S9AuEkEnXhaRYJtoOdom7DDxXHKVCm61yjX5Aotzs6z7Hqc2mrzXCCYAjBh0WMk+QcXHH0JgOWoY7KRnq7a1JJfXjbctsANWKSvcAS4e/jizAuJ6iO+D7wcWM8A6eqt4RLsRgVwOikWM0pzeaPBOq0XLGBQE5w4IskB3ACIEn0yRHTYSWUvnMkrzUnmFn5x/fSKAu5V6hqEJ/GWjRl0EHBtjhw/LQrTedIgYhZJhw8ruFCsEF81YXkbKqxxTwlfmZrmpWCCvFa4B3wRV7WMwIvi0x9pGydChX9uzVUfln66sbQtsGMFSAJ5aL3N7oUDRyYMOK9zzMWG1i++D58XW0WMZYftQu6HLoqID0HUxa2U8rTicn+FwqUTZ83ij1eK9locOt1cCRgQTZQGtMDHozhFK0kZiH1eK19bKkyPcpZGAv9jcwPcFP6wM8U0A3guAG98LfpEQfuzzgzw7hp3sgLBTkutFHSCC8Q2mWoHaOnO5PDfOzHA8J5xttnjXdfEQDIJWgi8KURKDNolqV4mTZVThohTrvs9btcknzEga+NEoG+xRmuP5Ik7OARVd/AlzdfQLSfQs7RMXiat50dAtMY5uv96+ApjcpmPbqM1zMbUaynXZb9t8olAgr4UK0FTSFdfDYZ+lO/07ag/XfaW8wt8/uDwWzDQaefbgZLXMg7PzMFcKLMUPLUX57RNpR9WHduN5wWv0t4wt2U2rhWmu4mnFYrHEYj7PijGcF2EpPF/o8Kpg5O0iwc8mgh1Np5W3z05gDOAXPJf/+i5zBrRthz7ugWrDxfcRbWKPx/eCw9lvd8podmOIYv1mduNX1pBqhXknz+dmS7SMz1kj/E+BpwLYKryWEnVAAD44Ql8rr0wUcLdGthQIYvdnc3nypVL7EmfPQ+ILXh3t07AbA7itwG5aLvuV4jrP4GBYsy1aVmLEYlmgFEZpqsbjN+fPjYExvcaakHy2VuGJegPfslBaYbxEZSeqHN/HKA98M9huvMSYngnaTRjAaDUxKw1QiiOlEkd
bLlcszX9mS6wUrSA8heehN8vbc/0kqbGAf+D7/KvV4NZmA1MsxkNE6TNEJLSSTrvxwTPge8GwzfO31W4AMAZ/bQ1Zr7AnX2Ch5dIoVzi3Z573Z4Mj9dTq9toJjAkc4OR6mU8ur7Jvfg78wK9NWL3SNUSku+rjk6zqqPqOTvG8GPxAzzYGo612B4SJFRNF6OAHCa/rQJh2Wy5mdZV8Psdxz8O/ssql3XMslVe3DHQzpY723XKAcx+7DjdnJ97BtJeNCZzWJN4+OvSNYTZx+XRSerPVoOqn/6ONsQzYlmbJ8/hDtcKztQqTuXWzV2MDBzhmOcxP4P7D11oN1szYu/GR0paAZxpdH43bY68iZcCnrAz4lJUBn7Iy4FNWBnzKyoBPWRnwKSsDPmVlwKesDPiU9X+JAqqJxAreHwAAAABJRU5ErkJggg=="
-             mask="url(#mask104)"
-             id="image108" /></g></g></g></g></svg>
diff --git a/docs/img/finn-team.jpg b/docs/img/finn-team.jpg
deleted file mode 100644
index d2711758b8218aa76c61f6b756c575b40fbb64b8..0000000000000000000000000000000000000000
Binary files a/docs/img/finn-team.jpg and /dev/null differ
diff --git a/docs/img/fraktur.png b/docs/img/fraktur.png
deleted file mode 100644
index 8e105d181d69dd877993987e154811aa9a6153f4..0000000000000000000000000000000000000000
Binary files a/docs/img/fraktur.png and /dev/null differ
diff --git a/docs/img/gtsrb.png b/docs/img/gtsrb.png
deleted file mode 100644
index ebc98ee4a6606ad80a00d8614a5e8c196878f0e5..0000000000000000000000000000000000000000
Binary files a/docs/img/gtsrb.png and /dev/null differ
diff --git a/docs/img/imagenet.jpg b/docs/img/imagenet.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5cdd5aa303d9add5fbe6270936da0e152eca0135
Binary files /dev/null and b/docs/img/imagenet.jpg differ
diff --git a/docs/img/mnist.jpg b/docs/img/mnist.jpg
deleted file mode 100644
index 29cea887284017d2573b87b35e2481420f15c85f..0000000000000000000000000000000000000000
Binary files a/docs/img/mnist.jpg and /dev/null differ
diff --git a/docs/img/parallel-speedup.png b/docs/img/parallel-speedup.png
deleted file mode 100644
index abea8f9bd5251d51732c9281e9b0705adeeabe64..0000000000000000000000000000000000000000
Binary files a/docs/img/parallel-speedup.png and /dev/null differ
diff --git a/docs/img/quartzPic1.jpg b/docs/img/quartzPic1.jpg
deleted file mode 100644
index cec4829f2187d720be8589d075c83443eaaef69c..0000000000000000000000000000000000000000
Binary files a/docs/img/quartzPic1.jpg and /dev/null differ
diff --git a/docs/img/rn50-ipi.png b/docs/img/rn50-ipi.png
deleted file mode 100644
index 504b011c9660b446ae39d407a8ce3d824bd2cd6a..0000000000000000000000000000000000000000
Binary files a/docs/img/rn50-ipi.png and /dev/null differ
diff --git a/docs/img/svhn.png b/docs/img/svhn.png
deleted file mode 100644
index 774a393fa71c15ae90cd4cb0294941a35049f23d..0000000000000000000000000000000000000000
Binary files a/docs/img/svhn.png and /dev/null differ
diff --git a/docs/index.md b/docs/index.md
deleted file mode 100644
index ac635db3474f9deb8cca13da84adb4f79f50b222..0000000000000000000000000000000000000000
--- a/docs/index.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# FINN
-<img align="left" src="img/finn-stack.png" alt="drawing" style="margin-right: 20px" width="300"/>
-
-FINN is an
-experimental framework from Xilinx Research Labs to explore deep neural network
-inference on FPGAs.
-It specifically targets <a href="https://github.com/maltanar/qnn-inference-examples" target="_blank">quantized neural
-networks</a>, with emphasis on
-generating dataflow-style architectures customized for each network.
-It is not
-intended to be a generic DNN accelerator like xDNN, but rather a tool for
-exploring the design space of DNN inference accelerators on FPGAs.
-<br><br>
-A new, more modular version of FINN is currently under development <a href="https://github.com/Xilinx/finn">on GitHub</a>, and we welcome contributions from the community!
-
-
-## Quickstart
-
-Depending on what you would like to do, we have different suggestions on where to get started:
-
-* **I want to try out prebuilt QNN accelerators on real hardware.** Head over to <a href="https://github.com/Xilinx/BNN-PYNQ" target="_blank">BNN-PYNQ</a> repository to try out some image
-classification accelerators, or to <a href="https://github.com/Xilinx/LSTM-PYNQ" target="_blank">LSTM-PYNQ</a>
-to try optical character recognition with LSTMs.
-* **I want to train new quantized networks for FINN.** Check out <a href="https://github.com/Xilinx/brevitas">Brevitas</a>,
-our PyTorch library for training quantized networks. The Brevitas-to-FINN part of the flow is coming soon!
-* **I want to understand the computations involved in quantized inference.** Check out these Jupyter notebooks on <a href="https://github.com/maltanar/qnn-inference-examples">QNN inference</a>. This repo contains simple Numpy/Python layer implementations and a few pretrained QNNs for instructive purposes.
-* **I want to understand how it all fits together.** Check out our [publications](#publications),
-particularly the <a href="https://arxiv.org/abs/1612.07119" target="_blank">FINN paper at FPGA'17</a> and the <a href="https://arxiv.org/abs/1809.04570" target="_blank">FINN-R paper in ACM TRETS</a>.
diff --git a/docs/publications.md b/docs/publications.md
deleted file mode 100644
index 4f8a0c86fa5e9e0dcfdc9a44ee7d4ebba5097c46..0000000000000000000000000000000000000000
--- a/docs/publications.md
+++ /dev/null
@@ -1,39 +0,0 @@
-## Publications
-
-* FPL'18: <a href="https://arxiv.org/pdf/1807.04093.pdf" target="_blank">FINN-L:Library Extensions and Design Trade-off Analysis for Variable Precision LSTM Networks on FPGAs</a>
-* FPL'18: <a href="https://arxiv.org/pdf/1806.08862.pdf" target="_blank">BISMO: A Scalable Bit-Serial Matrix Multiplication Overlay for Reconfigurable Computing</a>
-* FPL'18: <a href="http://kalman.mee.tcd.ie/fpl2018/content/pdfs/FPL2018-43iDzVTplcpussvbfIaaHz/XZmyRhWvHACdwHRVTCTVB/6jfImwD836ibhOELmms0Ut.pdf" target="_blank">Customizing Low-Precision Deep Neural Networks For FPGAs</a>
-* ACM TRETS, Special Issue on Deep Learning: <a href="https://arxiv.org/abs/1809.04570" target="_blank">FINN-R: An End-to-End Deep-Learning Framework for Fast Exploration of Quantized Neural Networks</a>
-* ARC'18: <a href="https://arxiv.org/pdf/1807.10577.pdf" target="_blank">Accuracy to Throughput Trade-Offs for Reduced Precision Neural Networks on Reconfigurable Logic</a>
-* CVPR’18: <a href="https://arxiv.org/abs/1807.00301" target="_blank">SYQ: Learning Symmetric Quantization For Efficient Deep Neural Networks</a>
-* DATE'18: <a href="https://ieeexplore.ieee.org/abstract/document/8342121/" target="_blank">Inference of quantized neural networks on heterogeneous all-programmable devices</a>
-* ICONIP’17: <a href="https://arxiv.org/abs/1709.06262" target="_blank">Compressing Low Precision Deep Neural Networks Using Sparsity-Induced Regularization in Ternary Networks</a>
-* ICCD'17: <a href="https://ieeexplore.ieee.org/abstract/document/8119246/" target="_blank">Scaling Neural Network Performance through Customized Hardware Architectures on Reconfigurable Logic</a>
-* PARMA-DITAM'17: <a href="https://arxiv.org/abs/1701.03400" target="_blank">Scaling Binarized Neural Networks on Reconfigurable Logic</a>
-* FPGA'17: <a href="https://arxiv.org/abs/1612.07119" target="_blank">FINN: A Framework for Fast, Scalable Binarized Neural Network Inference</a>
-* H2RC'16: <a href="https://h2rc.cse.sc.edu/2016/papers/paper_25.pdf" target="_blank">A C++ Library for Rapid Exploration of Binary Neural Networks on Reconfigurable Logic</a>
-
-## External Publications and Projects Based on FINN
-
-If you are using FINN in your
-work and would like to be listed here, please contact us!
-
-* <a href="https://coefs.uncc.edu/htabkhiv/teaching/hardware-software-co-design-real-time-ai/" target="_blank">Hardware-Software Co-Design Real-time AI (UNC Charlotte)</a>
-* <a href="https://ieeexplore.ieee.org/abstract/document/8442108" target="_blank">BinaryEye: A 20 kfps Streaming Camera System on FPGA with Real-Time On-Device Image Recognition Using Binary Neural Networks</a>
-* <a href="https://qiita.com/ykshr/items/08147098516a45203761" target="_blank">Cucumber sorting with FINN (in Japanese)</a>
-* <a href="https://github.com/mohaghasemzadeh/ReBNet" target="_blank">ReBNet: Residual Binarized Neural Network, FCCM'18 best paper</a>
-
-## Events, Tutorials and Keynotes
-* DAMON'2019 keynote:  <a href="https://github.com/Xilinx/FINN/blob/master/docs/DAMON2019_Blott_final.pdf" target="_blank">Performance Scaling with Innovative Compute Architectures and FPGAs</a>
-* Future of AI'2019 keynote:  <a href="https://github.com/Xilinx/FINN/blob/master/docs/FutureofAI2019_Blott.pdf" target="_blank">Future of AI: Unconventional Compute Architectures</a>
-* BigData Belfast'2018 talk: <a href="https://github.com/Xilinx/FINN/blob/master/docs/BigDataBelfast2018.pdf" target="_blank">Unconventional Compute Architectures for Enabling the Roll-Out of Deep Learning</a>
-* CLUSTER'2018 keynote: <a href="https://github.com/Xilinx/FINN/blob/master/docs/IEEECluster2018.pdf" target="_blank">Unconventional Compute Architectures with Reconfigurable Devices in the Cloud</a>
-* RCML'2018 invited talk: <a href="https://github.com/Xilinx/FINN/blob/master/docs/ARC2018.pdf" target="_blank">The Emerging Computational Landscape of Neural Networks</a>
-* HotChips'2018 ML tutorial: <a href="https://github.com/Xilinx/FINN/blob/master/docs/Hotchips2018_Tutorial.pdf" target="_blank">Overview of Deep Learning and Computer Architectures for Accelerating DNNs</a>
-  + <a href="https://youtu.be/ydsZ7A0FF0I" target="_blank">Video</a>
-* ASAP'2018 keynote: <a href="https://github.com/Xilinx/FINN/blob/master/docs/ASAP2018.pdf" target="_blank">Design Trade-offs for Machine Learning Solutions on Reconfigurable Devices</a>
-* ARC'2018 keynote: <a href="https://github.com/Xilinx/FINN/blob/master/docs/ARC2018.pdf" target="_blank">Scalable Machine Learning with Reconfigurable Devices</a>
-* FPGA'2018 tutorial: <a href="https://github.com/Xilinx/FINN/blob/master/docs/FPGA2018_tutorial.pdf" target="_blank">Training Quantized Neural Networks</a>
-* MPSoC 2017 talk: <a href="https://github.com/Xilinx/FINN/blob/master/docs/MPSOC2018.pdf" target="_blank">A Framework for Reduced Precision Neural Networks on FPGAs</a>
-* TCD 2017 guest lecture on ML: <a href="https://www.youtube.com/watch?v=pIVh-4tqjPc" target="_blank">Machine Learning for Embedded Systems (Video)</a>
-* QPYNQ'2017 tutorial: <a href="https://www.ntnu.edu/ie/eecs/qpynq" target="_blank">Quantized Neural Networks with Xilinx PYNQ</a>
diff --git a/finn-rtllib/memstream/hdl/axilite_if.v b/finn-rtllib/memstream/hdl/axilite_if.v
index 93b2227de1b51d4fca145e8b61e6ed6dc2ed3121..bdd4de288ed3a5de859cbb20c3157d7f21f8239c 100644
--- a/finn-rtllib/memstream/hdl/axilite_if.v
+++ b/finn-rtllib/memstream/hdl/axilite_if.v
@@ -127,7 +127,7 @@ always @(posedge aclk or negedge aresetn)
 always @(*) begin
     internal_waddr = awaddr >> $clog2(DATA_WIDTH/8);
     internal_wdata = wdata;
-    internal_wen = (state == STATE_IDLE) & awvalid & wvalid; 
+    internal_wen = (state == STATE_IDLE) & awvalid & wvalid;
 end
 
 always @(posedge aclk) begin
@@ -208,4 +208,3 @@ always @(posedge aclk or negedge aresetn)
     end
 
 endmodule
-
diff --git a/finn-rtllib/memstream/hdl/memstream_singleblock.v b/finn-rtllib/memstream/hdl/memstream_singleblock.v
index 54ee56764e187520997e03bdcb291b4183e6ecf0..6bb3a97115325d81d4292c5af3c33921c2680a30 100644
--- a/finn-rtllib/memstream/hdl/memstream_singleblock.v
+++ b/finn-rtllib/memstream/hdl/memstream_singleblock.v
@@ -98,7 +98,7 @@ wire strm1_incr_en;
 assign strm0_incr_en = m_axis_0_tready | ~m_axis_0_tvalid;
 assign strm1_incr_en = m_axis_1_tready | ~m_axis_1_tvalid;
 
-reg rack_shift[1:0]; 
+reg rack_shift[1:0];
 
 generate
 if(MEM_DEPTH > 1) begin: use_ram
diff --git a/finn-rtllib/memstream/sim/tb_memstream_writes.v b/finn-rtllib/memstream/sim/tb_memstream_writes.v
index 867acfe813280cc3c9a473fb2a7e6bc9d7c05b23..a6ac747e967e594ac010f25a2827ebf7a7fcaa0f 100644
--- a/finn-rtllib/memstream/sim/tb_memstream_writes.v
+++ b/finn-rtllib/memstream/sim/tb_memstream_writes.v
@@ -179,7 +179,6 @@ task axi_read;
                     data = data | (rdata<<(32*j));
                 end
             join
-            
             @(posedge clk);
         end
     end
@@ -270,7 +269,6 @@ memstream
     MEM_WIDTH,
     ".",
     "auto",
-    
     //widths per stream
     STRM0_WIDTH,
     STRM1_WIDTH,
@@ -278,7 +276,6 @@ memstream
     STRM3_WIDTH,
     STRM4_WIDTH,
     STRM5_WIDTH,
-    
     //depths per stream
     STRM0_DEPTH,
     STRM1_DEPTH,
@@ -286,7 +283,6 @@ memstream
     STRM3_DEPTH,
     STRM4_DEPTH,
     STRM5_DEPTH,
-    
     //offsets for each stream
     STRM0_OFFSET,
     STRM1_OFFSET,
@@ -332,32 +328,26 @@ dut
     m_axis_0_tready,
     m_axis_0_tvalid,
     m_axis_0_tdata,
-    
     m_axis_1_afull,
     m_axis_1_tready,
     m_axis_1_tvalid,
     m_axis_1_tdata,
-    
     m_axis_2_afull,
     m_axis_2_tready,
     m_axis_2_tvalid,
     m_axis_2_tdata,
-    
     m_axis_3_afull,
     m_axis_3_tready,
     m_axis_3_tvalid,
     m_axis_3_tdata,
-    
     m_axis_4_afull,
     m_axis_4_tready,
     m_axis_4_tvalid,
     m_axis_4_tdata,
-    
     m_axis_5_afull,
     m_axis_5_tready,
     m_axis_5_tvalid,
     m_axis_5_tdata
-    
 
 );
 
@@ -406,7 +396,6 @@ initial begin
 				end
 			end
 		end
-		
 		//check stream 2
 	    begin
 		    $display("Starting stream 2 checker");
diff --git a/finn-rtllib/memstream/sim/test.sh b/finn-rtllib/memstream/sim/test.sh
index 3348e64b715ccbba17a38ac3bdf2c2c4173c3956..7cb0497d261ac41a763bad8e58afabb204887d39 100755
--- a/finn-rtllib/memstream/sim/test.sh
+++ b/finn-rtllib/memstream/sim/test.sh
@@ -30,4 +30,3 @@
 
 iverilog ../hdl/*.v tb_memstream_writes.v -o sim
 ./sim
-
diff --git a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
index b8be5e0a2f5c960cc5cb47ff9b348efffad98762..87565bc5613ce783d6a8067e8323d2358adb8061 100644
--- a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
+++ b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
@@ -38,7 +38,6 @@ proc init_gui { IPINST } {
 
 proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_WIDTH } {
 	# Procedure called to update AXILITE_ADDR_WIDTH when any of the dependent parameters in the arguments change
-	
 	set AXILITE_ADDR_WIDTH ${PARAM_VALUE.AXILITE_ADDR_WIDTH}
 	set MEM_DEPTH ${PARAM_VALUE.MEM_DEPTH}
 	set MEM_WIDTH ${PARAM_VALUE.MEM_WIDTH}
@@ -393,4 +392,3 @@ proc update_MODELPARAM_VALUE.AXILITE_ADDR_WIDTH { MODELPARAM_VALUE.AXILITE_ADDR_
 	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
 	set_property value [get_property value ${PARAM_VALUE.AXILITE_ADDR_WIDTH}] ${MODELPARAM_VALUE.AXILITE_ADDR_WIDTH}
 }
-
diff --git a/notebooks/end2end_example/README.md b/notebooks/end2end_example/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9f9dded0e630c8688bed485e1349d2c79f3ec4c0
--- /dev/null
+++ b/notebooks/end2end_example/README.md
@@ -0,0 +1,13 @@
+This directory contains several Jupyter notebook examples on how to use FINN.
+These are intended as tutorials.
+
+* `cybersecurity` is a three-part tutorial that shows how to train a simple
+quantized MLP in Brevitas and deploy it with FINN. This is a good tutorial
+for getting started with FINN.
+
+* `bnn-pynq` contains several tutorials that show the steps of the FINN
+compiler flow for a simple MLP and a convolutional net, giving a better understanding of the FINN internals.
+
+In addition to these notebooks, you may want to check out the [finn-examples](https://github.com/Xilinx/finn-examples) repo, which contains
+several prebuilt bitfiles (including a MobileNet-v1) as well as the Python scripts to rebuild them with
+FINN.
diff --git a/notebooks/end2end_example/StreamingDataflowPartition_1.pdf b/notebooks/end2end_example/bnn-pynq/StreamingDataflowPartition_1.pdf
similarity index 100%
rename from notebooks/end2end_example/StreamingDataflowPartition_1.pdf
rename to notebooks/end2end_example/bnn-pynq/StreamingDataflowPartition_1.pdf
diff --git a/docs/img/cnv-mp-fc.png b/notebooks/end2end_example/bnn-pynq/cnv-mp-fc.png
similarity index 100%
rename from docs/img/cnv-mp-fc.png
rename to notebooks/end2end_example/bnn-pynq/cnv-mp-fc.png
diff --git a/notebooks/end2end_example/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
similarity index 100%
rename from notebooks/end2end_example/cnv_end2end_example.ipynb
rename to notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
diff --git a/notebooks/end2end_example/finn-design-flow-example.svg b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg
similarity index 100%
rename from notebooks/end2end_example/finn-design-flow-example.svg
rename to notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg
diff --git a/notebooks/end2end_example/finn-hw-arch.png b/notebooks/end2end_example/bnn-pynq/finn-hw-arch.png
similarity index 100%
rename from notebooks/end2end_example/finn-hw-arch.png
rename to notebooks/end2end_example/bnn-pynq/finn-hw-arch.png
diff --git a/notebooks/end2end_example/pynq_shell_project.png b/notebooks/end2end_example/bnn-pynq/pynq_shell_project.png
similarity index 100%
rename from notebooks/end2end_example/pynq_shell_project.png
rename to notebooks/end2end_example/bnn-pynq/pynq_shell_project.png
diff --git a/notebooks/end2end_example/stitched_ip.png b/notebooks/end2end_example/bnn-pynq/stitched_ip.png
similarity index 100%
rename from notebooks/end2end_example/stitched_ip.png
rename to notebooks/end2end_example/bnn-pynq/stitched_ip.png
diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
similarity index 100%
rename from notebooks/end2end_example/tfc_end2end_example.ipynb
rename to notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
diff --git a/notebooks/end2end_example/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
similarity index 100%
rename from notebooks/end2end_example/tfc_end2end_verification.ipynb
rename to notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
diff --git a/notebooks/end2end_example/top.pdf b/notebooks/end2end_example/bnn-pynq/top.pdf
similarity index 100%
rename from notebooks/end2end_example/top.pdf
rename to notebooks/end2end_example/bnn-pynq/top.pdf
diff --git a/notebooks/end2end_example/verification.png b/notebooks/end2end_example/bnn-pynq/verification.png
similarity index 100%
rename from notebooks/end2end_example/verification.png
rename to notebooks/end2end_example/bnn-pynq/verification.png
diff --git a/notebooks/end2end_example/cnv-mp-fc.png b/notebooks/end2end_example/cnv-mp-fc.png
deleted file mode 100644
index 1d1fb0045e764be2ecf6773b3701553a928fbd23..0000000000000000000000000000000000000000
Binary files a/notebooks/end2end_example/cnv-mp-fc.png and /dev/null differ
diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..91a776f84e9554579d97447c9ca0889da5c29e48
--- /dev/null
+++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
@@ -0,0 +1,773 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train a Quantized MLP on UNSW-NB15 with Brevitas"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this notebook, we will show how to create, train and export a quantized Multi Layer Perceptron (MLP) with quantized weights and activations with [Brevitas](https://github.com/Xilinx/brevitas).\n",
+    "Specifically, the task at hand will be to label network packets as normal or suspicious (e.g. originating from an attacker, virus, malware or otherwise) by training on a quantized variant of the UNSW-NB15 dataset. \n",
+    "\n",
+    "**You won't need a GPU to train the neural net.** This MLP will be small enough to train on a modern x86 CPU, so no GPU is required to follow this tutorial  Alternatively, we provide pre-trained parameters for the MLP if you want to skip the training entirely.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## A quick introduction to the task and the dataset\n",
+    "\n",
+    "*The task:* The goal of [*network intrusion detection*](https://ieeexplore.ieee.org/abstract/document/283931) is to identify, preferably in real time, unauthorized use, misuse, and abuse of computer systems by both system insiders and external penetrators. This may be achieved by a mix of techniques, and machine-learning (ML) based techniques are increasing in popularity. \n",
+    "\n",
+    "*The dataset:* Several datasets are available for use in ML-based methods for intrusion detection.\n",
+    "The [UNSW-NB15](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/) is one such dataset created by the Australian Centre for Cyber Security (ACCS) to provide a comprehensive network based data set which can reflect modern network traffic scenarios. You can find more details about the dataset on [its homepage](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/).\n",
+    "\n",
+    "*Performance considerations:* FPGAs are commonly used for implementing high-performance packet processing systems that still provide a degree of programmability. To avoid introducing bottlenecks on the network, the DNN implementation must be capable of detecting malicious ones at line rate, which can be millions of packets per second, and is expected to increase further as next-generation networking solutions provide increased\n",
+    "throughput. This is a good reason to consider FPGA acceleration for this particular use-case."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Outline\n",
+    "-------------\n",
+    "\n",
+    "* [Initial setup](#initial_setup)\n",
+    "* [Define the Quantized MLP model](#define_quantized_mlp)\n",
+    "* [Load the UNSW_NB15 dataset](#load_dataset) \n",
+    "* [Define Train and Test  Methods](#train_test)\n",
+    "* [(Option 1) Train the Model from Scratch](#train_scratch)\n",
+    "* [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n",
+    "* [Network Surgery Before Export](#network_surgery)\n",
+    "* [Export to FINN-ONNX](#export_finn_onnx)\n",
+    "* [View the Exported ONNX in Netron](#view_in_netron)\n",
+    "* [That's it!](#thats_it)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initial Setup <a id='initial_setup'></a>\n",
+    "\n",
+    "Let's start by making sure we have all the Python packages we'll need for this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: pandas in /workspace/.local/lib/python3.6/site-packages (1.1.5)\n",
+      "Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.6/site-packages (from pandas) (2019.1)\n",
+      "Requirement already satisfied: numpy>=1.15.4 in /opt/conda/lib/python3.6/site-packages (from pandas) (1.19.4)\n",
+      "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.6/site-packages (from pandas) (2.8.1)\n",
+      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n",
+      "Requirement already satisfied: scikit-learn in /workspace/.local/lib/python3.6/site-packages (0.23.2)\n",
+      "Requirement already satisfied: scipy>=0.19.1 in /opt/conda/lib/python3.6/site-packages (from scikit-learn) (1.5.2)\n",
+      "Requirement already satisfied: joblib>=0.11 in /workspace/.local/lib/python3.6/site-packages (from scikit-learn) (1.0.0)\n",
+      "Requirement already satisfied: numpy>=1.13.3 in /opt/conda/lib/python3.6/site-packages (from scikit-learn) (1.19.4)\n",
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /workspace/.local/lib/python3.6/site-packages (from scikit-learn) (2.1.0)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.6/site-packages (4.31.1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install --user pandas\n",
+    "!pip install --user scikit-learn\n",
+    "!pip install --user tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnx\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**This is important -- always import onnx before torch**. This is a workaround for a [known bug](https://github.com/onnx/onnx/issues/2394)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define the Quantized MLP Model <a id='define_quantized_mlp'></a>\n",
+    "\n",
+    "We'll now define an MLP model that will be trained to perform inference with quantized weights and activations.\n",
+    "For this, we'll use the quantization-aware training (QAT) capabilities offered by[Brevitas](https://github.com/Xilinx/brevitas).\n",
+    "\n",
+    "Our MLP will have four fully-connected (FC) layers in total: three hidden layers with 64 neurons, and a final output layer with a single output, all using 2-bit weights. We'll use 2-bit quantized ReLU activation functions, and apply batch normalization between each FC layer and its activation.\n",
+    "\n",
+    "In case you'd like to experiment with different quantization settings or topology parameters, we'll define all these topology settings as variables."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_size = 593      \n",
+    "hidden1 = 64      \n",
+    "hidden2 = 64\n",
+    "hidden3 = 64\n",
+    "weight_bit_width = 2\n",
+    "act_bit_width = 2\n",
+    "num_classes = 1    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can define our MLP using the layer primitives provided by Brevitas:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from brevitas.nn import QuantLinear, QuantReLU\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "model = nn.Sequential(\n",
+    "      QuantLinear(input_size, hidden1, bias=True, weight_bit_width=weight_bit_width),\n",
+    "      nn.BatchNorm1d(hidden1),\n",
+    "      nn.Dropout(0.5),\n",
+    "      QuantReLU(bit_width=act_bit_width),\n",
+    "      QuantLinear(hidden1, hidden2, bias=True, weight_bit_width=weight_bit_width),\n",
+    "      nn.BatchNorm1d(hidden2),\n",
+    "      nn.Dropout(0.5),\n",
+    "      QuantReLU(bit_width=act_bit_width),\n",
+    "      QuantLinear(hidden2, hidden3, bias=True, weight_bit_width=weight_bit_width),\n",
+    "      nn.BatchNorm1d(hidden3),\n",
+    "      nn.Dropout(0.5),\n",
+    "      QuantReLU(bit_width=act_bit_width),\n",
+    "      QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width)\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that the MLP's output is not yet quantized. Even though we want the final output of our MLP to be a binary (0/1) value indicating the classification, we've only defined a single-neuron FC layer as the output. While training the network we'll pass that output through a sigmoid function as part of the loss criterion, which [gives better numerical stability](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html). Later on, after we're done training the network, we'll add a quantization node at the end before we export it to FINN."
+   ]
+  },
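+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a minimal illustrative sketch (using only the `torch` and `nn` modules already imported above), the next cell compares `BCEWithLogitsLoss` on a few random logits against `BCELoss` applied to `sigmoid(logits)`: the two agree up to floating-point error, which is why we can keep the sigmoid out of the model itself during training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch: BCEWithLogitsLoss(logits, t) equals BCELoss(sigmoid(logits), t)\n",
+    "# up to floating-point error, checked here on a few random values.\n",
+    "logits = torch.randn(4, 1)\n",
+    "targets = torch.randint(0, 2, (4, 1)).float()\n",
+    "loss_with_logits = nn.BCEWithLogitsLoss()(logits, targets)\n",
+    "loss_manual = nn.BCELoss()(torch.sigmoid(logits), targets)\n",
+    "print(loss_with_logits.item(), loss_manual.item())"
+   ]
+  },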
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the UNSW_NB15 Dataset <a id='load_dataset'></a>\n",
+    "\n",
+    "### Dataset Quantization <a id='dataset_qnt'></a>\n",
+    "\n",
+    "The goal of this notebook is to train a Quantized Neural Network (QNN) to be later deployed as an FPGA accelerator generated by the FINN compiler. Although we can choose a variety of different precisions for the input, [Murovic and Trost](https://ev.fe.uni-lj.si/1-2-2019/Murovic.pdf) have previously shown we can actually binarize the inputs and still get good (90%+) accuracy.\n",
+    "Thus, we will create a binarized representation for the dataset by following the procedure defined by [Murovic and Trost](https://ev.fe.uni-lj.si/1-2-2019/Murovic.pdf), which we repeat briefly here:\n",
+    "\n",
+    "* Original features have different formats ranging from integers, floating numbers to strings.\n",
+    "* Integers, which for example represent a packet lifetime, are binarized with as many bits as to include the maximum value. \n",
+    "* Another case is with features formatted as strings (protocols), which are binarized by simply counting the number of all different strings for each feature and coding them in the appropriate number of bits.\n",
+    "* Floating-point numbers are reformatted into fixed-point representation.\n",
+    "* In the end, each sample is transformed into a 593-bit wide binary vector. \n",
+    "* All vectors are labeled as bad (0) or normal (1)\n",
+    "\n",
+    "Following their open-source implementation provided as a Matlab script [here](https://github.com/TadejMurovic/BNN_Deployment/blob/master/cybersecurity_dataset_unswb15.m), we've created a [Python version](dataloader_quantized.py).\n",
+    "This `UNSW_NB15_quantized` class implements a PyTorch `DataLoader`, which represents a Python iterable over a dataset. This is useful because enables access to data in batches."
+   ]
+  },
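+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the integer-to-bits step above a bit more concrete, here is a minimal NumPy sketch of binarizing a column of non-negative integers with as many bits as needed for the maximum value. It is purely illustrative and independent of the actual `UNSW_NB15_quantized` implementation used below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def int_to_bits(values, num_bits=None):\n",
+    "    # binarize non-negative integers into a bit matrix (MSB first),\n",
+    "    # using as many bits as needed to represent the maximum value\n",
+    "    values = np.asarray(values, dtype=np.int64)\n",
+    "    if num_bits is None:\n",
+    "        num_bits = max(int(values.max()).bit_length(), 1)\n",
+    "    shifts = np.arange(num_bits - 1, -1, -1)\n",
+    "    return (values[:, None] >> shifts) & 1\n",
+    "\n",
+    "print(int_to_bits([0, 3, 5, 12]))"
+   ]
+  },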
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Download the training and test set from the [official website](https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/) - uncomment the following lines to download:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#! wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_training-set.csv\n",
+    "#! wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_testing-set.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import DataLoader, Dataset\n",
+    "from dataloader_quantized import UNSW_NB15_quantized\n",
+    "\n",
+    "file_path_train = \"UNSW_NB15_training-set.csv\"\n",
+    "file_path_test = \"UNSW_NB15_testing-set.csv\"\n",
+    "\n",
+    "train_quantized_dataset = UNSW_NB15_quantized(file_path_train = file_path_train, \\\n",
+    "                                              file_path_test = file_path_test, \\\n",
+    "                                              train=True)\n",
+    "\n",
+    "test_quantized_dataset = UNSW_NB15_quantized(file_path_train = file_path_train, \\\n",
+    "                                              file_path_test = file_path_test, \\\n",
+    "                                              train=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 1000\n",
+    "\n",
+    "# dataset loaders\n",
+    "train_quantized_loader = DataLoader(train_quantized_dataset, batch_size=batch_size, shuffle=True)\n",
+    "test_quantized_loader = DataLoader(test_quantized_dataset, batch_size=batch_size, shuffle=True)    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define Train and Test  Methods  <a id='train_test'></a>\n",
+    "The train and test methods will use a `DataLoader`, which feeds the model with a new predefined batch of training data in each iteration, until the entire training data is fed to the model. Each repetition of this process is called an `epoch`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train(model, train_loader, optimizer, criterion):\n",
+    "    losses = []\n",
+    "    # ensure model is in training mode\n",
+    "    model.train()    \n",
+    "    \n",
+    "    for i, data in enumerate(train_loader, 0):        \n",
+    "        inputs, target = data\n",
+    "        optimizer.zero_grad()   \n",
+    "                \n",
+    "        # forward pass\n",
+    "        output = model(inputs.float())\n",
+    "        loss = criterion(output, target.unsqueeze(1))\n",
+    "        \n",
+    "        # backward pass + run optimizer to update weights\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "        \n",
+    "        # keep track of loss value\n",
+    "        losses.append(loss.data.numpy()) \n",
+    "           \n",
+    "    return losses"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "def test(model, test_loader):    \n",
+    "    # ensure model is in eval mode\n",
+    "    model.eval() \n",
+    "    y_true = []\n",
+    "    y_pred = []\n",
+    "   \n",
+    "    with torch.no_grad():\n",
+    "        for data in test_loader:\n",
+    "            inputs, target = data\n",
+    "            output_orig = model(inputs.float())\n",
+    "            # run the output through sigmoid\n",
+    "            output = torch.sigmoid(output_orig)  \n",
+    "            # compare against a threshold of 0.5 to generate 0/1\n",
+    "            pred = (output.detach().numpy() > 0.5) * 1\n",
+    "            target = target.float()\n",
+    "            y_true.extend(target.tolist()) \n",
+    "            y_pred.extend(pred.reshape(-1).tolist())\n",
+    "        \n",
+    "    return accuracy_score(y_true, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## (Option 1) Train the Model from Scratch <a id=\"train_scratch\"></a>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before we start training our MLP we need to define some hyperparameters. Moreover, in order to monitor the loss function evolution over epochs, we need to define a method for it. As mentioned earlier, we'll use a loss criterion which applies a sigmoid function during the training phase (`BCEWithLogitsLoss`). For the testing phase, we're manually computing the sigmoid and thresholding at 0.5."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_epochs = 5\n",
+    "lr = 0.001 \n",
+    "\n",
+    "def display_loss_plot(losses, title=\"Training loss\", xlabel=\"Iterations\", ylabel=\"Loss\"):\n",
+    "    x_axis = [i for i in range(len(losses))]\n",
+    "    plt.plot(x_axis,losses)\n",
+    "    plt.title(title)\n",
+    "    plt.xlabel(xlabel)\n",
+    "    plt.ylabel(ylabel)\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# loss criterion and optimizer\n",
+    "criterion = nn.BCEWithLogitsLoss()\n",
+    "optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from tqdm import tqdm, trange\n",
+    "\n",
+    "running_loss = []\n",
+    "running_test_acc = []\n",
+    "t = trange(num_epochs, desc=\"Training loss\", leave=True)\n",
+    "\n",
+    "for epoch in t:\n",
+    "        loss_epoch = train(model, train_quantized_loader, optimizer,criterion)\n",
+    "        test_acc = test(model, test_quantized_loader)\n",
+    "        t.set_description(\"Training loss = %f test accuracy = %f\" % (np.mean(loss_epoch), test_acc))\n",
+    "        t.refresh() # to show immediately the update           \n",
+    "        running_loss.append(loss_epoch)\n",
+    "        running_test_acc.append(test_acc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "loss_per_epoch = [np.mean(loss_per_epoch) for loss_per_epoch in running_loss]\n",
+    "display_loss_plot(loss_per_epoch)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test(model, test_quantized_loader)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## (Option 2) Load Pre-Trained Parameters <a id=\"load_pretrained\"></a>\n",
+    "\n",
+    "Instead of training from scratch, you can also use pre-trained parameters we provide here. These parameters should achieve ~91.9% test accuracy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "IncompatibleKeys(missing_keys=[], unexpected_keys=[])"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n",
+    "\n",
+    "model.load_state_dict(trained_state_dict, strict=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9188772287810328"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test(model, test_quantized_loader)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Network Surgery Before Export <a id=\"network_surgery\"></a>\n",
+    "\n",
+    "Sometimes, it's desirable to make some changes to our trained network prior to export (this is known in general as \"network surgery\"). This depends on the model and is not generally necessary, but in this case we want to make a couple of changes to get better results with FINN."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's start by padding the input. Our input vectors are 593-bit, which will make folding (parallelization) for the first layer a bit tricky since 593 is a prime number. So we'll pad the weight matrix of the first layer with seven 0-valued columns to work with an input size of 600 instead. When using the modified network we'll similarly provide inputs padded to 600 bits."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(64, 593)"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from copy import deepcopy\n",
+    "\n",
+    "modified_model = deepcopy(model)\n",
+    "\n",
+    "W_orig = modified_model[0].weight.data.detach().numpy()\n",
+    "W_orig.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(64, 600)"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# pad the second (593-sized) dimensions with 7 zeroes at the end\n",
+    "W_new = np.pad(W_orig, [(0,0), (0,7)])\n",
+    "W_new.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([64, 600])"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "modified_model[0].weight.data = torch.from_numpy(W_new)\n",
+    "modified_model[0].weight.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we'll modify the expected input/output ranges. In FINN, we prefer to work with bipolar {-1, +1} instead of binary {0, 1} values. To achieve this, we'll create a \"wrapper\" model that handles the pre/postprocessing as follows:\n",
+    "\n",
+    "* on the input side, we'll pre-process by (x + 1) / 2 in order to map incoming {-1, +1} inputs to {0, 1} ones which the trained network is used to. Since we're just multiplying/adding a scalar, these operations can be *streamlined* in FINN and implemented with no extra cost.\n",
+    "\n",
+    "* on the output side, we'll add a binary quantizer which maps everthing below 0 to -1 and everything above 0 to +1. This is essentially the same behavior as the sigmoid we used earlier, except the outputs are bipolar instead of binary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from brevitas.core.quant import QuantType\n",
+    "from brevitas.nn import QuantIdentity\n",
+    "\n",
+    "\n",
+    "class CybSecMLPForExport(nn.Module):\n",
+    "    def __init__(self, my_pretrained_model):\n",
+    "        super(CybSecMLPForExport, self).__init__()\n",
+    "        self.pretrained = my_pretrained_model\n",
+    "        self.qnt_output = QuantIdentity(quant_type=QuantType.BINARY, bit_width=1, min_val=-1.0, max_val=1.0)\n",
+    "    \n",
+    "    def forward(self, x):\n",
+    "        # assume x contains bipolar {-1,1} elems\n",
+    "        # shift from {-1,1} -> {0,1} since that is the\n",
+    "        # input range for the trained network\n",
+    "        x = (x + torch.tensor([1.0])) / 2.0  \n",
+    "        out_original = self.pretrained(x)\n",
+    "        out_final = self.qnt_output(out_original)   # output as {-1,1}     \n",
+    "        return out_final\n",
+    "\n",
+    "model_for_export = CybSecMLPForExport(modified_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def test_padded_bipolar(model, test_loader):    \n",
+    "    # ensure model is in eval mode\n",
+    "    model.eval() \n",
+    "    y_true = []\n",
+    "    y_pred = []\n",
+    "   \n",
+    "    with torch.no_grad():\n",
+    "        for data in test_loader:\n",
+    "            inputs, target = data\n",
+    "            # pad inputs to 600 elements\n",
+    "            input_padded = np.pad(inputs, [(0,0), (0,7)])\n",
+    "            # convert inputs to {-1,+1}\n",
+    "            input_scaled = 2*input_padded - 1\n",
+    "            # run the model\n",
+    "            output = model(torch.from_numpy(input_scaled).float())\n",
+    "            y_pred.extend(list(output.flatten()))\n",
+    "            # make targets bipolar {-1,+1}\n",
+    "            expected = 2*target.float() - 1\n",
+    "            expected = expected.detach().numpy()\n",
+    "            y_true.extend(list(expected.flatten()))\n",
+    "        \n",
+    "    return accuracy_score(y_true, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9188772287810328"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_padded_bipolar(model_for_export, test_quantized_loader)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Export to FINN-ONNX <a id=\"export_finn_onnx\" ></a>\n",
+    "\n",
+    "FINN expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model saved to cybsec-mlp.onnx\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:15: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+      "  from ipykernel import kernelapp as app\n"
+     ]
+    }
+   ],
+   "source": [
+    "import brevitas.onnx as bo\n",
+    "\n",
+    "export_onnx_path = \"cybsec-mlp.onnx\"\n",
+    "input_shape = (1, 600)\n",
+    "bo.export_finn_onnx(model_for_export, input_shape, export_onnx_path)\n",
+    "\n",
+    "print(\"Model saved to %s\" % export_onnx_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## View the Exported ONNX in Netron <a id=\"view_in_netron\" ></a>\n",
+    "\n",
+    "Let's examine the exported ONNX model with Netron. Particular things of note:\n",
+    "\n",
+    "* The input preprocessing (x + 1) / 2 is exported as part of the network (initial Add and Div layers)\n",
+    "* We've exported the padded version; shape of the first MatMul node's weight parameter is 600x64\n",
+    "* The weight parameters (second inputs) for MatMul nodes are annotated with `quantization: finn_datatype:INT2`\n",
+    "* The quantized activations are exported as `MultiThreshold` nodes with `domain=finn.custom_op.general`\n",
+    "* There's a final `MultiThreshold` node with threshold=0 to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Serving 'cybsec-mlp.onnx' at http://0.0.0.0:8081\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "        <iframe\n",
+       "            width=\"100%\"\n",
+       "            height=\"400\"\n",
+       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            frameborder=\"0\"\n",
+       "            allowfullscreen\n",
+       "        ></iframe>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<IPython.lib.display.IFrame at 0x7f4045ac19e8>"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from finn.util.visualization import showInNetron\n",
+    "\n",
+    "showInNetron(export_onnx_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## That's it! <a id=\"thats_it\" ></a>\n",
+    "You created, trained and tested a quantized MLP that is ready to be loaded into FINN, congratulations! You can now proceed to the next notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/end2end_example/cybersecurity/2-export-to-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-export-to-finn-and-verify.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f48cada0dd25f08f1659a778d04785bda27f443e
--- /dev/null
+++ b/notebooks/end2end_example/cybersecurity/2-export-to-finn-and-verify.ipynb
@@ -0,0 +1,483 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Verify Exported ONNX Model in FINN\n",
+    "\n",
+    "**Important: This notebook depends on the 1-train-mlp-with-brevitas notebook, because we are using the ONNX model that was exported there. So please make sure the needed .onnx file is generated before you run this notebook. Also remember to 'close and halt' any other FINN notebooks, since Netron visualizations use the same port.**\n",
+    "\n",
+    "In this notebook we will show how to import the network we trained in Brevitas and verify it in the FINN compiler. \n",
+    "This verification process can actually be done at various stages in the compiler [as explained in this notebook](../bnn-pynq/tfc_end2end_verification.ipynb) but for this example we'll only consider the first step: verifying the exported high-level FINN-ONNX model.\n",
+    "Once this model is sucessfully verified, we'll generate an FPGA accelerator from it in the next notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnx \n",
+    "import torch "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**This is important -- always import onnx before torch**. This is a workaround for a [known bug](https://github.com/onnx/onnx/issues/2394)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Outline\n",
+    "-------------\n",
+    "1. [Import model and visualize in Netron](#brevitas_import_visualization)\n",
+    "2. [Network preperations: Tidy up transformations](#network_preparations)\n",
+    "3. [Load the dataset and Brevitas model](#load_dataset) \n",
+    "4. [Compare FINN and Brevitas execution](#compare_brevitas)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1. Import model and visualize in Netron <a id=\"brevitas_import_visualization\"></a>\n",
+    "\n",
+    "Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#finn.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.core.modelwrapper import ModelWrapper\n",
+    "\n",
+    "model_file_path = \"cybsec-mlp.onnx\"\n",
+    "model_for_sim = ModelWrapper(model_file_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Serving 'cybsec-mlp.onnx' at http://0.0.0.0:8081\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "        <iframe\n",
+       "            width=\"100%\"\n",
+       "            height=\"400\"\n",
+       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            frameborder=\"0\"\n",
+       "            allowfullscreen\n",
+       "        ></iframe>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<IPython.lib.display.IFrame at 0x7fc1fc950748>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from finn.util.visualization import showInNetron\n",
+    "showInNetron(model_file_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2. Network preperation: Tidy up transformations <a id=\"network_preparations\"></a>\n",
+    "\n",
+    "Before running the verification, we need to prepare our FINN-ONNX model. In particular, all the intermediate tensors need to have statically defined shapes. To do this, we apply some transformations to the model like a kind of \"tidy-up\" to make it easier to process. You can read more about these transformations in [this notebook](../bnn-pynq/tfc_end2end_example.ipynb).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n",
+    "from finn.transformation.infer_shapes import InferShapes\n",
+    "from finn.transformation.infer_datatypes import InferDataTypes\n",
+    "from finn.transformation.fold_constants import FoldConstants\n",
+    "\n",
+    "model_for_sim = model_for_sim.transform(InferShapes())\n",
+    "model_for_sim = model_for_sim.transform(FoldConstants())\n",
+    "model_for_sim = model_for_sim.transform(GiveUniqueNodeNames())\n",
+    "model_for_sim = model_for_sim.transform(GiveReadableTensorNames())\n",
+    "model_for_sim = model_for_sim.transform(InferDataTypes())\n",
+    "model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There's one more thing we'll do: we will mark the input tensor datatype as bipolar, which will be used by the compiler later on. \n",
+    "\n",
+    "*In the near future it will be possible to add this information to the model while exporting, instead of having to add it manually.*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input tensor name: global_in\n",
+      "Output tensor name: global_out\n",
+      "Input tensor shape: [1, 600]\n",
+      "Input tensor datatype: DataType.BIPOLAR\n"
+     ]
+    }
+   ],
+   "source": [
+    "from finn.core.datatype import DataType\n",
+    "\n",
+    "finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n",
+    "finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n",
+    "print(\"Input tensor name: %s\" % finnonnx_in_tensor_name)\n",
+    "print(\"Output tensor name: %s\" % finnonnx_out_tensor_name)\n",
+    "finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n",
+    "print(\"Input tensor shape: %s\" % str(finnonnx_model_in_shape))\n",
+    "model_for_sim.set_tensor_datatype(finnonnx_in_tensor_name, DataType.BIPOLAR)\n",
+    "print(\"Input tensor datatype: %s\" % str(model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)))\n",
+    "\n",
+    "verified_model_filename = \"cybsec-mlp-verified.onnx\"\n",
+    "model_for_sim.save(verified_model_filename)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's view our ready-to-go model. Some changes to note:\n",
+    "\n",
+    "* all intermediate tensors now have their shapes specified (indicated by numbers next to the arrows going between layers)\n",
+    "* the datatype on the input tensor is set to DataType.BIPOLAR (click on the `global_in` node to view properties)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Stopping http://0.0.0.0:8081\n",
+      "Serving 'cybsec-mlp-verified.onnx' at http://0.0.0.0:8081\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "        <iframe\n",
+       "            width=\"100%\"\n",
+       "            height=\"400\"\n",
+       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            frameborder=\"0\"\n",
+       "            allowfullscreen\n",
+       "        ></iframe>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<IPython.lib.display.IFrame at 0x7fc280154278>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "showInNetron(verified_model_filename)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Load the Dataset and the Brevitas Model <a id=\"load_dataset\"></a>\n",
+    "\n",
+    "We'll use some example data from the quantized UNSW-NB15 dataset (from the previous notebook) to use as inputs for the verification. \n",
+    "\n",
+    "Recall that the quantized values from the dataset are 593-bit binary {0, 1} vectors whereas our exported model takes 600-bit bipolar {-1, +1} vectors, so we'll have to preprocess it a bit before we can use it for verifying the ONNX model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([100, 593])"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from torch.utils.data import DataLoader, Dataset\n",
+    "from dataloader_quantized import UNSW_NB15_quantized\n",
+    "\n",
+    "test_quantized_dataset = UNSW_NB15_quantized(file_path_train='UNSW_NB15_training-set.csv', \\\n",
+    "                                              file_path_test = \"UNSW_NB15_testing-set.csv\", \\\n",
+    "                                              train=False)\n",
+    "\n",
+    "n_verification_inputs = 100\n",
+    "# last column is the label, exclude it\n",
+    "input_tensor = test_quantized_dataset.data[:n_verification_inputs,:-1]\n",
+    "input_tensor.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's also bring up the MLP we trained in Brevitas from the previous notebook. We'll compare its outputs to what is generated by FINN."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "IncompatibleKeys(missing_keys=[], unexpected_keys=[])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "input_size = 593      \n",
+    "hidden1 = 64      \n",
+    "hidden2 = 64\n",
+    "hidden3 = 64\n",
+    "weight_bit_width = 2\n",
+    "act_bit_width = 2\n",
+    "num_classes = 1\n",
+    "\n",
+    "from brevitas.nn import QuantLinear, QuantReLU\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "brevitas_model = nn.Sequential(\n",
+    "      QuantLinear(input_size, hidden1, bias=True, weight_bit_width=weight_bit_width),\n",
+    "      nn.BatchNorm1d(hidden1),\n",
+    "      nn.Dropout(0.5),\n",
+    "      QuantReLU(bit_width=act_bit_width),\n",
+    "      QuantLinear(hidden1, hidden2, bias=True, weight_bit_width=weight_bit_width),\n",
+    "      nn.BatchNorm1d(hidden2),\n",
+    "      nn.Dropout(0.5),\n",
+    "      QuantReLU(bit_width=act_bit_width),\n",
+    "      QuantLinear(hidden2, hidden3, bias=True, weight_bit_width=weight_bit_width),\n",
+    "      nn.BatchNorm1d(hidden3),\n",
+    "      nn.Dropout(0.5),\n",
+    "      QuantReLU(bit_width=act_bit_width),\n",
+    "      QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width)\n",
+    ")\n",
+    "\n",
+    "# replace this with your trained network checkpoint if you're not\n",
+    "# using the pretrained weights\n",
+    "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n",
+    "brevitas_model.load_state_dict(trained_state_dict, strict=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def inference_with_brevitas(current_inp):\n",
+    "    brevitas_output = brevitas_model.forward(current_inp)\n",
+    "    # apply sigmoid + threshold\n",
+    "    brevitas_output = torch.sigmoid(brevitas_output)\n",
+    "    brevitas_output = (brevitas_output.detach().numpy() > 0.5) * 1\n",
+    "    # convert output to bipolar\n",
+    "    brevitas_output = 2*brevitas_output - 1\n",
+    "    return brevitas_output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 4. Compare FINN & Brevitas execution <a id=\"compare_brevitas\"></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's make helper functions to execute the same input with Brevitas and FINN. For FINN, we'll use the [`finn.core.onnx_exec`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#finn.core.onnx_exec.execute_onnx) function to execute the exported FINN-ONNX on the inputs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def inference_with_finn_onnx(current_inp):\n",
+    "    # convert input to numpy for FINN\n",
+    "    current_inp = current_inp.detach().numpy()\n",
+    "    # add padding and re-scale to bipolar\n",
+    "    current_inp = np.pad(current_inp, [(0, 0), (0, 7)])\n",
+    "    current_inp = 2*current_inp-1\n",
+    "    # reshape to expected input (add 1 for batch dimension)\n",
+    "    current_inp = current_inp.reshape(finnonnx_model_in_shape)\n",
+    "    # create the input dictionary\n",
+    "    input_dict = {finnonnx_in_tensor_name : current_inp} \n",
+    "    # run with FINN's execute_onnx\n",
+    "    output_dict = oxe.execute_onnx(model_for_sim, input_dict)\n",
+    "    #get the output tensor\n",
+    "    finn_output = output_dict[finnonnx_out_tensor_name] \n",
+    "    return finn_output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can call our inference helper functions for each input and compare the outputs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "ok 100 nok 0: 100%|██████████| 100/100 [00:48<00:00,  2.09it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import finn.core.onnx_exec as oxe\n",
+    "import numpy as np\n",
+    "from tqdm import trange\n",
+    "\n",
+    "verify_range = trange(n_verification_inputs, desc=\"FINN execution\", position=0, leave=True)\n",
+    "brevitas_model.eval()\n",
+    "\n",
+    "ok = 0\n",
+    "nok = 0\n",
+    "\n",
+    "for i in verify_range:\n",
+    "    # run in Brevitas with PyTorch tensor\n",
+    "    current_inp = input_tensor[i].reshape((1, 593))\n",
+    "    brevitas_output = inference_with_brevitas(current_inp)\n",
+    "    finn_output = inference_with_finn_onnx(current_inp)\n",
+    "    # compare the outputs\n",
+    "    ok += 1 if finn_output == brevitas_output else 0\n",
+    "    nok += 1 if finn_output != brevitas_output else 0\n",
+    "    verify_range.set_description(\"ok %d nok %d\" % (ok, nok))\n",
+    "    verify_range.refresh() # to show immediately the update"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\n"
+     ]
+    }
+   ],
+   "source": [
+    "if ok == n_verification_inputs:\n",
+    "    print(\"Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\")\n",
+    "else:\n",
+    "    print(\"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This concludes our second notebook. In the next one, we'll take the ONNX model we just verified all the way down to FPGA hardware with the FINN compiler."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..1ee1cefbe17d96ffd7a2e6384e037e1d9fbdd989
--- /dev/null
+++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
@@ -0,0 +1,804 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Building the Streaming Dataflow Accelerator\n",
+    "\n",
+    "**Important: This notebook depends on the 2-cybersecurity-finn-verification notebook because we are using models that were created by these notebooks. So please make sure the needed .onnx files are generated prior to running this notebook.**\n",
+    "\n",
+    "<img align=\"left\" src=\"finn-example.png\" alt=\"drawing\" style=\"margin-right: 20px\" width=\"250\"/>\n",
+    "\n",
+    "In this notebook, we'll use the FINN compiler generate an FPGA accelerator with a streaming dataflow architecture from our quantized MLP for the cybersecurity task. The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vivado HLS description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n",
+    "\n",
+    "These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Outline\n",
+    "-------------\n",
+    "\n",
+    "1. [Introduction to  `build_dataflow` Tool](#intro_build_dataflow) \n",
+    "2. [Understanding the Build Configuration: `DataflowBuildConfig`](#underst_build_conf)     \n",
+    "    2.1.[Output Products](#output_prod)   \n",
+    "    2.2.[Configuring the Board and FPGA Part](#config_fpga)   \n",
+    "    2.3 [Configuring the Performance](#config_perf)    \n",
+    "4. [Launch a Build: Only Estimate Reports](#build_estimate_report)\n",
+    "5. [Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance](#build_ip_synth_rtlsim)\n",
+    "6. [Launch a Build: PYNQ Bitfile and Driver](#build_bitfile_driver)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction to  `build_dataflow` Tool <a id=\"intro_build_dataflow\"></a>\n",
+    "\n",
+    "Since version 0.5b, the FINN compiler has a `build_dataflow` tool. Compared to previous versions which required setting up all the needed transformations in a Python script, it makes experimenting with dataflow architecture generation easier. The core idea is to specify the relevant build info as a configuration `dict`, which invokes all the necessary steps to make the dataflow build happen. It can be invoked either from the [command line](https://finn-dev.readthedocs.io/en/latest/command_line.html) or with a single Python function call\n",
+    "\n",
+    "\n",
+    "In this notebook, we'll use the Python function call to invoke the builds to stay inside the Jupyter notebook, but feel free to experiment with reproducing what we do here with the `./run-docker.sh build_dataflow` and `./run-docker.sh build_custom` command-line entry points too, as documented [here]((https://finn-dev.readthedocs.io/en/latest/command_line.html))."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding the Build Configuration: `DataflowBuildConfig` <a id=\"underst_build_conf\"></a>\n",
+    "\n",
+    "The build configuration is specified by an instance of `finn.builder.build_dataflow_config.DataflowBuildConfig`. The configuration is a Python [`dataclass`](https://docs.python.org/3/library/dataclasses.html) which can be serialized into or de-serialized from JSON files for persistence, although we'll just set it up in Python here.\n",
+    "There are many options in the configuration to customize different aspects of the build, we'll only cover a few of them in this notebook. You can read the details on all the config options on [the FINN API documentation](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowBuildConfig).\n",
+    "\n",
+    "Let's go over some of the members of the `DataflowBuildConfig`:\n",
+    "\n",
+    "### Output Products <a id=\"output_prod\"></a>\n",
+    "\n",
+    "The build can produce many different outputs, and some of them can take a long time (e.g. bitfile synthesis for a large network). When you first start working on generating a new accelerator and exploring the different performance options, you may not want to go all the way to a bitfile. Thus, in the beginning you may just select the estimate reports as the output products. Gradually, you can generate the output products from later stages until you are happy enough with the design to build the full accelerator integrated into a shell.\n",
+    "\n",
+    "The output products are controlled by:\n",
+    "\n",
+    "* `generate_outputs`: list of output products (of type [`finn.builder.build_dataflow_config.DataflowOutputType`](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowOutputType)) that will be generated by the build. Some available options are:\n",
+    "    - `ESTIMATE_REPORTS` : report expected resources and performance per layer and for the whole network without any synthesis\n",
+    "    - `STITCHED_IP` : create a stream-in stream-out IP design that can be integrated into other Vivado IPI or RTL designs\n",
+    "    - `RTLSIM_PERFORMANCE` : use PyVerilator to do a performance/latency test of the `STITCHED_IP` design\n",
+    "    - `OOC_SYNTH` : run out-of-context synthesis (just the accelerator itself, without any system surrounding it) on the `STITCHED_IP` design to get post-synthesis FPGA resources and achievable clock frequency\n",
+    "    - `BITFILE` : integrate the accelerator into a shell to produce a standalone bitfile\n",
+    "    - `PYNQ_DRIVER` : generate a PYNQ Python driver that can be used to launch the accelerator\n",
+    "    - `DEPLOYMENT_PACKAGE` : create a folder with the `BITFILE` and `PYNQ_DRIVER` outputs, ready to be copied to the target FPGA platform.\n",
+    "* `output_dir`: the directory where the all the generated build outputs above will be written into.\n",
+    "* `steps`: list of predefined (or custom) build steps FINN will go through. Use `build_dataflow_config.estimate_only_dataflow_steps` to execute only the steps needed for estimation (without any synthesis), and the `build_dataflow_config.default_build_dataflow_steps` otherwise (which is the default value).\n",
+    "\n",
+    "### Configuring the Board and FPGA Part <a id=\"config_fpga\"></a>\n",
+    "\n",
+    "* `fpga_part`: Xilinx FPGA part to be used for synthesis, can be left unspecified to be inferred from `board` below, or specified explicitly for e.g. out-of-context synthesis.\n",
+    "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/basic.py#L41) for a list of possible boards.\n",
+    "* `shell_flow_type`: the target [shell flow type](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.ShellFlowType), only needed for generating full bitfiles where the FINN design is integrated into a shell (so only needed if `BITFILE` is selected) \n",
+    "\n",
+    "### Configuring the Performance <a id=\"config_perf\"></a>\n",
+    "\n",
+    "You can configure the performance (and correspondingly, the FPGA resource footprint) of the generated in two ways:\n",
+    "\n",
+    "1) (basic) Set a target performance and let the compiler figure out the per-node parallelization settings.\n",
+    "\n",
+    "2) (advanced) Specify a separate .json as `folding_config_file` that lists the degree of parallelization (as well as other hardware options) for each layer.\n",
+    "\n",
+    "This notebook only deals with the basic approach, for which you need to set up:\n",
+    "\n",
+    "* `target_fps`: target inference performance in frames per second. Note that target may not be achievable due to specific layer constraints, or due to resource limitations of the FPGA.\n",
+    "* `synth_clk_period_ns`: target clock frequency (in nanoseconds) for Vivado synthesis. e.g. `synth_clk_period_ns=5.0` will target a 200 MHz clock. Note that the target clock period may not be achievable depending on the FPGA part and design complexity."
+   ]
+  },
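+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check on these two performance knobs, the small calculation below (an illustrative sketch using plain arithmetic only, not part of the FINN API) relates `target_fps` and `synth_clk_period_ns` to the per-layer cycle budget the compiler will try to meet."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# hedged sanity check: how many clock cycles may the slowest layer take per\n",
+    "# frame if we want to reach target_fps at the chosen clock period?\n",
+    "target_fps = 1000000\n",
+    "synth_clk_period_ns = 10.0\n",
+    "\n",
+    "fclk_hz = 1e9 / synth_clk_period_ns   # 10 ns period -> 100 MHz\n",
+    "cycle_budget = fclk_hz / target_fps   # cycles available per frame\n",
+    "print(\"Clock frequency: %.1f MHz\" % (fclk_hz / 1e6))\n",
+    "print(\"Cycle budget per frame: %.0f cycles\" % cycle_budget)"
+   ]
+  },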
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch a Build: Only Estimate Reports <a id=\"build_estimate_report\"></a>\n",
+    "\n",
+    "First, we'll launch a build that only generates the estimate reports, which does not require any synthesis. Note two things below: how the `generate_outputs` only contains `ESTIMATE_REPORTS`, but also how the `steps` uses a value of `estimate_only_dataflow_steps`. This skips steps like HLS synthesis to provide a quick estimate from analytical models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Building dataflow accelerator from cybsec-mlp-verified.onnx\n",
+      "Intermediate outputs will be generated in /tmp/finn_dev_osboxes\n",
+      "Final outputs will be generated in output_estimates_only\n",
+      "Build log is at output_estimates_only/build_dataflow.log\n",
+      "Running step: step_tidy_up [1/7]\n",
+      "Running step: step_streamline [2/7]\n",
+      "Running step: step_convert_to_hls [3/7]\n",
+      "Running step: step_create_dataflow_partition [4/7]\n",
+      "Running step: step_target_fps_parallelization [5/7]\n",
+      "Running step: step_apply_folding_config [6/7]\n",
+      "Running step: step_generate_estimate_reports [7/7]\n",
+      "Completed successfully\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import finn.builder.build_dataflow as build\n",
+    "import finn.builder.build_dataflow_config as build_cfg\n",
+    "\n",
+    "model_file = \"cybsec-mlp-verified.onnx\"\n",
+    "\n",
+    "estimates_output_dir = \"output_estimates_only\"\n",
+    "\n",
+    "cfg = build.DataflowBuildConfig(\n",
+    "    output_dir          = estimates_output_dir,\n",
+    "    target_fps          = 1000000,\n",
+    "    synth_clk_period_ns = 10.0,\n",
+    "    fpga_part           = \"xc7z020clg400-1\",\n",
+    "    steps               = build_cfg.estimate_only_dataflow_steps,\n",
+    "    generate_outputs=[\n",
+    "        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "build.build_dataflow_cfg(model_file, cfg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll now examine the generated outputs from this build. If we look under the outputs directory, we'll find a subfolder with the generated estimate reports."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "build_dataflow.log  intermediate_models  report  time_per_step.json\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {estimates_output_dir}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "estimate_layer_config_alternatives.json  estimate_network_performance.json\r\n",
+      "estimate_layer_cycles.json\t\t op_and_param_counts.json\r\n",
+      "estimate_layer_resources.json\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {estimates_output_dir}/report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We see that various reports have been generated as .json files. Let's examine the contents of the `estimate_network_performance.json` for starters. Here, we can see the analytical estimates for the performance and latency."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "  \"critical_path_cycles\": 272,\r\n",
+      "  \"max_cycles\": 80,\r\n",
+      "  \"max_cycles_node_name\": \"StreamingFCLayer_Batch_0\",\r\n",
+      "  \"estimated_throughput_fps\": 1250000.0,\r\n",
+      "  \"estimated_latency_ns\": 2720.0\r\n",
+      "}"
+     ]
+    }
+   ],
+   "source": [
+    "! cat {estimates_output_dir}/report/estimate_network_performance.json"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Since all of these reports are .json files, we can easily load them into Python for further processing. Let's define a helper function and look at the `estimate_layer_cycles.json` report."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "def read_json_dict(filename):\n",
+    "    with open(filename, \"r\") as f:\n",
+    "        ret = json.load(f)\n",
+    "    return ret"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'StreamingFCLayer_Batch_0': 80,\n",
+       " 'StreamingFCLayer_Batch_1': 64,\n",
+       " 'StreamingFCLayer_Batch_2': 64,\n",
+       " 'StreamingFCLayer_Batch_3': 64}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "read_json_dict(estimates_output_dir + \"/report/estimate_layer_cycles.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we can see the estimated number of clock cycles each layer will take. Recall that all of these layers will be running in parallel, and the slowest layer will determine the overall throughput of the entire neural network. FINN attempts to parallelize each layer such that they all take a similar number of cycles, and less than the corresponding number of cycles that would be required to meet `target_fps`.\n",
+    "\n",
+    "Finally, we can see the layer-by-layer resource estimates in the `estimate_layer_resources.json` report:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'StreamingFCLayer_Batch_0': {'BRAM_18K': 27,\n",
+       "  'BRAM_efficiency': 0.15432098765432098,\n",
+       "  'LUT': 8149,\n",
+       "  'URAM': 0,\n",
+       "  'URAM_efficiency': 1,\n",
+       "  'DSP': 0},\n",
+       " 'StreamingFCLayer_Batch_1': {'BRAM_18K': 4,\n",
+       "  'BRAM_efficiency': 0.1111111111111111,\n",
+       "  'LUT': 1435,\n",
+       "  'URAM': 0,\n",
+       "  'URAM_efficiency': 1,\n",
+       "  'DSP': 0},\n",
+       " 'StreamingFCLayer_Batch_2': {'BRAM_18K': 4,\n",
+       "  'BRAM_efficiency': 0.1111111111111111,\n",
+       "  'LUT': 1435,\n",
+       "  'URAM': 0,\n",
+       "  'URAM_efficiency': 1,\n",
+       "  'DSP': 0},\n",
+       " 'StreamingFCLayer_Batch_3': {'BRAM_18K': 1,\n",
+       "  'BRAM_efficiency': 0.006944444444444444,\n",
+       "  'LUT': 341,\n",
+       "  'URAM': 0,\n",
+       "  'URAM_efficiency': 1,\n",
+       "  'DSP': 0},\n",
+       " 'total': {'BRAM_18K': 36.0, 'LUT': 11360.0, 'URAM': 0.0, 'DSP': 0.0}}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "read_json_dict(estimates_output_dir + \"/report/estimate_layer_resources.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This particular report is useful to determine whether the current configuration will fit into a particular FPGA. If you see that the resource requirements are too high for the FPGA you had in mind, you should consider lowering the `target_fps`.\n",
+    "\n",
+    "*Note that the analytical models tend to over-estimate how much resources are needed, since they can't capture the effects of various synthesis optimizations.*"
+   ]
+  },
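+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an illustration of such a check, the hypothetical snippet below compares the `total` entry of the resource estimate report against rough, ballpark resource counts for the Zynq-7020 device found on the Pynq-Z1 board (the device numbers are approximate and only meant for this sanity check). It re-uses the `read_json_dict` helper defined above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# rough utilization check against the Zynq-7020 (Pynq-Z1), illustrative only\n",
+    "# approximate device resources: ~53200 LUTs, ~280 BRAM_18K, ~220 DSPs\n",
+    "zynq_7020_resources = {\"LUT\": 53200, \"BRAM_18K\": 280, \"DSP\": 220}\n",
+    "\n",
+    "est = read_json_dict(estimates_output_dir + \"/report/estimate_layer_resources.json\")\n",
+    "total = est[\"total\"]\n",
+    "for res, avail in zynq_7020_resources.items():\n",
+    "    used = total.get(res, 0)\n",
+    "    print(\"%-8s: %8.0f of %6d (%.1f%%)\" % (res, used, avail, 100.0 * used / avail))"
+   ]
+  },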
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance <a id=\"build_ip_synth_rtlsim\"></a>\n",
+    "\n",
+    "Once we have a configuration that gives satisfactory estimates, we can move on to generating the accelerator. We can do this in different ways depending on how we want to integrate the accelerator into a larger system. For instance, if we have a larger streaming system built in Vivado or if we'd like to re-use this generated accelerator as an IP component in other projects, the `STITCHED_IP` output product is a good choice. We can also use the `OOC_SYNTH` output product to get post-synthesis resource and clock frequency numbers for our accelerator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Building dataflow accelerator from cybsec-mlp-verified.onnx\n",
+      "Intermediate outputs will be generated in /tmp/finn_dev_osboxes\n",
+      "Final outputs will be generated in output_ipstitch_ooc_rtlsim\n",
+      "Build log is at output_ipstitch_ooc_rtlsim/build_dataflow.log\n",
+      "Running step: step_tidy_up [1/15]\n",
+      "Running step: step_streamline [2/15]\n",
+      "Running step: step_convert_to_hls [3/15]\n",
+      "Running step: step_create_dataflow_partition [4/15]\n",
+      "Running step: step_target_fps_parallelization [5/15]\n",
+      "Running step: step_apply_folding_config [6/15]\n",
+      "Running step: step_generate_estimate_reports [7/15]\n",
+      "Running step: step_hls_ipgen [8/15]\n",
+      "Running step: step_set_fifo_depths [9/15]\n",
+      "Running step: step_create_stitched_ip [10/15]\n",
+      "Running step: step_measure_rtlsim_performance [11/15]\n",
+      "Running step: step_make_pynq_driver [12/15]\n",
+      "Running step: step_out_of_context_synthesis [13/15]\n",
+      "Running step: step_synthesize_bitfile [14/15]\n",
+      "Running step: step_deployment_package [15/15]\n",
+      "Completed successfully\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import finn.builder.build_dataflow as build\n",
+    "import finn.builder.build_dataflow_config as build_cfg\n",
+    "\n",
+    "model_file = \"cybsec-mlp-verified.onnx\"\n",
+    "\n",
+    "rtlsim_output_dir = \"output_ipstitch_ooc_rtlsim\"\n",
+    "\n",
+    "cfg = build.DataflowBuildConfig(\n",
+    "    output_dir          = rtlsim_output_dir,\n",
+    "    target_fps          = 1000000,\n",
+    "    synth_clk_period_ns = 10.0,\n",
+    "    fpga_part           = \"xc7z020clg400-1\",\n",
+    "    generate_outputs=[\n",
+    "        build_cfg.DataflowOutputType.STITCHED_IP,\n",
+    "        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,\n",
+    "        build_cfg.DataflowOutputType.OOC_SYNTH,\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "build.build_dataflow_cfg(model_file, cfg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Among the output products, we will find the accelerator exported as IP:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "all_verilog_srcs.txt\t\t       finn_vivado_stitch_proj.xpr\r\n",
+      "finn_vivado_stitch_proj.cache\t       ip\r\n",
+      "finn_vivado_stitch_proj.hbs\t       make_project.sh\r\n",
+      "finn_vivado_stitch_proj.hw\t       make_project.tcl\r\n",
+      "finn_vivado_stitch_proj.ip_user_files  vivado.jou\r\n",
+      "finn_vivado_stitch_proj.srcs\t       vivado.log\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {rtlsim_output_dir}/stitched_ip"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We also have a few reports generated by these output products, different from the ones generated by `ESTIMATE_REPORTS`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "estimate_layer_resources_hls.json  rtlsim_performance.json\r\n",
+      "ooc_synth_and_timing.json\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {rtlsim_output_dir}/report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In `ooc_synth_and_timing.json` we can find the post-synthesis and maximum clock frequency estimate for the accelerator. Note that the clock frequency estimate here tends to be optimistic, since out-of-context synthesis is less constrained."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "  \"vivado_proj_folder\": \"/tmp/finn_dev_osboxes/synth_out_of_context_wy3b6qf4/results_finn_design_wrapper\",\r\n",
+      "  \"LUT\": 7073.0,\r\n",
+      "  \"FF\": 7534.0,\r\n",
+      "  \"DSP\": 0.0,\r\n",
+      "  \"BRAM\": 18.0,\r\n",
+      "  \"WNS\": 0.632,\r\n",
+      "  \"\": 0,\r\n",
+      "  \"fmax_mhz\": 106.7463706233988,\r\n",
+      "  \"estimated_throughput_fps\": 1334329.6327924852\r\n",
+      "}"
+     ]
+    }
+   ],
+   "source": [
+    "! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In `rtlsim_performance.json` we can find the steady-state throughput and latency for the accelerator, as obtained by rtlsim. If the DRAM bandwidth numbers reported here are below what the hardware platform is capable of (i.e. the accelerator is not memory-bound), you can expect the same steady-state throughput in real hardware."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "  \"cycles\": 838,\r\n",
+      "  \"runtime[ms]\": 0.00838,\r\n",
+      "  \"throughput[images/s]\": 954653.9379474939,\r\n",
+      "  \"DRAM_in_bandwidth[Mb/s]\": 71.59904534606204,\r\n",
+      "  \"DRAM_out_bandwidth[Mb/s]\": 0.11933174224343673,\r\n",
+      "  \"fclk[mhz]\": 100.0,\r\n",
+      "  \"N\": 8,\r\n",
+      "  \"latency_cycles\": 229\r\n",
+      "}"
+     ]
+    }
+   ],
+   "source": [
+    "! cat {rtlsim_output_dir}/report/rtlsim_performance.json"
+   ]
+  },
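+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the relationship between these numbers explicit, the short sketch below (plain arithmetic only) recomputes the throughput and latency from the raw cycle counts in the report, using the clock frequency reported as `fclk[mhz]`. It re-uses the `read_json_dict` helper defined earlier."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# relate the rtlsim cycle counts to throughput and latency (illustrative)\n",
+    "rtlsim_perf = read_json_dict(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n",
+    "\n",
+    "fclk_hz = rtlsim_perf[\"fclk[mhz]\"] * 1e6\n",
+    "runtime_s = rtlsim_perf[\"cycles\"] / fclk_hz        # time for the N-frame batch\n",
+    "throughput_fps = rtlsim_perf[\"N\"] / runtime_s      # frames per second\n",
+    "latency_s = rtlsim_perf[\"latency_cycles\"] / fclk_hz\n",
+    "print(\"Throughput: %.0f frames/s\" % throughput_fps)\n",
+    "print(\"Latency: %.2f us\" % (latency_s * 1e6))"
+   ]
+  },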
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, let's have a look at `final_hw_config.json`. This is the node-by-node hardware configuration determined by the FINN compiler, including FIFO depths, parallelization settings (PE/SIMD) and others. If you want to optimize your build further (the \"advanced\" method we mentioned under \"Configuring the performance\"), you can use this .json file as the `folding_config_file` for a new run to use it as a starting point for further exploration and optimizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "  \"Defaults\": {},\r\n",
+      "  \"StreamingFIFO_0\": {\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"depth\": 32,\r\n",
+      "    \"impl_style\": \"rtl\"\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_0\": {\r\n",
+      "    \"PE\": 32,\r\n",
+      "    \"SIMD\": 15,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  },\r\n",
+      "  \"StreamingDataWidthConverter_Batch_0\": {\r\n",
+      "    \"impl_style\": \"hls\"\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_1\": {\r\n",
+      "    \"PE\": 4,\r\n",
+      "    \"SIMD\": 16,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  },\r\n",
+      "  \"StreamingDataWidthConverter_Batch_1\": {\r\n",
+      "    \"impl_style\": \"hls\"\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_2\": {\r\n",
+      "    \"PE\": 4,\r\n",
+      "    \"SIMD\": 16,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  },\r\n",
+      "  \"StreamingDataWidthConverter_Batch_2\": {\r\n",
+      "    \"impl_style\": \"hls\"\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_3\": {\r\n",
+      "    \"PE\": 1,\r\n",
+      "    \"SIMD\": 1,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  }\r\n",
+      "}"
+     ]
+    }
+   ],
+   "source": [
+    "! cat {rtlsim_output_dir}/final_hw_config.json"
+   ]
+  },
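+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A follow-up run reusing this configuration could look roughly like the sketch below (the file and directory names are placeholders, not outputs produced by this notebook). Note that any PE/SIMD values in the folding config override what would otherwise be derived from `target_fps`:\n",
+    "\n",
+    "```python\n",
+    "cfg_tuned = build.DataflowBuildConfig(\n",
+    "    output_dir          = \"output_tuned\",\n",
+    "    synth_clk_period_ns = 10.0,\n",
+    "    # hand-edited copy of final_hw_config.json (hypothetical filename)\n",
+    "    folding_config_file = \"my_folding_config.json\",\n",
+    "    fpga_part           = \"xc7z020clg400-1\",\n",
+    "    generate_outputs    = [build_cfg.DataflowOutputType.ESTIMATE_REPORTS],\n",
+    ")\n",
+    "```"
+   ]
+  },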
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch a Build: PYNQ Bitfile and Driver <a id=\"build_bitfile_driver\"></a>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Building dataflow accelerator from cybsec-mlp-verified.onnx\n",
+      "Intermediate outputs will be generated in /tmp/finn_dev_osboxes\n",
+      "Final outputs will be generated in output_final\n",
+      "Build log is at output_final/build_dataflow.log\n",
+      "Running step: step_tidy_up [1/15]\n",
+      "Running step: step_streamline [2/15]\n",
+      "Running step: step_convert_to_hls [3/15]\n",
+      "Running step: step_create_dataflow_partition [4/15]\n",
+      "Running step: step_target_fps_parallelization [5/15]\n",
+      "Running step: step_apply_folding_config [6/15]\n",
+      "Running step: step_generate_estimate_reports [7/15]\n",
+      "Running step: step_hls_ipgen [8/15]\n",
+      "Running step: step_set_fifo_depths [9/15]\n",
+      "Running step: step_create_stitched_ip [10/15]\n",
+      "Running step: step_measure_rtlsim_performance [11/15]\n",
+      "Running step: step_make_pynq_driver [12/15]\n",
+      "Running step: step_out_of_context_synthesis [13/15]\n",
+      "Running step: step_synthesize_bitfile [14/15]\n",
+      "Running step: step_deployment_package [15/15]\n",
+      "Completed successfully\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import finn.builder.build_dataflow as build\n",
+    "import finn.builder.build_dataflow_config as build_cfg\n",
+    "\n",
+    "model_file = \"cybsec-mlp-verified.onnx\"\n",
+    "\n",
+    "final_output_dir = \"output_final\"\n",
+    "\n",
+    "cfg = build.DataflowBuildConfig(\n",
+    "    output_dir          = final_output_dir,\n",
+    "    target_fps          = 1000000,\n",
+    "    synth_clk_period_ns = 10.0,\n",
+    "    board               = \"Pynq-Z1\",\n",
+    "    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n",
+    "    generate_outputs=[\n",
+    "        build_cfg.DataflowOutputType.BITFILE,\n",
+    "        build_cfg.DataflowOutputType.PYNQ_DRIVER,\n",
+    "        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "build.build_dataflow_cfg(model_file, cfg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For our final build, the output products include the bitfile (and the accompanying .hwh file, also needed to execute correctly on PYNQ for Zynq platforms):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "finn-accel.bit\tfinn-accel.hwh\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {final_output_dir}/bitfile"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The generated Python driver lets us execute the accelerator on PYNQ platforms with simply numpy i/o. You can find some notebooks showing how to use FINN-generated accelerators at runtime in the [finn-examples](https://github.com/Xilinx/finn-examples) repository."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "driver.py  driver_base.py  finn  runtime_weights  validate.py\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {final_output_dir}/driver"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The reports folder contains the post-synthesis resource and timing reports:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "estimate_layer_resources_hls.json  post_synth_resources.xml\r\n",
+      "post_route_timing.rpt\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {final_output_dir}/report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we have the `deploy` folder which contains everything you need to copy onto the target board to get the accelerator running:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bitfile  driver\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {final_output_dir}/deploy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/end2end_example/cybersecurity/README.md b/notebooks/end2end_example/cybersecurity/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad8a7ad602a766dad3d37f2d4e0719009b30c187
--- /dev/null
+++ b/notebooks/end2end_example/cybersecurity/README.md
@@ -0,0 +1,21 @@
+# Training and Deploying a Quantized MLP
+
+In this folder you will find a series of notebooks that guide you through
+the process of training a highly quantized neural network (QNN) and generating
+a high-performance streaming dataflow accelerator from it using the FINN
+compiler.
+If you'd like to train your own QNNs and deploy them using FINN, this is a
+good starting point.
+
+Here, the example application is classifying network packets as malicious or
+not by training a multi-layer perceptron (MLP) on the UNSW-NB15 dataset.
+We recommend following these notebooks in the order they appear:
+
+1. Training a few-bit MLP on the UNSW-NB15 dataset
+2. Exporting the trained network and verifying that it works as intended
+3. Generating a streaming dataflow accelerator using the FINN compiler
+
+Note: This tutorial abstracts away the internal details of the steps to provide
+a simpler introduction. If you'd like to understand more of the internal
+details of what happens during the accelerator build, we recommend the
+[BNN-PYNQ end-to-end notebooks](../bnn-pynq).
diff --git a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
new file mode 100644
index 0000000000000000000000000000000000000000..45651faa5a9a57e9a1d0d784b15ebe8945d9ddd7
--- /dev/null
+++ b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
@@ -0,0 +1,406 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch
+import pandas as pd
+import numpy as np
+from sklearn import preprocessing
+from sklearn.preprocessing import OneHotEncoder
+import math
+
+# quantize the UNSW_NB15 dataset and convert it to binary vectors
+# reimplementation
+# paper: https://ev.fe.uni-lj.si/1-2-2019/Murovic.pdf
+# original matlab code: https://git.io/JLLdN
+
+
+class UNSW_NB15_quantized(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        file_path_train,
+        file_path_test,
+        quantization=True,
+        onehot=False,
+        train=True,
+    ):
+
+        self.dataframe = (
+            pd.concat([pd.read_csv(file_path_train), pd.read_csv(file_path_test)])
+            .reset_index()
+            .drop(columns=["index", "id", "attack_cat"])
+        )
+
+        if onehot:
+            self.one_hot_df_encoded = self.one_hot_encoding(self.dataframe)
+
+        if quantization:
+            _, self.train_df, self.test_df = self.quantize_df(self.dataframe)
+
+        if train:
+            self.data = torch.FloatTensor(self.train_df.astype("float"))
+        else:
+            self.data = torch.FloatTensor(self.test_df.astype("float"))
+
+    def get_dataframe(self):
+        return self.dataframe
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        target = self.data[index][-1]
+        data_val = self.data[index][:-1]
+        return data_val, target
+
+    def dec2bin(
+        self, column: pd.Series, number_of_bits: int, left_msb: bool = True
+    ) -> np.ndarray:
+        """Convert a decimal pd.Series to the base-2 (binary string)
+        representation of its elements.
+        Adapted from: https://stackoverflow.com/q/51471097/1520469
+
+        Parameters
+        ----------
+        column: pd.Series
+            Series with all decimal numbers that will be cast to binary.
+        number_of_bits: int
+            The desired number of bits for the binary number. If bigger than
+            what is needed, the extra bits will be 0.
+            number_of_bits should be >= what is needed to express the
+            largest decimal input.
+        left_msb: bool
+            Specify that the most significant digit is the leftmost element.
+            If this is False, it will be the rightmost element.
+
+        Returns
+        -------
+        numpy.ndarray
+            Numpy array with all elements in binary representation of the input.
+        """
+
+        def my_binary_repr(number, nbits):
+            return np.binary_repr(number, nbits)[::-1]
+
+        func = my_binary_repr if left_msb else np.binary_repr
+
+        return np.vectorize(func)(column.values, number_of_bits)
+
+    def round_like_matlab_number(self, n: np.float64) -> int:
+        """Round the input n like the matlab uint32(n) cast (which also rounds),
+        e.g. 0.5 -> 1; 1.5 -> 2; 2.3 -> 2; 2.45 -> 2."""
+        if n - math.floor(n) < 0.5:
+            return math.floor(n)
+        return math.ceil(n)
+
+    def round_like_matlab_series(self, series: pd.Series) -> pd.Series:
+        rounded_values_list = []
+        for value in series:
+            rounded_values_list.append(self.round_like_matlab_number(value))
+        return pd.Series(rounded_values_list)
+
+    def integer_encoding(self, df):
+        """Applies integer encoding to the object columns of the dataframe"""
+        le = preprocessing.LabelEncoder()
+        for column in df.select_dtypes("object").columns.tolist():
+            df[column] = le.fit_transform(df[column])
+        return df
+
+    def quantize_df(self, df):
+        """Quantized the input dataframe. The scaling is done by multiplying
+        every column by the inverse of the minimum of that column"""
+        # gets the smallest positive number of a vector
+        def get_min_positive_number(vector):
+            return vector[vector > 0].min()
+
+        # computes the maximum required bits necessary to represent each number
+        # from a vector of numbers
+        def get_max_bits(vector):
+            return math.ceil(math.log2(float(vector.max()) + 1.0))
+
+        # splits a string into a list of all characters
+        def char_split(s):
+            return np.array([ch for ch in s])
+
+        df_encoded = self.integer_encoding(df)
+        python_quantized_df = df_encoded.copy()
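+        # row-index -> corrected value pairs for the "rate" column, patched in
+        # further below (presumably to match the rounding of the original
+        # matlab quantization for this column)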
+        dict_correct_rate_values = {
+            715: 34716,
+            11691: 25278,
+            27417: 5259117,
+            45319: 60744,
+            73620: 9039,
+            74498: 15070,
+            86933: 1024485,
+            89021: 1689027,
+            90272: 5259117,
+            103372: 1562102,
+            118192: 1759777,
+            122489: 246327,
+            159266: 18853,
+            190473: 18423,
+        }
+
+        for column in python_quantized_df.columns:
+            column_data = df_encoded[column]
+
+            m = get_min_positive_number(column_data)
+            m_inv = 1.0 / m
+            if m_inv > 1:
+                column_data = column_data * np.float64(m_inv)
+
+            maxbits = get_max_bits(column_data)
+            # CLIP, ROUND and CAST to UINT32
+            column_data = np.clip(
+                column_data, 0, 4294967295
+            )  # clip due to overflow of uint32 of matlab code
+            column_data = self.round_like_matlab_series(
+                column_data
+            )  # round like matlab
+            column_data = column_data.astype(np.uint32)  # cast like matlab
+
+            if column == "rate":
+                column_data.update(pd.Series(dict_correct_rate_values))
+
+            python_quantized_df[column] = (
+                self.dec2bin(column_data, maxbits, left_msb=False)
+                .reshape((-1, 1))
+                .flatten()
+            )
+
+        for column in python_quantized_df.columns:
+            python_quantized_df[column] = (
+                python_quantized_df[column].apply(char_split).values
+            )
+
+        python_quantized_df_separated = pd.DataFrame(
+            np.column_stack(python_quantized_df.values.T.tolist())
+        )
+        python_train = python_quantized_df_separated.iloc[:175341]
+        python_test = python_quantized_df_separated.iloc[175341:]
+
+        return (
+            python_quantized_df_separated.values,
+            python_train.values,
+            python_test.values,
+        )
+
+    def one_hot_encoding(self, df):
+        """Applies one-hot encoding to the proto, service and state columns."""
+        dataframe = df.copy()
+
+        string_columns = ["proto", "service", "state"]
+        string_categories = [
+            [
+                [
+                    "tcp",
+                    "udp",
+                    "arp",
+                    "ospf",
+                    "icmp",
+                    "igmp",
+                    "rtp",
+                    "ddp",
+                    "ipv6-frag",
+                    "cftp",
+                    "wsn",
+                    "pvp",
+                    "wb-expak",
+                    "mtp",
+                    "pri-enc",
+                    "sat-mon",
+                    "cphb",
+                    "sun-nd",
+                    "iso-ip",
+                    "xtp",
+                    "il",
+                    "unas",
+                    "mfe-nsp",
+                    "3pc",
+                    "ipv6-route",
+                    "idrp",
+                    "bna",
+                    "swipe",
+                    "kryptolan",
+                    "cpnx",
+                    "rsvp",
+                    "wb-mon",
+                    "vmtp",
+                    "ib",
+                    "dgp",
+                    "eigrp",
+                    "ax.25",
+                    "gmtp",
+                    "pnni",
+                    "sep",
+                    "pgm",
+                    "idpr-cmtp",
+                    "zero",
+                    "rvd",
+                    "mobile",
+                    "narp",
+                    "fc",
+                    "pipe",
+                    "ipcomp",
+                    "ipv6-no",
+                    "sat-expak",
+                    "ipv6-opts",
+                    "snp",
+                    "ipcv",
+                    "br-sat-mon",
+                    "ttp",
+                    "tcf",
+                    "nsfnet-igp",
+                    "sprite-rpc",
+                    "aes-sp3-d",
+                    "sccopmce",
+                    "sctp",
+                    "qnx",
+                    "scps",
+                    "etherip",
+                    "aris",
+                    "pim",
+                    "compaq-peer",
+                    "vrrp",
+                    "iatp",
+                    "stp",
+                    "l2tp",
+                    "srp",
+                    "sm",
+                    "isis",
+                    "smp",
+                    "fire",
+                    "ptp",
+                    "crtp",
+                    "sps",
+                    "merit-inp",
+                    "idpr",
+                    "skip",
+                    "any",
+                    "larp",
+                    "ipip",
+                    "micp",
+                    "encap",
+                    "ifmp",
+                    "tp++",
+                    "a/n",
+                    "ipv6",
+                    "i-nlsp",
+                    "ipx-n-ip",
+                    "sdrp",
+                    "tlsp",
+                    "gre",
+                    "mhrp",
+                    "ddx",
+                    "ippc",
+                    "visa",
+                    "secure-vmtp",
+                    "uti",
+                    "vines",
+                    "crudp",
+                    "iplt",
+                    "ggp",
+                    "ip",
+                    "ipnip",
+                    "st2",
+                    "argus",
+                    "bbn-rcc",
+                    "egp",
+                    "emcon",
+                    "igp",
+                    "nvp",
+                    "pup",
+                    "xnet",
+                    "chaos",
+                    "mux",
+                    "dcn",
+                    "hmp",
+                    "prm",
+                    "trunk-1",
+                    "xns-idp",
+                    "leaf-1",
+                    "leaf-2",
+                    "rdp",
+                    "irtp",
+                    "iso-tp4",
+                    "netblt",
+                    "trunk-2",
+                    "cbt",
+                ]
+            ],
+            [
+                [
+                    "-",
+                    "ftp",
+                    "smtp",
+                    "snmp",
+                    "http",
+                    "ftp-data",
+                    "dns",
+                    "ssh",
+                    "radius",
+                    "pop3",
+                    "dhcp",
+                    "ssl",
+                    "irc",
+                ]
+            ],
+            [
+                [
+                    "FIN",
+                    "INT",
+                    "CON",
+                    "ECO",
+                    "REQ",
+                    "RST",
+                    "PAR",
+                    "URN",
+                    "no",
+                    "ACC",
+                    "CLO",
+                ]
+            ],
+        ]
+
+        for column, categories in zip(string_columns, string_categories):
+            column_df = dataframe.loc[:, [column]]
+
+            one_hot_encoder = OneHotEncoder(sparse=False, categories=categories)
+            # Fit OneHotEncoder to dataframe
+            one_hot_encoder.fit(column_df)
+            # Transform the dataframe
+            column_df_encoded = one_hot_encoder.transform(column_df)
+            # Create dataframe from the 2-d array
+            column_df_encoded = pd.DataFrame(
+                data=column_df_encoded, columns=one_hot_encoder.categories_[0]
+            )
+            dataframe = pd.concat([column_df_encoded, dataframe], axis=1, sort=False)
+
+        # delete the proto, service and state columns
+        dataframe = dataframe.drop(string_columns, axis=1)
+
+        return dataframe
diff --git a/notebooks/end2end_example/cybersecurity/finn-example.png b/notebooks/end2end_example/cybersecurity/finn-example.png
new file mode 100644
index 0000000000000000000000000000000000000000..b9335f720151ea64d9ae70cdf4d4c27dabec6f74
Binary files /dev/null and b/notebooks/end2end_example/cybersecurity/finn-example.png differ
diff --git a/notebooks/end2end_example/cybersecurity/state_dict.pth b/notebooks/end2end_example/cybersecurity/state_dict.pth
new file mode 100644
index 0000000000000000000000000000000000000000..53c002e3fa6f2ae3e7c8f0abb71fa446d80a8f09
Binary files /dev/null and b/notebooks/end2end_example/cybersecurity/state_dict.pth differ
diff --git a/requirements.txt b/requirements.txt
index 71eaa3224213f0ff021107ce5090a9fb1901234a..6dd4b5724782d01fc2958cc56c04cbc8e70af31f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
 bitstring==3.1.7
+clize==4.1.1
+dataclasses-json==0.5.2
 docrep==0.2.7
 future==0.18.2
 gspread==3.6.0
@@ -7,6 +9,7 @@ onnx==1.7.0
 onnxruntime==1.4.0
 pre-commit==2.6.0
 scipy==1.5.2
+setupext-janitor>=1.1.2
 toposort==1.5
 vcdvcd==1.0.5
 wget==3.2
diff --git a/run-docker.sh b/run-docker.sh
index 8eab28508359bc512357cc2bf2654167c04ef370..135f51d4d613c8454862385cc8cad656e620cdbd 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -42,21 +42,28 @@ recho () {
 }
 
 if [ -z "$VIVADO_PATH" ];then
-        recho "Please set the VIVADO_PATH that contains the path to your Vivado installation directory."
-        recho "FINN functionality depending on Vivado or Vivado HLS will not be available."
+  recho "Please set the VIVADO_PATH that contains the path to your Vivado installation directory."
+  recho "FINN functionality depending on Vivado or Vivado HLS will not be available."
 fi
 
 if [ -z "$PYNQ_IP" ];then
-        recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
+  recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
 fi
 
 if [ -z "$VITIS_PATH" ];then
-        recho "Please set the VITIS_PATH that contains the path to your Vitis installation directory."
-        recho "FINN functionality depending on Vitis will not be available."
+  recho "Please set the VITIS_PATH that contains the path to your Vitis installation directory."
+  recho "FINN functionality depending on Vitis will not be available."
 else
-    if [ -z "$PLATFORM_REPO_PATHS" ];then
-            recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)."
-    fi
+  if [ -z "$PLATFORM_REPO_PATHS" ];then
+    recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)."
+    recho "This is required to be able to use Vitis."
+    exit -1
+  fi
+  if [ -z "$XILINX_XRT" ];then
+    recho "Please set XILINX_XRT pointing to your XRT installation."
+    recho "This is required to be able to use Vitis."
+    exit -1
+  fi
 fi
 
 DOCKER_GID=$(id -g)
@@ -97,60 +104,85 @@ SCRIPTPATH=$(dirname "$SCRIPT")
 : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"}
 : ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"}
 
-FINN_CONTAINER_BUILD_DIR=/tmp/$DOCKER_INST_NAME
+DOCKER_INTERACTIVE=""
+DOCKER_EXTRA=""
+
+if [ "$1" = "test" ]; then
+  gecho "Running test suite (all tests)"
+  DOCKER_CMD="python setup.py test"
+elif [ "$1" = "quicktest" ]; then
+  gecho "Running test suite (non-Vivado, non-slow tests)"
+  DOCKER_CMD="quicktest.sh"
+elif [ "$1" = "notebook" ]; then
+  gecho "Running Jupyter notebook server"
+  DOCKER_CMD="jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks"
+  DOCKER_EXTRA+="-e JUPYTER_PORT=$JUPYTER_PORT "
+  DOCKER_EXTRA+="-e NETRON_PORT=$NETRON_PORT "
+  DOCKER_EXTRA+="-p $JUPYTER_PORT:$JUPYTER_PORT "
+  DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT "
+elif [ "$1" = "build_dataflow" ]; then
+  BUILD_DATAFLOW_DIR=$(readlink -f "$2")
+  DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR"
+  DOCKER_INTERACTIVE="-it"
+  #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build
+  gecho "Running build_dataflow for folder $BUILD_DATAFLOW_DIR"
+  DOCKER_CMD="build_dataflow $BUILD_DATAFLOW_DIR"
+elif [ "$1" = "build_custom" ]; then
+  BUILD_CUSTOM_DIR=$(readlink -f "$2")
+  DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR"
+  DOCKER_INTERACTIVE="-it"
+  #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build
+  gecho "Running build_custom: $BUILD_CUSTOM_DIR/build.py"
+  DOCKER_CMD="python -mpdb -cc -cq build.py"
+else
+  gecho "Running container only"
+  DOCKER_CMD="bash"
+  DOCKER_INTERACTIVE="-it"
+fi
+
 VIVADO_HLS_LOCAL=$VIVADO_PATH
-VIVADO_IP_CACHE=$FINN_CONTAINER_BUILD_DIR/vivado_ip_cache
+VIVADO_IP_CACHE=$FINN_HOST_BUILD_DIR/vivado_ip_cache
+INSTALL_XRT_DEPS=0
 
 # ensure build dir exists locally
 mkdir -p $FINN_HOST_BUILD_DIR
 mkdir -p $FINN_SSH_KEY_DIR
 
-gecho "Instance is named as $DOCKER_INST_NAME"
-gecho "Mounting $FINN_HOST_BUILD_DIR into $FINN_CONTAINER_BUILD_DIR"
+gecho "Docker container is named $DOCKER_INST_NAME"
+gecho "Mounting $FINN_HOST_BUILD_DIR into $FINN_HOST_BUILD_DIR"
 gecho "Mounting $VIVADO_PATH into $VIVADO_PATH"
-gecho "Mounting $VITIS_PATH into $VITIS_PATH"
+if [ ! -z "$VITIS_PATH" ];then
+  gecho "Mounting $VITIS_PATH into $VITIS_PATH"
+  INSTALL_XRT_DEPS=1
+fi
 gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
 gecho "Using default PYNQ board $PYNQ_BOARD"
 
-DOCKER_INTERACTIVE=""
-
-if [ "$1" = "test" ]; then
-        gecho "Running test suite (all tests)"
-        DOCKER_CMD="python setup.py test"
-elif [ "$1" = "quicktest" ]; then
-        gecho "Running test suite (non-Vivado, non-slow tests)"
-        DOCKER_CMD="quicktest.sh"
-elif [ "$1" = "notebook" ]; then
-        gecho "Running Jupyter notebook server"
-        DOCKER_CMD="jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks"
-else
-        gecho "Running container only"
-        DOCKER_CMD="bash"
-        DOCKER_INTERACTIVE="-it"
-fi
-
 # Build the FINN Docker image
+# Need to ensure this is done within the finn/ root folder:
+OLD_PWD=$(pwd)
+cd $SCRIPTPATH
 docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \
              --build-arg GID=$DOCKER_GID \
              --build-arg GNAME=$DOCKER_GNAME \
              --build-arg UNAME=$DOCKER_UNAME \
              --build-arg UID=$DOCKER_UID \
              --build-arg PASSWD=$DOCKER_PASSWD \
-             --build-arg JUPYTER_PORT=$JUPYTER_PORT \
-             --build-arg NETRON_PORT=$NETRON_PORT \
+             --build-arg INSTALL_XRT_DEPS=$INSTALL_XRT_DEPS \
              .
+cd $OLD_PWD
 # Launch container with current directory mounted
 # important to pass the --init flag here for correct Vivado operation, see:
 # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
-DOCKER_EXEC="docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init "
+DOCKER_EXEC="docker run -t --rm $DOCKER_INTERACTIVE --init "
 DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
 DOCKER_EXEC+="-e SHELL=/bin/bash "
 DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn "
-DOCKER_EXEC+="-v $FINN_HOST_BUILD_DIR:$FINN_CONTAINER_BUILD_DIR "
+DOCKER_EXEC+="-v $FINN_HOST_BUILD_DIR:$FINN_HOST_BUILD_DIR "
 DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh "
-DOCKER_EXEC+="-e FINN_INST_NAME=$DOCKER_INST_NAME "
+DOCKER_EXEC+="-e FINN_BUILD_DIR=$FINN_HOST_BUILD_DIR "
 DOCKER_EXEC+="-e FINN_ROOT="/workspace/finn" "
 DOCKER_EXEC+="-e VIVADO_IP_CACHE=$VIVADO_IP_CACHE "
 DOCKER_EXEC+="-e PYNQ_BOARD=$PYNQ_BOARD "
@@ -159,8 +191,10 @@ DOCKER_EXEC+="-e PYNQ_USERNAME=$PYNQ_USERNAME "
 DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD "
 DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR "
 DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS "
-DOCKER_EXEC+="-p $JUPYTER_PORT:$JUPYTER_PORT "
-DOCKER_EXEC+="-p $NETRON_PORT:$NETRON_PORT "
+if [ ! -z "$IMAGENET_VAL_PATH" ];then
+  DOCKER_EXEC+="-v $IMAGENET_VAL_PATH:$IMAGENET_VAL_PATH "
+  DOCKER_EXEC+="-e IMAGENET_VAL_PATH=$IMAGENET_VAL_PATH "
+fi
 if [ ! -z "$VIVADO_PATH" ];then
   DOCKER_EXEC+="-e "XILINX_VIVADO=$VIVADO_PATH" "
   DOCKER_EXEC+="-v $VIVADO_PATH:$VIVADO_PATH "
@@ -168,12 +202,12 @@ if [ ! -z "$VIVADO_PATH" ];then
 fi
 if [ ! -z "$VITIS_PATH" ];then
   if [ -z "$PLATFORM_REPO_PATHS" ];then
-          recho "PLATFORM_REPO_PATHS must be set for Vitis/Alveo flows"
-          exit -1
+    recho "PLATFORM_REPO_PATHS must be set for Vitis/Alveo flows"
+    exit -1
   fi
   if [ -z "$XILINX_XRT" ];then
-          recho "XILINX_XRT must be set for Vitis/Alveo flows"
-          exit -1
+    recho "XILINX_XRT must be set for Vitis/Alveo flows"
+    exit -1
   fi
   DOCKER_EXEC+="-v $VITIS_PATH:$VITIS_PATH "
   DOCKER_EXEC+="-v $PLATFORM_REPO_PATHS:$PLATFORM_REPO_PATHS "
@@ -187,6 +221,7 @@ if [ ! -z "$VITIS_PATH" ];then
   DOCKER_EXEC+="-e ALVEO_BOARD=$ALVEO_BOARD "
   DOCKER_EXEC+="-e ALVEO_TARGET_DIR=$ALVEO_TARGET_DIR "
 fi
+DOCKER_EXEC+="$DOCKER_EXTRA "
 DOCKER_EXEC+="$DOCKER_TAG $DOCKER_CMD"
 
 $DOCKER_EXEC
diff --git a/setup.cfg b/setup.cfg
index 7729d0949ee133e06242905afab31708e79ebf04..45fe40156acd966fed302522e9e8ca716a4d331c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,16 +31,16 @@
 # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files
 
 [metadata]
-name = FINN
+name = finn
 description = A Framework for Fast, Scalable Quantized Neural Network Inference
 author = Yaman Umuroglu
 author-email = yamanu@xilinx.com
 license = new-bsd
-long-description = file: README.rst
-long-description-content-type = text/x-rst; charset=UTF-8
-url = http://www.pynq.io/ml
+long-description = file: README.md
+long-description-content-type = text/markdown
+url = https://xilinx.github.io/finn/
 project-urls =
-    Documentation = https://pyscaffold.org/
+    Documentation = https://finn.readthedocs.io/
 # Change if running only on Windows, Mac or Linux (comma-separated)
 platforms = any
 # Add here all kinds of additional classifiers as defined under
@@ -48,17 +48,19 @@ platforms = any
 classifiers =
     Development Status :: 4 - Beta
     Programming Language :: Python
+    Operating System :: POSIX :: Linux
 
 [options]
 zip_safe = False
-packages = find:
+packages = find_namespace:
 include_package_data = True
 package_dir =
     =src
 # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
 setup_requires = pyscaffold>=3.2a0,<3.3a0
-# Add here dependencies of your project (semicolon/line-separated), e.g.
-# install_requires = numpy; scipy
+# finn-base is added specifically to be able to build on readthedocs
+install_requires =
+    finn-base @ git+https://github.com/Xilinx/finn-base#egg=finn-base
 # The usage of test_requires is discouraged, see `Dependency Management` docs
 # tests_require = pytest; pytest-cov
 # Require a specific Python version, e.g. Python 2.7 or >= 3.4
@@ -79,6 +81,8 @@ testing =
     pytest-cov
 
 [options.entry_points]
+console_scripts =
+    build_dataflow = finn.builder.build_dataflow:main
 # Add here console scripts like:
 # console_scripts =
 #     script_name = finn.module:function
@@ -116,7 +120,7 @@ dists = bdist_wheel
 
 [bdist_wheel]
 # Use this option if your package is pure-python
-universal = 1
+universal = 0
 
 [build_sphinx]
 source_dir = docs/finn
@@ -145,3 +149,5 @@ package = finn
 extensions =
     travis
     pre_commit
+    namespace
+namespace = finn
diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py
index e678630ae97318af47dd432a7c68442a6642b65f..dafe8a9f89468d9ccba926562a4729a793c2fbf0 100644
--- a/src/finn/analysis/fpgadataflow/dataflow_performance.py
+++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py
@@ -53,7 +53,7 @@ def dataflow_performance(model):
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
             inst = getCustomOp(node)
-            node_cycles = inst.get_nodeattr("cycles_estimate")
+            node_cycles = int(inst.get_nodeattr("cycles_estimate"))
             if node_cycles > max_cycles:
                 max_cycles = node_cycles
                 max_node_name = node.name
@@ -72,7 +72,7 @@ def dataflow_performance(model):
                 latency_at_node_output[node.name] = node_cycles + max_pred_latency
     critical_path_cycles = max(latency_at_node_output.values())
     return {
-        "critical_path_cycles": critical_path_cycles,
-        "max_cycles": max_cycles,
+        "critical_path_cycles": int(critical_path_cycles),
+        "max_cycles": int(max_cycles),
         "max_cycles_node_name": max_node_name,
     }
diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
index 0fcf2e382561852eb1c0b02e1d417db05057655c..bb1cad56da5a2d3a8690566dd6f3f220af9c12a6 100644
--- a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
+++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
@@ -42,6 +42,6 @@ def exp_cycles_per_layer(model):
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
             inst = registry.getCustomOp(node)
-            cycle_dict[node.name] = inst.get_exp_cycles()
+            cycle_dict[node.name] = int(inst.get_exp_cycles())
 
     return cycle_dict
diff --git a/src/finn/analysis/fpgadataflow/floorplan_params.py b/src/finn/analysis/fpgadataflow/floorplan_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c8cbf53de1ae7dc951911678a3f118bd3506dfe
--- /dev/null
+++ b/src/finn/analysis/fpgadataflow/floorplan_params.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.custom_op.registry import getCustomOp
+
+
+def floorplan_params(model):
+    """Gathers SLR and partition IDs from nodes.
+
+    Returns {node name : {slr, device id, partition id, memory port}}."""
+
+    ret_dict = {
+        "Defaults": {
+            "slr": [-1, ["all"]],
+            "partition_id": [0, ["all"]],
+            "device_id": [0, ["all"]],
+            "mem_port": ["", ["all"]],
+        }
+    }
+    for node in model.graph.node:
+        if is_fpgadataflow_node(node) is True:
+            node_inst = getCustomOp(node)
+            node_slr = node_inst.get_nodeattr("slr")
+            node_pid = node_inst.get_nodeattr("partition_id")
+            node_mport = node_inst.get_nodeattr("mem_port")
+            ret_dict[node.name] = {
+                "slr": node_slr,
+                "partition_id": node_pid,
+                "device_id": 0,
+                "mem_port": node_mport,
+            }
+
+    return ret_dict
diff --git a/src/finn/analysis/fpgadataflow/op_and_param_counts.py b/src/finn/analysis/fpgadataflow/op_and_param_counts.py
new file mode 100644
index 0000000000000000000000000000000000000000..27c6dfd997a14ab8b213baaa469d402bed1cf3a8
--- /dev/null
+++ b/src/finn/analysis/fpgadataflow/op_and_param_counts.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.custom_op.registry as registry
+from finn.util.basic import is_finn_op
+
+
+def aggregate_dict_keys(res_dict):
+    total_dict = {}
+    for layer in res_dict:
+        layer_res_dict = res_dict[layer]
+        for r_type in layer_res_dict.keys():
+            if "efficiency" in r_type:
+                continue
+            r_amount = layer_res_dict[r_type]
+            r_amount = float(r_amount)
+            if r_type in total_dict.keys():
+                total_dict[r_type] += r_amount
+            else:
+                total_dict[r_type] = r_amount
+    return total_dict
+
+
+def op_and_param_counts(model):
+    """Return per-node and aggregate op counts per inference."""
+
+    ret_dict = {}
+    for node in model.graph.node:
+        if is_finn_op(node.domain):
+            inst = registry.getCustomOp(node)
+            if hasattr(inst, "get_op_and_param_counts"):
+                node_op_and_param_counts = inst.get_op_and_param_counts()
+                ret_dict[node.name] = node_op_and_param_counts
+    ret_dict["total"] = aggregate_dict_keys(ret_dict)
+    return ret_dict
diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index 2c714b1f12b75e9789f1865d6737422f4d9d9a97..31cfeb76a6d4f411808af5dcd265e4f07352ae02 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -61,7 +61,10 @@ def res_estimation_complete(model):
         if is_fpgadataflow_node(node) is True:
             op_type = node.op_type
             inst = registry.getCustomOp(node)
-            if op_type == "StreamingFCLayer_Batch" or op_type == "Vector_Vector_Activate_Batch":
+            if (
+                op_type == "StreamingFCLayer_Batch"
+                or op_type == "Vector_Vector_Activate_Batch"
+            ):
                 orig_restype = inst.get_nodeattr("resType")
                 res_dict[node.name] = []
                 inst.set_nodeattr("resType", "dsp")
diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..c46bfa48dff289d37f2cb2a89cdbef8e2789317f
--- /dev/null
+++ b/src/finn/builder/build_dataflow.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.core.modelwrapper import ModelWrapper
+import os
+import json
+import time
+import clize
+import sys
+import logging
+import pdb  # NOQA
+import traceback
+from finn.builder.build_dataflow_steps import build_dataflow_step_lookup
+from finn.builder.build_dataflow_config import (
+    DataflowBuildConfig,
+    default_build_dataflow_steps,
+)
+
+
+# adapted from https://stackoverflow.com/a/39215961
+class StreamToLogger(object):
+    """
+    Fake file-like stream object that redirects writes to a logger instance.
+    """
+
+    def __init__(self, logger, level):
+        self.logger = logger
+        self.level = level
+        self.linebuf = ""
+
+    def write(self, buf):
+        for line in buf.rstrip().splitlines():
+            self.logger.log(self.level, line.rstrip())
+
+    def flush(self):
+        pass
+
+
+def resolve_build_steps(cfg: DataflowBuildConfig):
+    steps = cfg.steps
+    if steps is None:
+        steps = default_build_dataflow_steps
+    steps_as_fxns = []
+    for transform_step in steps:
+        if type(transform_step) is str:
+            # lookup step function from step name
+            steps_as_fxns.append(build_dataflow_step_lookup[transform_step])
+        elif callable(transform_step):
+            # treat step as function to be called as-is
+            steps_as_fxns.append(transform_step)
+        else:
+            raise Exception("Could not resolve build step: " + str(transform_step))
+    return steps_as_fxns
+
+
+def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
+    """Best-effort build a dataflow accelerator using the given configuration.
+
+    :param model_filename: ONNX model filename to build
+    :param cfg: Build configuration
+    """
+    model = ModelWrapper(model_filename)
+    assert type(model) is ModelWrapper
+    finn_build_dir = os.environ["FINN_BUILD_DIR"]
+    print("Building dataflow accelerator from " + model_filename)
+    print("Intermediate outputs will be generated in " + finn_build_dir)
+    print("Final outputs will be generated in " + cfg.output_dir)
+    print("Build log is at " + cfg.output_dir + "/build_dataflow.log")
+    # create the output dir if it doesn't exist
+    if not os.path.exists(cfg.output_dir):
+        os.makedirs(cfg.output_dir)
+    step_num = 1
+    time_per_step = dict()
+    build_dataflow_steps = resolve_build_steps(cfg)
+    # set up logger
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format="[%(asctime)s] %(message)s",
+        filename=cfg.output_dir + "/build_dataflow.log",
+        filemode="a",
+    )
+    log = logging.getLogger("build_dataflow")
+    stdout_logger = StreamToLogger(log, logging.INFO)
+    stderr_logger = StreamToLogger(log, logging.ERROR)
+    stdout_orig = sys.stdout
+    stderr_orig = sys.stderr
+    for transform_step in build_dataflow_steps:
+        try:
+            step_name = transform_step.__name__
+            print(
+                "Running step: %s [%d/%d]"
+                % (step_name, step_num, len(build_dataflow_steps))
+            )
+            # redirect output to logfile
+            sys.stdout = stdout_logger
+            sys.stderr = stderr_logger
+            print(
+                "Running step: %s [%d/%d]"
+                % (step_name, step_num, len(build_dataflow_steps))
+            )
+            # run the step
+            step_start = time.time()
+            model = transform_step(model, cfg)
+            step_end = time.time()
+            # restore stdout/stderr
+            sys.stdout = stdout_orig
+            sys.stderr = stderr_orig
+            time_per_step[step_name] = step_end - step_start
+            chkpt_name = "%d_%s.onnx" % (step_num, step_name)
+            if cfg.save_intermediate_models:
+                intermediate_model_dir = cfg.output_dir + "/intermediate_models"
+                if not os.path.exists(intermediate_model_dir):
+                    os.makedirs(intermediate_model_dir)
+                model.save("%s/%s" % (intermediate_model_dir, chkpt_name))
+            step_num += 1
+        except:  # noqa
+            # restore stdout/stderr
+            sys.stdout = stdout_orig
+            sys.stderr = stderr_orig
+            # print exception info and traceback
+            extype, value, tb = sys.exc_info()
+            traceback.print_exc()
+            # start postmortem debug if configured
+            if cfg.enable_build_pdb_debug:
+                pdb.post_mortem(tb)
+            else:
+                print("enable_build_pdb_debug not set in build config, exiting...")
+            print("Build failed")
+            return -1
+
+    with open(cfg.output_dir + "/time_per_step.json", "w") as f:
+        json.dump(time_per_step, f, indent=2)
+    print("Completed successfully")
+    return 0
+
+
+def build_dataflow_directory(path_to_cfg_dir: str):
+    """Best-effort build a dataflow accelerator from the specified directory.
+
+    :param path_to_cfg_dir: Directory containing the model and build config
+
+    The specified directory path_to_cfg_dir must contain the following files:
+
+    * model.onnx : ONNX model to be converted to dataflow accelerator
+    * dataflow_build_config.json : JSON file with build configuration
+
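+    A minimal dataflow_build_config.json could look like this (illustrative
+    values; any field of DataflowBuildConfig may be given):
+
+    .. code-block:: json
+
+        {
+            "output_dir": "output_estimates",
+            "synth_clk_period_ns": 10.0,
+            "target_fps": 1000000,
+            "generate_outputs": ["estimate_reports"]
+        }
+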
+    """
+    # get absolute path
+    path_to_cfg_dir = os.path.abspath(path_to_cfg_dir)
+    assert os.path.isdir(path_to_cfg_dir), "Directory not found: " + path_to_cfg_dir
+    onnx_filename = path_to_cfg_dir + "/model.onnx"
+    json_filename = path_to_cfg_dir + "/dataflow_build_config.json"
+    assert os.path.isfile(onnx_filename), "ONNX not found: " + onnx_filename
+    assert os.path.isfile(json_filename), "Build config not found: " + json_filename
+    with open(json_filename, "r") as f:
+        json_str = f.read()
+    build_cfg = DataflowBuildConfig.from_json(json_str)
+    old_wd = os.getcwd()
+    # change into build dir to resolve relative paths
+    os.chdir(path_to_cfg_dir)
+    ret = build_dataflow_cfg(onnx_filename, build_cfg)
+    os.chdir(old_wd)
+    return ret
+
+
+def main():
+    """Entry point for dataflow builds. Invokes `build_dataflow_directory` using
+    command line arguments"""
+    clize.run(build_dataflow_directory)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..b206e00a2eb6da1d76ccf57c078b16f61868a98c
--- /dev/null
+++ b/src/finn/builder/build_dataflow_config.py
@@ -0,0 +1,335 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from typing import List, Optional, Any
+from finn.util.basic import pynq_part_map, alveo_part_map
+from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
+from enum import Enum
+from dataclasses import dataclass
+from dataclasses_json import dataclass_json
+import os
+import numpy as np
+
+
+class ShellFlowType(str, Enum):
+    """For builds that produce a bitfile, select the shell flow that will integrate
+    the FINN-generated accelerator."""
+
+    VIVADO_ZYNQ = "vivado_zynq"
+    VITIS_ALVEO = "vitis_alveo"
+
+
+class DataflowOutputType(str, Enum):
+    "Output product types that can be generated by build_dataflow"
+
+    STITCHED_IP = "stitched_ip"
+    ESTIMATE_REPORTS = "estimate_reports"
+    OOC_SYNTH = "out_of_context_synth"
+    RTLSIM_PERFORMANCE = "rtlsim_performance"
+    BITFILE = "bitfile"
+    PYNQ_DRIVER = "pynq_driver"
+    DEPLOYMENT_PACKAGE = "deployment_package"
+
+
+class ComputeEngineMemMode(str, Enum):
+    """Memory mode for generated compute engines. See
+    https://finn.readthedocs.io/en/latest/internals.html#streamingfclayer-mem-mode
+    for more information."""
+
+    CONST = "const"
+    DECOUPLED = "decoupled"
+
+
+class VitisOptStrategyCfg(str, Enum):
+    """Vitis optimization strategy with serializable string enum values."""
+
+    DEFAULT = "default"
+    POWER = "power"
+    PERFORMANCE = "performance"
+    PERFORMANCE_BEST = "performance_best"
+    SIZE = "size"
+    BUILD_SPEED = "quick"
+
+
+class LargeFIFOMemStyle(str, Enum):
+    """Type of memory resource to use for large FIFOs."""
+
+    AUTO = "auto"
+    BRAM = "block"
+    LUTRAM = "distributed"
+    URAM = "ultra"
+
+
+class VerificationStepType(str, Enum):
+    "Steps at which FINN ONNX execution can be launched for verification."
+
+    #: verify after step_tidy_up, using Python execution
+    TIDY_UP_PYTHON = "initial_python"
+    #: verify after step_streamline, using Python execution
+    STREAMLINED_PYTHON = "streamlined_python"
+    #: verify after step_apply_folding_config, using C++ for each HLS node
+    FOLDED_HLS_CPPSIM = "folded_hls_cppsim"
+    #: verify after step_create_stitched_ip, using stitched-ip Verilog
+    STITCHED_IP_RTLSIM = "stitched_ip_rtlsim"
+
+
+#: List of steps that will be run as part of the standard dataflow build, in the
+#: specified order. Use the `steps` as part of build config to restrict which
+#: steps will be run.
+default_build_dataflow_steps = [
+    "step_tidy_up",
+    "step_streamline",
+    "step_convert_to_hls",
+    "step_create_dataflow_partition",
+    "step_target_fps_parallelization",
+    "step_apply_folding_config",
+    "step_generate_estimate_reports",
+    "step_hls_codegen",
+    "step_hls_ipgen",
+    "step_set_fifo_depths",
+    "step_create_stitched_ip",
+    "step_measure_rtlsim_performance",
+    "step_make_pynq_driver",
+    "step_out_of_context_synthesis",
+    "step_synthesize_bitfile",
+    "step_deployment_package",
+]
+
+#: List of steps to run for an estimate-only (no synthesis) dataflow build
+estimate_only_dataflow_steps = [
+    "step_tidy_up",
+    "step_streamline",
+    "step_convert_to_hls",
+    "step_create_dataflow_partition",
+    "step_target_fps_parallelization",
+    "step_apply_folding_config",
+    "step_generate_estimate_reports",
+]
+
+#: List of steps to run for a dataflow build including HLS code generation, but
+#: without any synthesis.
+hls_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hls_codegen"]
+
+
+@dataclass_json
+@dataclass
+class DataflowBuildConfig:
+    """Build configuration to be passed to the build_dataflow function. Can be
+    serialized into or de-serialized from JSON files for persistence.
+    See list of attributes below for more information on the build configuration.
+    """
+
+    #: Directory where the final build outputs will be written into
+    output_dir: str
+
+    #: Target clock period (in nanoseconds) for Vivado synthesis.
+    #: e.g. synth_clk_period_ns=5.0 will target a 200 MHz clock.
+    #: If hls_clk_period_ns is not specified it will default to this value.
+    synth_clk_period_ns: float
+
+    #: Which output(s) to generate from the build flow.  See documentation of
+    #: DataflowOutputType for available options.
+    generate_outputs: List[DataflowOutputType]
+
+    #: (Optional) Path to configuration JSON file. May include parallelization,
+    #: FIFO sizes, RAM and implementation style attributes, and so on.
+    #: If the parallelization attributes (PE, SIMD) are part of the config,
+    #: they will override the automatically generated parallelization
+    #: attributes inferred from target_fps (if any).
+    #: Will be applied with :py:mod:`finn.transformation.general.ApplyConfig`
+    folding_config_file: Optional[str] = None
+
+    #: (Optional) Target inference performance in frames per second.
+    #: Note that the target may not be achievable due to specific layer constraints
+    #: or due to resource limitations of the FPGA.
+    #: If parallelization attributes are specified in folding_config_file,
+    #: they will override the target_fps setting here.
+    target_fps: Optional[int] = None
+
+    #: (Optional) At which steps the generated intermediate output model
+    #: will be verified. See documentation of VerificationStepType for
+    #: available options.
+    verify_steps: Optional[List[VerificationStepType]] = None
+
+    #: (Optional) Name of .npy file that will be used as the input for
+    #: verification. Only required if verify_steps is not empty.
+    verify_input_npy: Optional[str] = "input.npy"
+
+    #: (Optional) Name of .npy file that will be used as the expected output for
+    #: verification. Only required if verify_steps is not empty.
+    verify_expected_output_npy: Optional[str] = "expected_output.npy"
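+    # (Illustrative note, assumed workflow: these files are typically produced
+    # beforehand, e.g. with np.save("input.npy", x) and
+    # np.save("expected_output.npy", y_golden), where x and y_golden are NumPy
+    # arrays matching the model's top-level input and output.)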
+
+    #: (Optional) Control the maximum width of the per-PE MVAU stream while
+    #: exploring the parallelization attributes to reach target_fps.
+    #: Only relevant if target_fps is specified.
+    #: Set this to a large value (e.g. 10000) if targeting full unfolding or
+    #: very high performance.
+    mvau_wwidth_max: Optional[int] = 36
+
+    #: (Optional) Whether thresholding layers (which implement quantized
+    #: activations in FINN) will be implemented as stand-alone HLS layers,
+    #: instead of being part of StreamingFCLayer. This gives greater flexibility
+    #: and makes it possible to have runtime-writable thresholds.
+    standalone_thresholds: Optional[bool] = False
+
+    #: Target board, only needed for generating full bitfiles where the FINN
+    #: design is integrated into a shell.
+    #: e.g. "Pynq-Z1" or "U250"
+    board: Optional[str] = None
+
+    #: Target shell flow, only needed for generating full bitfiles where the FINN
+    #: design is integrated into a shell. See documentation of ShellFlowType
+    #: for options.
+    shell_flow_type: Optional[ShellFlowType] = None
+
+    #: Target Xilinx FPGA part. Only needed when board is not specified.
+    #: e.g. "xc7z020clg400-1"
+    fpga_part: Optional[str] = None
+
+    #: Whether FIFO depths will be set automatically. Involves running stitched-IP
+    #: rtlsim and can take a long time.
+    #: If set to False, the folding_config_file can be used to specify sizes
+    #: for each FIFO.
+    auto_fifo_depths: Optional[bool] = True
+
+    #: Memory resource type for large FIFOs
+    #: Only relevant when `auto_fifo_depths = True`
+    large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
+
+    #: Target clock period (in nanoseconds) for Vivado HLS synthesis.
+    #: e.g. `hls_clk_period_ns=5.0` will target a 200 MHz clock.
+    #: If not specified, it will default to synth_clk_period_ns.
+    hls_clk_period_ns: Optional[float] = None
+
+    #: Which memory mode will be used for compute layers
+    default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED
+
+    #: Which Vitis platform will be used.
+    #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO`
+    #: e.g. "xilinx_u250_xdma_201830_2"
+    vitis_platform: Optional[str] = None
+
+    #: Path to JSON config file assigning each layer to an SLR.
+    #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO`
+    #: Will be applied with :py:mod:`finn.transformation.general.ApplyConfig`
+    vitis_floorplan_file: Optional[str] = None
+
+    #: Vitis optimization strategy
+    #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO`
+    vitis_opt_strategy: Optional[VitisOptStrategyCfg] = VitisOptStrategyCfg.DEFAULT
+
+    #: Whether intermediate ONNX files will be saved during the build process.
+    #: These can be useful for debugging if the build fails.
+    save_intermediate_models: Optional[bool] = True
+
+    #: Whether hardware debugging will be enabled (e.g. ILA cores inserted to
+    #: debug signals in the generated hardware)
+    enable_hw_debug: Optional[bool] = False
+
+    #: Whether pdb postmortem debugging will be launched when the build fails
+    enable_build_pdb_debug: Optional[bool] = True
+
+    #: If given, only run the steps in the list. If not, run the default steps.
+    #: See `default_build_dataflow_steps` for the default list of steps.
+    #: When specified, each item can be either a string or a function (the
+    #: latter is not possible for JSON-serialized configs):
+    #: - strings are resolved to the corresponding step functions
+    #: - functions are called with (model, DataflowBuildConfig) as arguments
+    steps: Optional[List[Any]] = None
+
+    def _resolve_hls_clk_period(self):
+        if self.hls_clk_period_ns is None:
+            # use same clk for synth and hls if not explicitly specified
+            return self.synth_clk_period_ns
+        else:
+            return self.hls_clk_period_ns
+
+    def _resolve_driver_platform(self):
+        if self.shell_flow_type == ShellFlowType.VIVADO_ZYNQ:
+            return "zynq-iodma"
+        elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO:
+            return "alveo"
+        else:
+            raise Exception(
+                "Couldn't resolve driver platform for " + str(self.shell_flow_type)
+            )
+
+    def _resolve_fpga_part(self):
+        if self.fpga_part is None:
+            # lookup from part map if not specified
+            if self.shell_flow_type == ShellFlowType.VIVADO_ZYNQ:
+                return pynq_part_map[self.board]
+            elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO:
+                return alveo_part_map[self.board]
+            else:
+                raise Exception("Couldn't resolve fpga_part for " + self.board)
+        else:
+            # return as-is when explicitly specified
+            return self.fpga_part
+
+    def _resolve_cycles_per_frame(self):
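+        # e.g. (assumed values) synth_clk_period_ns=5.0 (200 MHz) and
+        # target_fps=1000 give int((10 ** 9 / 5.0) / 1000) = 200000 cycles per frame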
+        if self.target_fps is None:
+            return None
+        else:
+            n_clock_cycles_per_sec = 10 ** 9 / self.synth_clk_period_ns
+            n_cycles_per_frame = n_clock_cycles_per_sec / self.target_fps
+            return int(n_cycles_per_frame)
+
+    def _resolve_vitis_opt_strategy(self):
+        # convert human-readable enum to value expected by v++
+        name_to_strategy = {
+            VitisOptStrategyCfg.DEFAULT: VitisOptStrategy.DEFAULT,
+            VitisOptStrategyCfg.POWER: VitisOptStrategy.POWER,
+            VitisOptStrategyCfg.PERFORMANCE: VitisOptStrategy.PERFORMANCE,
+            VitisOptStrategyCfg.PERFORMANCE_BEST: VitisOptStrategy.PERFORMANCE_BEST,
+            VitisOptStrategyCfg.SIZE: VitisOptStrategy.SIZE,
+            VitisOptStrategyCfg.BUILD_SPEED: VitisOptStrategy.BUILD_SPEED,
+        }
+        return name_to_strategy[self.vitis_opt_strategy]
+
+    def _resolve_verification_steps(self):
+        if self.verify_steps is None:
+            return []
+        else:
+            return self.verify_steps
+
+    def _resolve_verification_io_pair(self):
+        if self.verify_steps is None:
+            return None
+        else:
+            assert os.path.isfile(self.verify_input_npy), (
+                "verify_input_npy not found: " + self.verify_input_npy
+            )
+            verify_input_npy = np.load(self.verify_input_npy)
+            assert os.path.isfile(self.verify_expected_output_npy), (
+                "verify_expected_output_npy not found: "
+                + self.verify_expected_output_npy
+            )
+            verify_expected_output_npy = np.load(self.verify_expected_output_npy)
+            return (verify_input_npy, verify_expected_output_npy)
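+
+# Persistence sketch (illustrative only; relies on the dataclasses_json
+# decorator applied above, and the filename is an assumption):
+#
+#   with open("dataflow_build_config.json", "w") as f:
+#       f.write(cfg.to_json())
+#   with open("dataflow_build_config.json", "r") as f:
+#       cfg = DataflowBuildConfig.from_json(f.read())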
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c1861e5286e92abcf983056f8263daae14334e8
--- /dev/null
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -0,0 +1,562 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.core.modelwrapper import ModelWrapper
+import os
+import json
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import (
+    ApplyConfig,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+)
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline import Streamline
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.transformation.streamline.reorder import (
+    MakeMaxPoolNHWC,
+    MoveScalarLinearPastInvariants,
+)
+from shutil import copy, copytree
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.set_fifo_depths import (
+    InsertAndSetFIFODepths,
+    RemoveShallowFIFOs,
+)
+from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+from finn.transformation.fpgadataflow.vitis_build import VitisBuild
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.set_folding import SetFolding
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.analysis.fpgadataflow.res_estimation import (
+    res_estimation,
+    res_estimation_complete,
+)
+from finn.analysis.fpgadataflow.op_and_param_counts import (
+    aggregate_dict_keys,
+    op_and_param_counts,
+)
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
+from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
+from finn.util.config import extract_model_config_to_json
+from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
+from finn.builder.build_dataflow_config import (
+    DataflowBuildConfig,
+    DataflowOutputType,
+    ShellFlowType,
+    VerificationStepType,
+)
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.core.onnx_exec import execute_onnx
+import numpy as np
+from finn.util.test import execute_parent
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.core.throughput_test import throughput_test_rtlsim
+from copy import deepcopy
+
+
+def verify_step(
+    model: ModelWrapper, cfg: DataflowBuildConfig, step_name: str, need_parent: bool
+):
+    print("Running verification for " + step_name)
+    verify_out_dir = cfg.output_dir + "/verification_output"
+    intermediate_models_dir = cfg.output_dir + "/intermediate_models"
+    os.makedirs(verify_out_dir, exist_ok=True)
+    (in_npy, exp_out_npy) = cfg._resolve_verification_io_pair()
+    if need_parent:
+        assert (
+            cfg.save_intermediate_models
+        ), "Enable save_intermediate_models for verification"
+        parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
+        child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
+        model.save(child_model_fn)
+        out_npy = execute_parent(parent_model_fn, child_model_fn, in_npy)
+    else:
+        inp_tensor_name = model.graph.input[0].name
+        out_tensor_name = model.graph.output[0].name
+        inp_dict = {inp_tensor_name: in_npy}
+        out_dict = execute_onnx(model, inp_dict)
+        out_npy = out_dict[out_tensor_name]
+    res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all()
+    res_to_str = {True: "SUCCESS", False: "FAIL"}
+    res_str = res_to_str[res]
+    verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (step_name, res_str)
+    np.save(verification_output_fn, out_npy)
+    print("Verification for %s : %s" % (step_name, res_str))
+
+
+def step_tidy_up(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Run the tidy-up step on given model. This includes shape and datatype
+    inference, constant folding, and giving nodes and tensors better names.
+    """
+
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+    model = model.transform(RemoveStaticGraphInputs())
+
+    if VerificationStepType.TIDY_UP_PYTHON in cfg._resolve_verification_steps():
+        verify_step(model, cfg, "initial_python", need_parent=False)
+
+    return model
+
+
+def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Run streamlining on given model. Streamlining involves moving floating point
+    scale/shift parameters around, collapsing adjacent ones into a single parameter,
+    then absorbing the scale/shift into the following `MultiThreshold` node.
+    Streamlining requires careful topology design and cannot be applied to all
+    topologies.
+    """
+
+    model = model.transform(MoveScalarLinearPastInvariants())
+    model = model.transform(Streamline())
+    need_lowering = len(model.get_nodes_by_op_type("Conv")) > 0
+    if need_lowering:
+        model = model.transform(LowerConvsToMatMul())
+        model = model.transform(MakeMaxPoolNHWC())
+        model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(ConvertBipolarMatMulToXnorPopcount())
+    model = model.transform(Streamline())
+    # absorb final add-mul nodes into TopK
+    model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(RemoveUnusedTensors())
+
+    if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps():
+        verify_step(model, cfg, "streamlined_python", need_parent=False)
+
+    return model
+
+
+def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Convert eligible nodes to `HLSCustomOp` subclasses that represent HLS
+    layers. The set of nodes and configurations that can be converted to HLS
+    is limited; see the source code of the `convert_to_hls` module for details."""
+
+    mem_mode = cfg.default_mem_mode.value
+    if cfg.standalone_thresholds:
+        # doing this first causes all threshold layers to be standalone
+        model = model.transform(to_hls.InferThresholdingLayer())
+    # needed for bipolar MatMul layers
+    model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
+    # needed for non-bipolar MatMul layers
+    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    # TopK to LabelSelect
+    model = model.transform(to_hls.InferLabelSelectLayer())
+    # input quantization (if any) as standalone threshold
+    model = model.transform(to_hls.InferThresholdingLayer())
+    # needed for convolutions -- TODO always exec?
+    need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0
+    if need_conv:
+        model = model.transform(to_hls.InferConvInpGen())
+        model = model.transform(to_hls.InferStreamingMaxPool())
+        model = model.transform(RemoveCNVtoFCFlatten())
+    # get rid of Transpose -> Transpose identity seq
+    model = model.transform(absorb.AbsorbConsecutiveTransposes())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(InferDataLayouts())
+    return model
+
+
+def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Separate consecutive groups of HLSCustomOp nodes into StreamingDataflowPartition
+    nodes, which point to a separate ONNX file. Dataflow accelerator synthesis
+    can only be performed on those HLSCustomOp sub-graphs."""
+
+    parent_model = model.transform(CreateDataflowPartition())
+    sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")
+    assert len(sdp_nodes) == 1, "Only a single StreamingDataflowPartition supported."
+    sdp_node = sdp_nodes[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    if cfg.save_intermediate_models:
+        parent_model.save(cfg.output_dir + "/intermediate_models/dataflow_parent.onnx")
+    model = ModelWrapper(dataflow_model_filename)
+    return model
+
+
+def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """If target_fps was specified, use the SetFolding transformation to determine
+    parallelization attributes."""
+
+    target_cycles_per_frame = cfg._resolve_cycles_per_frame()
+    if target_cycles_per_frame is not None:
+        model = model.transform(
+            SetFolding(target_cycles_per_frame, mvau_wwidth_max=cfg.mvau_wwidth_max)
+        )
+    return model
+
+
+def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Apply the folding configuration file onto the model to set folding (parallelization)
+    and other attributes, if config file is specified."""
+
+    if cfg.folding_config_file is not None:
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(ApplyConfig(cfg.folding_config_file))
+
+    if VerificationStepType.FOLDED_HLS_CPPSIM in cfg._resolve_verification_steps():
+        # prepare cppsim
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+        verify_step(model, cfg, "folded_hls_cppsim", need_parent=True)
+    return model
+
+
+def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig):
+    "Generate per-layer resource and cycle estimates using analytical models."
+
+    if DataflowOutputType.ESTIMATE_REPORTS in cfg.generate_outputs:
+        report_dir = cfg.output_dir + "/report"
+        os.makedirs(report_dir, exist_ok=True)
+        ops_and_params = model.analysis(op_and_param_counts)
+        with open(report_dir + "/op_and_param_counts.json", "w") as f:
+            json.dump(ops_and_params, f, indent=2)
+        estimate_layer_cycles = model.analysis(exp_cycles_per_layer)
+        with open(report_dir + "/estimate_layer_cycles.json", "w") as f:
+            json.dump(estimate_layer_cycles, f, indent=2)
+        estimate_layer_resources = model.analysis(res_estimation)
+        estimate_layer_resources["total"] = aggregate_dict_keys(
+            estimate_layer_resources
+        )
+        with open(report_dir + "/estimate_layer_resources.json", "w") as f:
+            json.dump(estimate_layer_resources, f, indent=2)
+        estimate_layer_resources_complete = model.analysis(res_estimation_complete)
+        with open(report_dir + "/estimate_layer_config_alternatives.json", "w") as f:
+            json.dump(estimate_layer_resources_complete, f, indent=2)
+        # need to call AnnotateCycles before dataflow_performance
+        model = model.transform(AnnotateCycles())
+        estimate_network_performance = model.analysis(dataflow_performance)
+        # add some more metrics to estimated performance
+        n_clock_cycles_per_sec = (10 ** 9) / cfg.synth_clk_period_ns
+        est_fps = n_clock_cycles_per_sec / estimate_network_performance["max_cycles"]
+        estimate_network_performance["estimated_throughput_fps"] = est_fps
+        est_latency_ns = (
+            estimate_network_performance["critical_path_cycles"]
+            * cfg.synth_clk_period_ns
+        )
+        estimate_network_performance["estimated_latency_ns"] = est_latency_ns
+        with open(report_dir + "/estimate_network_performance.json", "w") as f:
+            json.dump(estimate_network_performance, f, indent=2)
+    return model
+
+
+def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig):
+    "Generate Vivado HLS code to prepare HLSCustomOp nodes for IP generation."
+
+    model = model.transform(
+        PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
+    )
+    return model
+
+
+def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Run Vivado HLS synthesis on generated code for HLSCustomOp nodes,
+    in order to generate IP blocks."""
+
+    model = model.transform(HLSSynthIP())
+    model = model.transform(ReplaceVerilogRelPaths())
+    report_dir = cfg.output_dir + "/report"
+    os.makedirs(report_dir, exist_ok=True)
+    estimate_layer_resources_hls = model.analysis(hls_synth_res_estimation)
+    with open(report_dir + "/estimate_layer_resources_hls.json", "w") as f:
+        json.dump(estimate_layer_resources_hls, f, indent=2)
+    return model
+
+
+def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """
+    Depending on the auto_fifo_depths setting, do one of the following:
+    * if auto_fifo_depths=True:  Run the `InsertAndSetFIFODepths` transformation
+    to attempt to determine the FIFO sizes that provide full throughput. Involves
+    running stitched-IP rtlsim and may take a long time.
+    * if auto_fifo_depths=False:  Assume the folding config file contains FIFO
+    sizes as well. Runs the `InsertFIFO` transformation, then
+    `ApplyConfig(cfg.folding_config_file)`, and finally `RemoveShallowFIFOs`.
+    Coherency with config file node naming is ensured by calling
+    `GiveUniqueNodeNames`.
+    """
+
+    if cfg.auto_fifo_depths:
+        model = model.transform(
+            InsertAndSetFIFODepths(
+                cfg._resolve_fpga_part(),
+                cfg._resolve_hls_clk_period(),
+                vivado_ram_style=cfg.large_fifo_mem_style.value,
+            )
+        )
+    else:
+        # assume folding cfg json contains FIFO sizes too
+        # insert DWCs, FIFOs and run ApplyConfig once more
+        model = model.transform(InsertDWC())
+        # need to make sure all FIFOs are created so that their depth can be
+        # set by ApplyConfig, so create_shallow_fifos=True
+        model = model.transform(InsertFIFO(create_shallow_fifos=True))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveReadableTensorNames())
+        if cfg.folding_config_file is not None:
+            model = model.transform(ApplyConfig(cfg.folding_config_file))
+        # remove any shallow FIFOs
+        model = model.transform(RemoveShallowFIFOs())
+
+    # extract the final configuration and save it as json
+    hw_attrs = [
+        "PE",
+        "SIMD",
+        "ram_style",
+        "depth",
+        "impl_style",
+        "resType",
+        "mem_mode",
+        "runtime_writeable_weights",
+    ]
+    extract_model_config_to_json(
+        model, cfg.output_dir + "/final_hw_config.json", hw_attrs
+    )
+
+    # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
+    # this will only run for the new nodes (e.g. FIFOs and DWCs)
+    model = model.transform(
+        PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
+    )
+    model = model.transform(HLSSynthIP())
+    return model
+
+
+def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Create stitched IP for a graph after all HLS IP blocks have been generated.
+    Depends on the DataflowOutputType.STITCHED_IP output product."""
+
+    if DataflowOutputType.STITCHED_IP in cfg.generate_outputs:
+        stitched_ip_dir = cfg.output_dir + "/stitched_ip"
+        model = model.transform(
+            CreateStitchedIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)
+        )
+        # TODO copy all ip sources into output dir? as zip?
+        copytree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir)
+        print("Vivado stitched IP written into " + stitched_ip_dir)
+    if VerificationStepType.STITCHED_IP_RTLSIM in cfg._resolve_verification_steps():
+        # prepare ip-stitched rtlsim
+        verify_model = deepcopy(model)
+        # rtlsim only supports impl_style=rtl for StreamingFIFO, so ensure that is set
+        for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
+            getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl")
+        # similarly for StreamingDataWidthConverter with impl_style=hls
+        for dwc_layer in verify_model.get_nodes_by_op_type(
+            "StreamingDataWidthConverter_Batch"
+        ):
+            getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls")
+        verify_model = verify_model.transform(PrepareRTLSim())
+        verify_model.set_metadata_prop("exec_mode", "rtlsim")
+        verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True)
+    return model
+
+
+def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Measure performance + latency of stitched-IP model in rtlsim (pyverilator).
+    Depends on the DataflowOutputType.STITCHED_IP output product.
+    """
+
+    if DataflowOutputType.RTLSIM_PERFORMANCE in cfg.generate_outputs:
+        assert (
+            DataflowOutputType.STITCHED_IP in cfg.generate_outputs
+        ), "rtlsim_perf needs stitched IP"
+        # prepare ip-stitched rtlsim
+        rtlsim_model = deepcopy(model)
+        # rtlsim only supports impl_style=rtl for StreamingFIFO, so ensure that is set
+        for fifo_layer in rtlsim_model.get_nodes_by_op_type("StreamingFIFO"):
+            getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl")
+        # similarly for StreamingDataWidthConverter with impl_style=hls
+        for dwc_layer in rtlsim_model.get_nodes_by_op_type(
+            "StreamingDataWidthConverter_Batch"
+        ):
+            getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls")
+        rtlsim_model = rtlsim_model.transform(PrepareRTLSim())
+        rtlsim_model.set_metadata_prop("exec_mode", "rtlsim")
+        # run with single input to get latency
+        rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, 1)
+        rtlsim_latency = rtlsim_perf_dict["cycles"]
+        # run with a number of inputs equal to the number of nodes, to fill
+        # the whole pipeline and get the steady-state throughput
+        rtlsim_bs = len(rtlsim_model.graph.node)
+        rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
+        rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+        report_dir = cfg.output_dir + "/report"
+        os.makedirs(report_dir, exist_ok=True)
+        with open(report_dir + "/rtlsim_performance.json", "w") as f:
+            json.dump(rtlsim_perf_dict, f, indent=2)
+
+    return model
+
+
+def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Create a PYNQ Python driver that can be used to interface the generated
+    accelerator."""
+
+    if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs:
+        driver_dir = cfg.output_dir + "/driver"
+        model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform()))
+        copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir)
+        print("PYNQ Python driver written into " + driver_dir)
+    return model
+
+
+def step_out_of_context_synthesis(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Run out-of-context synthesis and generate reports.
+    Depends on the DataflowOutputType.STITCHED_IP output product."""
+    if DataflowOutputType.OOC_SYNTH in cfg.generate_outputs:
+        assert (
+            DataflowOutputType.STITCHED_IP in cfg.generate_outputs
+        ), "OOC needs stitched IP"
+        model = model.transform(
+            SynthOutOfContext(
+                part=cfg._resolve_fpga_part(), clk_period_ns=cfg.synth_clk_period_ns
+            )
+        )
+        report_dir = cfg.output_dir + "/report"
+        os.makedirs(report_dir, exist_ok=True)
+        ooc_res_dict = model.get_metadata_prop("res_total_ooc_synth")
+        ooc_res_dict = eval(ooc_res_dict)
+
+        estimate_network_performance = model.analysis(dataflow_performance)
+        # add some more metrics to estimated performance
+        n_clock_cycles_per_sec = float(ooc_res_dict["fmax_mhz"]) * (10 ** 6)
+        est_fps = n_clock_cycles_per_sec / estimate_network_performance["max_cycles"]
+        ooc_res_dict["estimated_throughput_fps"] = est_fps
+        with open(report_dir + "/ooc_synth_and_timing.json", "w") as f:
+            json.dump(ooc_res_dict, f, indent=2)
+    return model
+
+
+def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Synthesize a bitfile for the using the specified shell flow, using either
+    Vivado or Vitis, to target the specified board."""
+
+    if DataflowOutputType.BITFILE in cfg.generate_outputs:
+        bitfile_dir = cfg.output_dir + "/bitfile"
+        os.makedirs(bitfile_dir, exist_ok=True)
+        report_dir = cfg.output_dir + "/report"
+        os.makedirs(report_dir, exist_ok=True)
+        if cfg.shell_flow_type == ShellFlowType.VIVADO_ZYNQ:
+            model = model.transform(
+                ZynqBuild(cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug)
+            )
+            copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.bit")
+            copy(model.get_metadata_prop("hw_handoff"), bitfile_dir + "/finn-accel.hwh")
+            copy(
+                model.get_metadata_prop("vivado_synth_rpt"),
+                report_dir + "/post_synth_resources.xml",
+            )
+            vivado_pynq_proj_dir = model.get_metadata_prop("vivado_pynq_proj")
+            timing_rpt = (
+                "%s/finn_zynq_link.runs/impl_1/top_wrapper_timing_summary_routed.rpt"
+                % vivado_pynq_proj_dir
+            )
+            copy(timing_rpt, report_dir + "/post_route_timing.rpt")
+
+        elif cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO:
+            model = model.transform(
+                VitisBuild(
+                    cfg._resolve_fpga_part(),
+                    cfg.synth_clk_period_ns,
+                    cfg.vitis_platform,
+                    strategy=cfg._resolve_vitis_opt_strategy(),
+                    enable_debug=cfg.enable_hw_debug,
+                    floorplan_file=cfg.vitis_floorplan_file,
+                )
+            )
+            copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.xclbin")
+            copy(
+                model.get_metadata_prop("vivado_synth_rpt"),
+                report_dir + "/post_synth_resources.xml",
+            )
+        else:
+            raise Exception("Unrecognized shell_flow_type: " + str(cfg.shell_flow_type))
+        print("Bitfile written into " + bitfile_dir)
+
+    return model
+
+
+def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Create a deployment package including the driver and bitfile."""
+
+    if DataflowOutputType.DEPLOYMENT_PACKAGE in cfg.generate_outputs:
+        deploy_dir = cfg.output_dir + "/deploy"
+        bitfile_dir = cfg.output_dir + "/bitfile"
+        driver_dir = cfg.output_dir + "/driver"
+        os.makedirs(deploy_dir, exist_ok=True)
+        copytree(bitfile_dir, deploy_dir + "/bitfile")
+        copytree(driver_dir, deploy_dir + "/driver")
+    return model
+
+
+#: map step name strings to step functions
+build_dataflow_step_lookup = {
+    "step_tidy_up": step_tidy_up,
+    "step_streamline": step_streamline,
+    "step_convert_to_hls": step_convert_to_hls,
+    "step_create_dataflow_partition": step_create_dataflow_partition,
+    "step_target_fps_parallelization": step_target_fps_parallelization,
+    "step_apply_folding_config": step_apply_folding_config,
+    "step_generate_estimate_reports": step_generate_estimate_reports,
+    "step_hls_codegen": step_hls_codegen,
+    "step_hls_ipgen": step_hls_ipgen,
+    "step_set_fifo_depths": step_set_fifo_depths,
+    "step_create_stitched_ip": step_create_stitched_ip,
+    "step_measure_rtlsim_performance": step_measure_rtlsim_performance,
+    "step_make_pynq_driver": step_make_pynq_driver,
+    "step_out_of_context_synthesis": step_out_of_context_synthesis,
+    "step_synthesize_bitfile": step_synthesize_bitfile,
+    "step_deployment_package": step_deployment_package,
+}
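+
+# Entries in DataflowBuildConfig.steps may also be plain callables with the same
+# (model, cfg) signature, mixed with the step name strings above. A minimal
+# hypothetical example (not part of the default flow):
+#
+# def step_print_node_count(model: ModelWrapper, cfg: DataflowBuildConfig):
+#     print("Dataflow graph has %d nodes" % len(model.graph.node))
+#     return model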
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 9222720543bb463f62be76e980c222194d237f44..38940ccb94f11fe49af5f49ee020f150326a026c 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -29,7 +29,7 @@
 import os
 
 import numpy as np
-
+import warnings
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from onnx import TensorProto, helper
@@ -99,11 +99,17 @@ class AddStreams_Batch(HLSCustomOp):
         )
 
     def infer_node_datatype(self, model):
-        # check input datatype against property
-        exp_idt_name = self.get_input_datatype().name
-        idt_name = self.get_nodeattr("inputDataType")
-        assert exp_idt_name == idt_name, "Bad input DataType for AddStreams layer"
-        # enforce output data type
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        # enforce output data type (calculated based on idt)
         odt = self.get_output_datatype()
         model.set_tensor_datatype(self.onnx_node.output[0], odt)
 
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 635f37d5695a56d7c22f2287030ccb7331ab347b..097ec336ff24cd826e6530c42b7cdb1108971fa1 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -363,7 +363,7 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         # get desired function
         func = self.get_nodeattr("Func")
         if func == "cmp_le":
-            func_str = "std::less_equal"
+            func_str = "comp::less_equal"
         elif func == "cmp_ge":
             func_str = "std::greater_equal"
         elif func == "add":
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
index e7e0c00ccd0b82643dbff15a0426fdc3831bd685..002f71aa30b4cf94c63d572e536999327eb2a527 100644
--- a/src/finn/custom_op/fpgadataflow/downsampler.py
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -4,6 +4,7 @@ from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+import warnings
 
 
 class DownSampler(HLSCustomOp):
@@ -98,10 +99,16 @@ class DownSampler(HLSCustomOp):
     def infer_node_datatype(self, model):
         node = self.onnx_node
         # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        exp_idtype = self.get_input_datatype()
-        assert dtype == exp_idtype, "Unexpected datatype for DownSampler"
-        model.set_tensor_datatype(node.output[0], dtype)
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
 
     def verify_node(self):
         pass
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 370c87c8618da2bb2eac5ee4c20ad86d64b03703..73da77bd3f940cee5ffd10fcfc43571f1a612eb4 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -29,7 +29,7 @@
 import os
 
 import numpy as np
-
+import warnings
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from onnx import helper, TensorProto
@@ -104,6 +104,16 @@ class DuplicateStreams_Batch(HLSCustomOp):
         return shape_comp_node
 
     def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
         odt = self.get_output_datatype()
         model.set_tensor_datatype(self.onnx_node.output[0], odt)
         model.set_tensor_datatype(self.onnx_node.output[1], odt)
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index e8efa3abb4e75830bf31cd88c8cb21f517e0a9f7..27dfab54ec6d483d948dd383e54a44117d7c1a65 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -4,6 +4,7 @@ from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+import warnings
 
 
 class FMPadding_Batch(HLSCustomOp):
@@ -103,11 +104,16 @@ class FMPadding_Batch(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
-        # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        exp_idtype = self.get_input_datatype()
-        assert dtype == exp_idtype, "Unexpected datatype for FMPadding_Batch"
-        model.set_tensor_datatype(node.output[0], dtype)
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
 
     def verify_node(self):
         pass
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 6035ad75d8037b6f93eb38700930c535a5409298..8cc71ce9eb57c2dcf1f743a7b96e501ab833f6cd 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -29,7 +29,7 @@
 import os
 
 import numpy as np
-
+import warnings
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from onnx import TensorProto, helper
@@ -110,6 +110,16 @@ class GlobalAccPool_Batch(HLSCustomOp):
         )
 
     def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
         odt = self.get_output_datatype()
         model.set_tensor_datatype(self.onnx_node.output[0], odt)
 
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 3431061e772e7eda310733f1a0d31f4b2db154ac..39069e4c157f37ea65acf7c7b3da7a78e1ab2d0e 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -91,7 +91,22 @@ class HLSCustomOp(CustomOp):
             "res_synth": ("s", False, ""),
             "rtlsim_so": ("s", False, ""),
             # partitioning info
+            # ID of SLR to which the Op is attached in Vitis builds
+            # Set to -1 as 'don't care'
+            "slr": ("i", False, -1),
+            # Vitis memory port to which any AXI-MM interface
+            # of this Op should be attached in Vitis builds
+            # E.g.: "DDR[0]", "HBM[0]", "PLRAM[0]"
+            "mem_port": ("s", False, ""),
+            # Partition to which the Op belongs; all Ops with the
+            # same partition_id are stitched together
+            # Users should avoid setting this attribute manually
+            # and instead use the floorplan transform to set
+            # partition IDs from Vitis design rules and SLR IDs
             "partition_id": ("i", False, 0),
+            # ID of FPGA device to which this Op is allocated, in
+            # a multi-FPGA setting
+            "device_id": ("i", False, 0),
             # input and output FIFO depths
             "inFIFODepth": ("i", False, 2),
             "outFIFODepth": ("i", False, 2),
@@ -195,6 +210,7 @@ class HLSCustomOp(CustomOp):
         ret["BRAM_efficiency"] = self.bram_efficiency_estimation()
         ret["LUT"] = self.lut_estimation()
         ret["URAM"] = self.uram_estimation()
+        ret["URAM_efficiency"] = self.uram_efficiency_estimation()
         ret["DSP"] = self.dsp_estimation()
         return ret
 
@@ -203,6 +219,11 @@ class HLSCustomOp(CustomOp):
         needed divided by the allocated BRAM storage (from estimation)"""
         return 1
 
+    def uram_efficiency_estimation(self):
+        """Function for URAM efficiency estimation: actual parameter storage
+        needed divided by the allocated URAM storage (from estimation)"""
+        return 1
+
     def bram_estimation(self):
         """Function for BRAM resource estimation, is member function of
         HLSCustomOp class but has to be filled by every node"""
@@ -229,6 +250,13 @@ class HLSCustomOp(CustomOp):
         by every node"""
         return 0
 
+    def get_op_and_param_counts(self):
+        """Return a dictionary with number of ops needed per inference for
+        this layer as well as parameter count (weights, thresholds, etc.).
+        Entries should be in the format:
+        {op_<optype> : <count>, param_<paramtype>: <count>}."""
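+        # e.g. (purely illustrative keys and values):
+        # {"op_mac_2bit": 100352, "param_weight_2bit": 36864}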
+        return {}
+
     def code_generation_ipgen(self, model, fpgapart, clk):
         """Generates c++ code and tcl script for ip generation."""
         node = self.onnx_node
@@ -289,8 +317,14 @@ class HLSCustomOp(CustomOp):
         builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name))
         builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name))
         builder.build(code_gen_dir)
-        self.set_nodeattr("ipgen_path", builder.ipgen_path)
-        self.set_nodeattr("ip_path", builder.ipgen_path + "/sol1/impl/ip")
+        ipgen_path = builder.ipgen_path
+        assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path)
+        self.set_nodeattr("ipgen_path", ipgen_path)
+        ip_path = ipgen_path + "/sol1/impl/ip"
+        assert os.path.isdir(
+            ip_path
+        ), "IPGen failed: %s not found. Check log under %s" % (ip_path, code_gen_dir)
+        self.set_nodeattr("ip_path", ip_path)
         vlnv = "xilinx.com:hls:%s:1.0" % node.name
         self.set_nodeattr("ip_vlnv", vlnv)
 
@@ -628,7 +662,12 @@ compilation transformations?
         return roundup_to_integer_multiple(out_width, 8)
 
     def get_ap_int_max_w(self):
-        "Return the maximum width of any ap_int used in this module."
+        """Return the maximum width of any ap_int used in this module. Used to set the
+        AP_INT_MAX_W definition for HLS."""
         instream = self.get_instream_width()
         outstream = self.get_outstream_width()
-        return max([instream, outstream])
+        ret = max([instream, outstream])
+        assert ret <= 32768, (
+            "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret
+        )
+        return ret
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 0ab8bf295927f233b5785f76a1d6894c7993f9ef..857496a2614894588ebf065db3e384cf2cecf106 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -31,7 +31,7 @@ import math
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-
+import warnings
 
 # the IODMA inerfaces a memory-mapped AXI interface and an AXI stream
 # direction "in": pulls data from AXI-MM to AXI stream
@@ -161,11 +161,16 @@ class IODMA(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
-        # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        exp_idtype = self.get_input_datatype()
-        assert dtype == exp_idtype, "Unexpected datatype."
-        model.set_tensor_datatype(node.output[0], dtype)
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("dataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
 
     def verify_node(self):
         pass
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 39fa87baa08cb43ea7cb4f3d2aa2159b07b8522b..1640e2f27c4672449775fa1c6f2d9b9745e305c4 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -253,6 +253,12 @@ class LabelSelect_Batch(HLSCustomOp):
         assert (
             context[node.output[0]].shape == exp_oshape
         ), """Output shape doesn't match expected shape."""
+        # TopK ind output normally uses TensorProto.INT64, which
+        # can cause issues for the node-by-node simulation in FINN
+        # (as the custom DataType system always assumes float containers)
+        # so cast the output to int64
+        ret = context[node.output[0]]
+        context[node.output[0]] = ret.astype(np.int64)
 
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
@@ -346,3 +352,9 @@ class LabelSelect_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+
+    def get_exp_cycles(self):
+        nlabels = self.get_nodeattr("Labels")
+        pe = self.get_nodeattr("PE")
+        exp_cycles = nlabels / pe
+        return int(exp_cycles)
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index e2d97a0eaa29604006790a542157639c5c776b22..4d84b74dce001fca769ed2850a8f718ac942f14c 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -29,7 +29,7 @@
 import os
 import numpy as np
 import math
-
+import warnings
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
@@ -180,9 +180,17 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("dataType", idt.name)
         # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        model.set_tensor_datatype(node.output[0], dtype)
+        model.set_tensor_datatype(node.output[0], idt)
 
     def verify_node(self):
         info_messages = []
@@ -408,9 +416,14 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             )
             cmd.append(
                 "set_property -dict "
-                "[list CONFIG.S_TDATA_NUM_BYTES.VALUE_SRC PROPAGATED] "
+                "[list CONFIG.S_TDATA_NUM_BYTES.VALUE_SRC USER] "
                 "[get_bd_cells /%s/dwc]" % node_name
             )
+            cmd.append(
+                "set_property -dict "
+                "[list CONFIG.S_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]"
+                % (np.ceil(self.get_instream_width() / 8), node_name)
+            )
             cmd.append(
                 "set_property -dict "
                 "[list CONFIG.M_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]"
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 10e0fbbde4f485a9fc9febb21308c9b0c49da041..3cc01ade73fc6b735509f2839e5c10785a8b9f54 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import warnings
 import math
 import os
 import numpy as np
@@ -95,8 +96,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # auto -- let Vivado decide
             # block -- use BRAM
             # distributed -- use LUTRAM
+            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
             # see also https://www.xilinx.com/support/answers/38070.html
-            "ram_style": ("s", False, "auto", {"auto", "block", "distributed"}),
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -149,10 +156,15 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
-        # check input datatype against property
-        idt_name = self.get_input_datatype().name
-        exp_idt_name = self.get_nodeattr("inputDataType")
-        assert exp_idt_name == idt_name, "Bad input DataType for StreamingFCLayer"
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
         # set output datatype from property
         odt = self.get_output_datatype()
         model.set_tensor_datatype(node.output[0], odt)
@@ -215,6 +227,25 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         return info_messages
 
+    def uram_estimation(self):
+        P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        D_in = self.get_nodeattr("MW")
+        D_out = self.get_nodeattr("MH")
+        omega = (D_in * D_out) / (Q * P)
+        mem_width = Q * W * P
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle != "ultra") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            return 0
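+        # a URAM block is 72 bits wide and 4096 entries deep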
+        width_multiplier = math.ceil(mem_width / 72)
+        depth_multiplier = math.ceil(omega / 4096)
+        return width_multiplier * depth_multiplier
+
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -234,7 +265,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         mem_width = Q * W * P
         mmode = self.get_nodeattr("mem_mode")
         mstyle = self.get_nodeattr("ram_style")
-        if (mmode == "decoupled" and mstyle == "distributed") or (
+        if (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) or (
             mmode == "const" and self.calc_wmem() <= 128
         ):
             return 0
@@ -265,6 +296,20 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
+    def uram_efficiency_estimation(self):
+        """Function for URAM efficiency estimation: actual parameter storage
+        needed divided by the allocated URAM storage (from estimation)"""
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        D_in = self.get_nodeattr("MW")
+        D_out = self.get_nodeattr("MH")
+        uram_est = self.uram_estimation()
+        if uram_est == 0:
+            return 1
+        wbits = W * D_in * D_out
+        uram_est_capacity = uram_est * 72 * 4096
+        return wbits / uram_est_capacity
+
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -389,9 +434,15 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         return roundup_to_integer_multiple(weight_width, 8)
 
     def get_ap_int_max_w(self):
-        temp_value = super().get_ap_int_max_w()
+        # base class impl (max of inp/out stream widths)
+        max_of_io = super().get_ap_int_max_w()
+        # decoupled mode weight stream
         weightstream = self.get_weightstream_width()
-        return max([weightstream, temp_value])
+        # single PE weight entry
+        weight_bits = self.get_weight_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        single_pe_w = simd * weight_bits
+        return max([weightstream, max_of_io, single_pe_w])
 
     def get_folded_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
@@ -527,6 +578,20 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # set threshold datatype (and accumulator datatype implicitly)
             min_threshold = thresholds.min()
             max_threshold = thresholds.max()
+            # clip threshold values
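+            # accumulator values always lie in [acc_min, acc_max], so thresholds
+            # outside this range yield the same comparison results after being
+            # clipped to acc_min / acc_max + 1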
+            clip_upper = None
+            clip_lower = None
+            if max_threshold > acc_max + 1:
+                clip_upper = acc_max + 1
+            if min_threshold < acc_min:
+                clip_lower = acc_min
+            if (clip_lower is not None) or (clip_upper is not None):
+                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
+                thresholds = np.clip(thresholds, clip_lower, clip_upper)
+                model.set_initializer(self.onnx_node.input[2], thresholds)
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                min_threshold = thresholds.min()
+                max_threshold = thresholds.max()
             # get range required by threshold values
             tdt_min = min(acc_min, min_threshold)
             tdt_max = max(acc_max, max_threshold)
@@ -537,9 +602,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     tdt = DataType.get_smallest_possible(0 - tdt_max)
             else:
                 tdt = DataType.get_smallest_possible(tdt_max)
-            assert np.vectorize(tdt.allowed)(
-                threshold_tensor
-            ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+            assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
+                "Thresholds in %s can't be expressed with type %s"
+                % (self.onnx_node.name, str(tdt))
+            )
             self.set_nodeattr("accDataType", tdt.name)
         else:
             if acc_min < 0:
@@ -589,6 +655,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # ensure all thresholds are integer
             assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
+        # workaround for vivado_hls threshold bug
+        if ret[0][0] == 0:
+            ret = np.copy(ret)
+            ret[0][0] = 1
+            warnings.warn(
+                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
+            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
@@ -735,6 +808,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             if mem_mode == "decoupled":
                 # also save weights as Verilog .dat file
                 weight_filename_rtl = "{}/memblock_0.dat".format(code_gen_dir)
+                ram_style = self.get_nodeattr("ram_style")
+                if ram_style == "ultra":
+                    # UltraRAM must have no memory initializer, or only zeroes
+                    # otherwise BRAM will be inferred instead of URAM
+                    # as a workaround we provide a zero-weight init here
+                    # TODO handle this in Verilog with an if statement
+                    weights = np.zeros_like(weights)
                 self.make_weight_file(
                     weights, "decoupled_verilog_dat", weight_filename_rtl
                 )
@@ -761,9 +841,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 # get computed threshold datatype from attribute
                 tdt = DataType[self.get_nodeattr("accDataType")]
 
-                assert np.vectorize(tdt.allowed)(
-                    threshold_tensor
-                ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+                assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
+                    "Thresholds in %s can't be expressed with type %s"
+                    % (self.onnx_node.name, str(tdt))
+                )
                 thresholds_hls_code = numpy_to_hls_code(
                     threshold_tensor, tdt, "thresholds", False, True
                 )
@@ -784,7 +865,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                         tdt_hls,
                         odt_hls,
                         self.get_nodeattr("ActVal"),
-                        "std::less_equal<%s>" % tdt_hls,
+                        "comp::less_equal<%s>" % tdt_hls,
                     )
                 )
                 f_thresh.write(thresholds_hls_code)
@@ -1175,8 +1256,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # add streamer if needed
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "decoupled":
-            node_name = self.onnx_node.name
             runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
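+            # URAM weight memories are exported with an all-zero initializer
+            # (see the zero-weight workaround in the weight file generation),
+            # so the real weights must be written over AXI-lite at runtime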
+            if self.get_nodeattr("ram_style") == "ultra":
+                assert (
+                    runtime_writable == 1
+                ), "Layer with URAM weights must have runtime_writeable_weights=1"
+            node_name = self.onnx_node.name
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
@@ -1295,3 +1380,27 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             if runtime_writable:
                 intf_names["axilite"] = ["s_axilite"]
         return intf_names
+
+    def get_op_and_param_counts(self):
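+        """Returns a dictionary with the number of MAC operations and
+        parameter counts (weights and, if present, thresholds) for this
+        layer, keyed by operation/parameter type and bitwidth."""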
+        in_features = self.get_nodeattr("MW")
+        out_features = self.get_nodeattr("MH")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_inp_vec = self.get_nodeattr("numInputVectors")
+        num_repetitions = int(np.prod(num_inp_vec))
+        mac_count = in_features * out_features * num_repetitions
+        # canonicalize op type: lowest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = in_features * out_features
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = out_features
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 9063f018bdcf64c9664e92eeabec539ee2c721af..133a869b28cf9968a719e243a3266dfb25b637ba 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -94,9 +94,17 @@ class StreamingFIFO(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("dataType", idt.name)
         # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        model.set_tensor_datatype(node.output[0], dtype)
+        model.set_tensor_datatype(node.output[0], idt)
 
     def verify_node(self):
         pass
@@ -154,7 +162,9 @@ class StreamingFIFO(HLSCustomOp):
         template = templates.ip_package_tcl
         self.code_gen_dict.clear()
         self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
-        self.code_gen_dict["$VERILOG_DIR$"] = [verilog_dir]
+        # note: setting the root dir as absolute can cause path problems
+        # the ipgen script will be invoked from the sources dir so root_dir=. is OK
+        self.code_gen_dict["$VERILOG_DIR$"] = ["."]
         for key in self.code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(self.code_gen_dict[key])
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 7850a85ccf61c7e4a26c25b807d6613a1ad66c5a..07e1197af54fe5267995cf15424a02df8a5e1500 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -28,7 +28,7 @@
 
 import os
 import numpy as np
-
+import warnings
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.core.datatype import DataType
@@ -132,9 +132,17 @@ class StreamingMaxPool_Batch(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("dataType", idt.name)
         # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        model.set_tensor_datatype(node.output[0], dtype)
+        model.set_tensor_datatype(node.output[0], idt)
 
     def verify_node(self):
         info_messages = []
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index a0ca34ed0a6838dfe9c680cc6c16961ac7f897ed..40221ce3b303fc9c1ec1851c7260c82dc3f0b40a 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -310,6 +310,7 @@ set_property core_revision 2 [ipx::current_core]
 ipx::create_xgui_files [ipx::current_core]
 ipx::update_checksums [ipx::current_core]
 ipx::save_core [ipx::current_core]
+ipx::archive_core $Top.zip [ipx::current_core]
 """
 
 strm_fifo_wrapper = """
@@ -371,7 +372,7 @@ void Thresholding_Stream_Batch(hls::stream<TI> &in,
   // alternatively: number of vertical matrix chunks
   unsigned const NF = NumChannels / PE;
 
-  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, std::less_equal<TT>> internal_thr;
+  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, comp::less_equal<TT>> internal_thr;
   #pragma HLS ARRAY_PARTITION variable=internal_thr.m_thresholds complete dim=0
 
   // everything merged into a common iteration space (one "big" loop instead
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 8a944fe77dc938db4154bb0a2ffcff8fdaefbd72..30374a7d97f4d2189e142a9b7b6e44a5abbb46b0 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -29,7 +29,7 @@
 from math import ceil, log2
 import textwrap
 import os
-
+import warnings
 import numpy as np
 
 from onnx import TensorProto, helper
@@ -127,10 +127,15 @@ class Thresholding_Batch(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
-        # check input datatype against property
-        idt_name = self.get_input_datatype().name
-        exp_idt_name = self.get_nodeattr("inputDataType")
-        assert exp_idt_name == idt_name, "Bad input DataType for Thresholding layer"
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
         # set output datatype from property
         odt = self.get_output_datatype()
         model.set_tensor_datatype(node.output[0], odt)
@@ -325,8 +330,17 @@ class Thresholding_Batch(HLSCustomOp):
             # ensure all thresholds are nonnegative
             assert (orig_thres_matrix >= 0).all()
         # ensure all thresholds are integer
-        assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
+        assert np.equal(
+            np.mod(orig_thres_matrix, 1), 0
+        ).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
+        # workaround for vivado_hls threshold bug
+        if ret[0][0] == 0:
+            ret = np.copy(ret)
+            ret[0][0] = 1
+            warnings.warn(
+                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
+            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
@@ -387,7 +401,7 @@ class Thresholding_Batch(HLSCustomOp):
                     tdt_hls,
                     odt_hls,
                     self.get_nodeattr("ActVal"),
-                    "std::less_equal<%s>" % tdt_hls,
+                    "comp::less_equal<%s>" % tdt_hls,
                 )
             )
             f_thresh.write(thresholds_hls_code)
@@ -922,3 +936,14 @@ class Thresholding_Batch(HLSCustomOp):
             if runtime_writable:
                 intf_names["axilite"] = ["s_axilite"]
         return intf_names
+
+    def get_op_and_param_counts(self):
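+        """Returns a dictionary with the threshold parameter count for this
+        layer, keyed by parameter type and bitwidth."""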
+        ret_dict = {}
+        weight_bits = self.get_weight_datatype().bitwidth()
+        out_features = self.get_nodeattr("NumChannels")
+        num_steps = self.get_nodeattr("numSteps")
+        # thresholds are called weights in this layer
+        thres_param_type = "param_threshold_%db" % (weight_bits)
+        thres_count = out_features * num_steps
+        ret_dict[thres_param_type] = thres_count
+        return ret_dict
diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
index 333884f361983e2a465715f3f4119c9c6384558e..9a897d9fa16064017dfc02f500d2360ae8431b4a 100644
--- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -1,16 +1,20 @@
 import os
 import numpy as np
 import math
-
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+    calculate_matvec_accumulator_range,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
     rtlsim_output_to_npy,
 )
+import warnings
 
 
 class Vector_Vector_Activate_Batch(HLSCustomOp):
@@ -31,12 +35,78 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
+            # FINN DataType for accumulator -- auto-computed and updated
+            "accDataType": ("s", False, "INT32"),
             # no-activation mode (produce accumulators)
             "noActivation": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def minimize_accumulator_width(self, model):
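+        """Computes the smallest accumulator datatype that can represent the
+        result range of this layer (including any thresholds), updating the
+        accDataType attribute and, for no-activation nodes, outputDataType."""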
+        weights = model.get_initializer(self.onnx_node.input[1])
+        k = self.get_nodeattr("Kernel")
+        fm = self.get_nodeattr("Channels")
+        # put weights into the shape expected by calculate_matvec_accumulator_range
+        weights = weights.reshape(fm, k * k).transpose()
+        if len(self.onnx_node.input) > 2:
+            thresholds = model.get_initializer(self.onnx_node.input[2])
+        else:
+            thresholds = None
+        idt = self.get_input_datatype()
+        # calculate minimum and maximum values of accumulator
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        if thresholds is not None:
+            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+            # set threshold datatype (and accumulator datatype implicitly)
+            min_threshold = thresholds.min()
+            max_threshold = thresholds.max()
+            # clip threshold values
+            clip_upper = None
+            clip_lower = None
+            if max_threshold > acc_max + 1:
+                clip_upper = acc_max + 1
+            if min_threshold < acc_min:
+                clip_lower = acc_min
+            if (clip_lower is not None) or (clip_upper is not None):
+                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
+                thresholds = np.clip(thresholds, clip_lower, clip_upper)
+                model.set_initializer(self.onnx_node.input[2], thresholds)
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                min_threshold = thresholds.min()
+                max_threshold = thresholds.max()
+            # get range required by threshold values
+            tdt_min = min(acc_min, min_threshold)
+            tdt_max = max(acc_max, max_threshold)
+            if tdt_min < 0:
+                if abs(tdt_min) > tdt_max:
+                    tdt = DataType.get_smallest_possible(tdt_min)
+                else:
+                    tdt = DataType.get_smallest_possible(0 - tdt_max)
+            else:
+                tdt = DataType.get_smallest_possible(tdt_max)
+            assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
+                "Thresholds in %s can't be expressed with type %s"
+                % (self.onnx_node.name, str(tdt))
+            )
+            self.set_nodeattr("accDataType", tdt.name)
+        else:
+            if acc_min < 0:
+                if abs(acc_min) > acc_max:
+                    adt = DataType.get_smallest_possible(acc_min)
+                else:
+                    adt = DataType.get_smallest_possible(0 - acc_max)
+            else:
+                adt = DataType.get_smallest_possible(acc_max)
+            # ensure a bitwidth divisible by 8 in case this is the last node
+            bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+            new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+            adt = DataType[new_adt_name]
+            self.set_nodeattr("accDataType", adt.name)
+            # for no-activation nodes, output dt = acc dt
+            self.set_nodeattr("outputDataType", adt.name)
+        return DataType[self.get_nodeattr("accDataType")]
+
     def calc_wmem(self):
         """Calculates and returns WMEM."""
         ch = self.get_nodeattr("Channels")
@@ -72,10 +142,15 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
-        # check input datatype against property
-        idt_name = self.get_input_datatype().name
-        exp_idt_name = self.get_nodeattr("inputDataType")
-        assert exp_idt_name == idt_name, "Bad input DataType for VVAU  node"
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
         # set output datatype from property
         odt = self.get_output_datatype()
         model.set_tensor_datatype(node.output[0], odt)
@@ -203,6 +278,13 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
         ret = orig_thres_matrix
+        # workaround for vivado_hls threshold bug
+        if ret[0][0] == 0:
+            ret = np.copy(ret)
+            ret[0][0] = 1
+            warnings.warn(
+                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
+            )
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         assert (
@@ -254,10 +336,12 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             thresholds = model.get_initializer(self.onnx_node.input[2])
             if thresholds is not None:
                 threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-                tdt = DataType.INT32
-                assert np.vectorize(tdt.allowed)(
-                    threshold_tensor
-                ).all(), "Thresholds are not int"
+                # get computed threshold datatype from attribute
+                tdt = DataType[self.get_nodeattr("accDataType")]
+                assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
+                    "Thresholds in %s can't be expressed with type %s"
+                    % (self.onnx_node.name, str(tdt))
+                )
                 thresholds_hls_code = numpy_to_hls_code(
                     threshold_tensor, tdt, "thresholds", False, True
                 )
@@ -275,7 +359,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                         tdt_hls,
                         odt_hls,
                         self.get_nodeattr("ActVal"),
-                        "std::less_equal<%s>" % tdt_hls,
+                        "comp::less_equal<%s>" % tdt_hls,
                     )
                 )
                 f_thresh.write(thresholds_hls_code)
@@ -373,11 +457,13 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def defines(self, var):
         dim = self.get_nodeattr("Dim")
         numReps = 1 * dim * dim
+        kernel = self.get_nodeattr("Kernel")
+        innerProdDim = kernel * kernel
         self.code_gen_dict["$DEFINES$"] = [
-            """#define Channels1 {}\n #define Kernel1 {}\n
+            """#define Channels1 {}\n #define InnerProdDim {}\n
             #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
                 self.get_nodeattr("Channels"),
-                self.get_nodeattr("Kernel"),
+                innerProdDim,
                 self.get_nodeattr("PE"),
                 numReps,
             )
@@ -422,7 +508,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             threshs = "threshs"
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<Channels1, Kernel1, SIMD1, PE1, 1, {}, {}, {}>
+            """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
             (in0, out, weights, {}, numReps, {});""".format(
                 node.op_type,
                 tmpl_args["TSrcI"],
@@ -606,3 +692,27 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         else:
             mult_dsp = 0
         return int(mult_dsp)
+
+    def get_op_and_param_counts(self):
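+        """Returns a dictionary with the depthwise MAC operation count and
+        parameter counts (weights and, if present, thresholds) for this
+        layer, keyed by operation/parameter type and bitwidth."""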
+        k = self.get_nodeattr("Kernel")
+        fm = self.get_nodeattr("Channels")
+        dim = self.get_nodeattr("Dim")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_repetitions = int(dim * dim)
+        mac_count = k * k * fm * num_repetitions
+        # canonicalize op type: lowest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = k * k * fm
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = fm
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
diff --git a/src/finn/qnn-data/build_dataflow/build.py b/src/finn/qnn-data/build_dataflow/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d9d55a086c410220400f8db09b67ef8d10a59ea
--- /dev/null
+++ b/src/finn/qnn-data/build_dataflow/build.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This file is intended to serve as an example showing how to set up custom builds
+# using FINN. The custom build can be launched like this:
+# ./run-docker.sh build_custom /path/to/folder
+
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+
+model_name = "tfc_w1a1"
+platform_name = "Pynq-Z1"
+
+cfg = build.DataflowBuildConfig(
+    output_dir="output_%s_%s" % (model_name, platform_name),
+    target_fps=100000,
+    mvau_wwidth_max=10000,
+    # can specify detailed folding/FIFO/etc config with:
+    # folding_config_file="folding_config.json",
+    synth_clk_period_ns=10.0,
+    board=platform_name,
+    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+    generate_outputs=[
+        build_cfg.DataflowOutputType.PYNQ_DRIVER,
+        build_cfg.DataflowOutputType.STITCHED_IP,
+        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+        build_cfg.DataflowOutputType.OOC_SYNTH,
+        build_cfg.DataflowOutputType.BITFILE,
+        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
+    ],
+    verify_steps=[
+        build_cfg.VerificationStepType.TIDY_UP_PYTHON,
+        build_cfg.VerificationStepType.STREAMLINED_PYTHON,
+        build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM,
+        build_cfg.VerificationStepType.STITCHED_IP_RTLSIM,
+    ],
+    save_intermediate_models=True,
+)
+model_file = "model.onnx"
+build.build_dataflow_cfg(model_file, cfg)
diff --git a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5e4cf2d3028fb48cbc768b744bb0144d4f0d5fda
--- /dev/null
+++ b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json
@@ -0,0 +1,24 @@
+{
+  "output_dir": "output_tfc_w1a1_Pynq-Z1",
+  "target_fps": 100000,
+  "mvau_wwidth_max": 10000,
+  "synth_clk_period_ns": 10.0,
+  "board": "Pynq-Z1",
+  "standalone_thresholds": true,
+  "shell_flow_type": "vivado_zynq",
+  "verify_steps": [
+    "initial_python",
+    "streamlined_python",
+    "folded_hls_cppsim",
+    "stitched_ip_rtlsim"
+  ],
+  "generate_outputs": [
+    "estimate_reports",
+    "stitched_ip",
+    "rtlsim_performance",
+    "pynq_driver",
+    "out_of_context_synth",
+    "bitfile",
+    "deployment_package"
+  ]
+}
diff --git a/src/finn/qnn-data/build_dataflow/expected_output.npy b/src/finn/qnn-data/build_dataflow/expected_output.npy
new file mode 100644
index 0000000000000000000000000000000000000000..a8d09384633791b7e3760dc8a2d1ba88a05d526d
Binary files /dev/null and b/src/finn/qnn-data/build_dataflow/expected_output.npy differ
diff --git a/src/finn/qnn-data/build_dataflow/folding_config.json b/src/finn/qnn-data/build_dataflow/folding_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fbe289608f68c296c4d86fd0dbe4a07e3d70277
--- /dev/null
+++ b/src/finn/qnn-data/build_dataflow/folding_config.json
@@ -0,0 +1,30 @@
+{
+  "Defaults": {},
+  "Thresholding_Batch_0": {
+    "PE": 49,
+    "ram_style": "distributed"
+  },
+  "StreamingFCLayer_Batch_0": {
+    "PE": 16,
+    "SIMD": 49,
+    "ram_style": "block"
+  },
+  "StreamingFCLayer_Batch_1": {
+    "PE": 8,
+    "SIMD": 8,
+    "ram_style": "auto"
+  },
+  "StreamingFCLayer_Batch_2": {
+    "PE": 8,
+    "SIMD": 8,
+    "ram_style": "auto"
+  },
+  "StreamingFCLayer_Batch_3": {
+    "PE": 10,
+    "SIMD": 8,
+    "ram_style": "distributed"
+  },
+  "LabelSelect_Batch_0": {
+    "PE": 1
+  }
+}
diff --git a/src/finn/qnn-data/build_dataflow/input.npy b/src/finn/qnn-data/build_dataflow/input.npy
new file mode 100644
index 0000000000000000000000000000000000000000..edd24de05a33a15ebc330cdab31f3d77d2c47196
Binary files /dev/null and b/src/finn/qnn-data/build_dataflow/input.npy differ
diff --git a/src/finn/qnn-data/build_dataflow/model.onnx b/src/finn/qnn-data/build_dataflow/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..c513967dce32d0d4e48556f9c99a80f5ae881454
Binary files /dev/null and b/src/finn/qnn-data/build_dataflow/model.onnx differ
diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef16a537ce18c52ea42ce9178a7178e8f8b667dd
--- /dev/null
+++ b/src/finn/qnn-data/templates/driver/driver_base.py
@@ -0,0 +1,383 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import time
+import os
+from pynq import Overlay, allocate
+from pynq.ps import Clocks
+
+from finn.util.data_packing import (
+    finnpy_to_packed_bytearray,
+    packed_bytearray_to_finnpy,
+)
+
+# Driver base class for FINN-generated dataflow accelerators.
+# The particulars of the generated accelerator are specified via the
+# io_shape_dict (generated by the MakePYNQDriver transformation).
+
+
+class FINNExampleOverlay(Overlay):
+    def __init__(
+        self,
+        bitfile_name,
+        platform,
+        io_shape_dict,
+        batch_size=1,
+        fclk_mhz=100.0,
+        device=None,
+        download=True,
+        runtime_weight_dir="runtime_weights/",
+    ):
+        """Initialize the FINN accelerator.
+
+        Parameters
+        ----------
+        bitfile_name: str
+            Path to accelerator .bit/.xclbin file
+        platform: str
+            FINN platform type, either "alveo" or "zynq-iodma"
+        io_shape_dict: dict
+            Dictionary with particulars of the generated accelerator
+        batch_size: int
+            Maximum batch size in driver (the hardware batch size is always 1)
+        fclk_mhz: float
+            Override the clock frequency, only possible for Zynq.
+        device: pynq.Device
+            Which PYNQ device to use, None for default.
+        download: bool
+            Whether to flash the bitstream.
+        runtime_weight_dir: str
+            Path to runtime weights folder.
+        """
+        super().__init__(bitfile_name, download=download, device=device)
+        self.runtime_weight_dir = runtime_weight_dir
+        self._io_shape_dict = io_shape_dict
+        self.ibuf_packed_device = None
+        self.obuf_packed_device = None
+        self.platform = platform
+        self.batch_size = batch_size
+        self.fclk_mhz = fclk_mhz
+        if self.platform == "alveo":
+            self.idma = self.idma0
+            self.odma = self.odma0
+            self.odma_handle = None
+        elif self.platform == "zynq-iodma":
+            self.idma = self.idma0
+            self.odma = self.odma0
+            # set the clock frequency as specified by user during transformations
+            if self.fclk_mhz > 0:
+                Clocks.fclk0_mhz = self.fclk_mhz
+        else:
+            raise ValueError("Supported platforms are zynq-iodma and alveo")
+        # load any runtime weights
+        self.load_runtime_weights()
+
+    def load_runtime_weights(self, flush_accel=True, verify=True):
+        """Load any existing runtime weights from the specified dir into the
+        appropriate layer of the accelerator. Note that this must be enabled
+        during the accelerator build process. The runtime weights directory
+        is specified as the class member ``runtime_weight_dir``.
+
+        Parameters
+        ----------
+        flush_accel: bool
+            Run the accelerator with dummy input after weights are written to
+            flush any stale weight data in the weight streamer FIFOs.
+        verify: bool
+            Whether the written weights will be re-read and verified.
+        """
+        w_filenames = []
+        if not os.path.isdir(self.runtime_weight_dir):
+            return
+        for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir):
+            w_filenames.extend(filenames)
+        rt_weight_dict = {}
+        for w_filename in w_filenames:
+            # skip any non-.dat files so stale/undefined weight data is not reused
+            if not w_filename.endswith(".dat"):
+                continue
+            with open(self.runtime_weight_dir + "/" + w_filename, "r") as f:
+                dat = f.read()
+            layer_w = np.fromiter(
+                [int(x, 16) for x in dat.strip().split()], dtype=np.uint32
+            )
+            layer_ind = int(w_filename.split("_")[0])
+            rt_weight_dict[layer_ind] = layer_w
+        for layer_ind in rt_weight_dict.keys():
+            cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
+            if cand_if_name in self.ip_dict.keys():
+                layer_mmio = getattr(
+                    self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind
+                ).mmio
+                layer_w = rt_weight_dict[layer_ind]
+                layer_mmio.write_mm(0, layer_w.tobytes())
+                if verify:
+                    new_w = np.copy(layer_mmio.array[: layer_w.shape[0]])
+                    assert (layer_w == new_w).all()
+        if flush_accel:
+            # run accelerator to flush any stale weights from weight streamer FIFOs
+            self.execute_on_buffers()
+
+    @property
+    def idt(self):
+        return self._io_shape_dict["idt"]
+
+    @property
+    def odt(self):
+        return self._io_shape_dict["odt"]
+
+    @property
+    def ishape_normal(self):
+        ret = list(self._io_shape_dict["ishape_normal"])
+        ret[0] = self.batch_size
+        return tuple(ret)
+
+    @property
+    def oshape_normal(self):
+        ret = list(self._io_shape_dict["oshape_normal"])
+        ret[0] = self.batch_size
+        return tuple(ret)
+
+    @property
+    def ishape_folded(self):
+        ret = list(self._io_shape_dict["ishape_folded"])
+        ret[0] = self.batch_size
+        return tuple(ret)
+
+    @property
+    def oshape_folded(self):
+        ret = list(self._io_shape_dict["oshape_folded"])
+        ret[0] = self.batch_size
+        return tuple(ret)
+
+    @property
+    def ishape_packed(self):
+        ret = list(self._io_shape_dict["ishape_packed"])
+        ret[0] = self.batch_size
+        return tuple(ret)
+
+    @property
+    def oshape_packed(self):
+        ret = list(self._io_shape_dict["oshape_packed"])
+        ret[0] = self.batch_size
+        return tuple(ret)
+
+    @property
+    def batch_size(self):
+        return self._batch_size
+
+    @batch_size.setter
+    def batch_size(self, value):
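+        """Sets the batch size and reallocates the packed input/output
+        device buffers to match the new shapes."""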
+        self._batch_size = value
+        # free the old buffers by setting to None
+        # (reference counting should take care of it)
+        if self.ibuf_packed_device is not None:
+            self.ibuf_packed_device = None
+        if self.obuf_packed_device is not None:
+            self.obuf_packed_device = None
+        if self.platform == "alveo":
+            self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
+            self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
+        else:
+            self.ibuf_packed_device = allocate(
+                shape=self.ishape_packed, dtype=np.uint8, cacheable=True
+            )
+            self.obuf_packed_device = allocate(
+                shape=self.oshape_packed, dtype=np.uint8, cacheable=True
+            )
+        self.obuf_packed = np.empty_like(self.obuf_packed_device)
+
+    def fold_input(self, ibuf_normal):
+        """Reshapes input in desired shape.
+        Gets input data (ibuf_normal), checks if data is in expected normal shape.
+        Returns folded input."""
+        # ensure that shape is as expected
+        assert ibuf_normal.shape == self.ishape_normal
+        # convert to folded form
+        ibuf_folded = ibuf_normal.reshape(self.ishape_folded)
+        return ibuf_folded
+
+    def pack_input(self, ibuf_folded):
+        """Packs folded input and reverses both SIMD dim and endianness.
+        Gets input data in folded shape and returns packed input data."""
+        ibuf_packed = finnpy_to_packed_bytearray(
+            ibuf_folded,
+            self.idt,
+            reverse_endian=True,
+            reverse_inner=True,
+            fast_mode=True,
+        )
+        return ibuf_packed
+
+    def unpack_output(self, obuf_packed):
+        """Unpacks the packed output buffer from accelerator.
+        Gets packed output and returns output data in folded shape."""
+        obuf_folded = packed_bytearray_to_finnpy(
+            obuf_packed,
+            self.odt,
+            self.oshape_folded,
+            reverse_endian=True,
+            reverse_inner=True,
+            fast_mode=True,
+        )
+        return obuf_folded
+
+    def unfold_output(self, obuf_folded):
+        """Unfolds output data to normal shape.
+        Gets folded output data and returns output data in normal shape."""
+        obuf_normal = obuf_folded.reshape(self.oshape_normal)
+        return obuf_normal
+
+    def copy_input_data_to_device(self, data):
+        """Copies given input data to PYNQ buffer."""
+        np.copyto(self.ibuf_packed_device, data)
+        self.ibuf_packed_device.flush()
+
+    def copy_output_data_from_device(self, data):
+        """Copies PYNQ output buffer from device."""
+        self.obuf_packed_device.invalidate()
+        np.copyto(data, self.obuf_packed_device)
+
+    def execute_on_buffers(self, asynch=False, batch_size=None):
+        """Executes accelerator by setting up the DMA(s) on pre-allocated buffers.
+        Blocking behavior depends on the asynch parameter:
+        * ``asynch=False`` will block until all transfers are complete.
+        * ``asynch=True`` won't block, use ``wait_until_finished()`` to check
+           completion
+
+        The optional batch_size parameter can be used to execute on a smaller
+        batch than the initialized ``self.batch_size``.
+        """
+        if batch_size is None:
+            batch_size = self.batch_size
+        assert batch_size <= self.batch_size, "Specified batch_size is too large."
+        if self.platform == "zynq-iodma":
+            assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle"
+            # manually launch IODMAs since signatures are missing
+            self.idma.write(0x10, self.ibuf_packed_device.device_address)
+            self.idma.write(0x1C, batch_size)
+            self.odma.write(0x10, self.obuf_packed_device.device_address)
+            self.odma.write(0x1C, batch_size)
+            self.idma.write(0x00, 1)
+            self.odma.write(0x00, 1)
+        elif self.platform == "alveo":
+            assert self.odma_handle is None, "Output DMA is already running"
+            self.idma.start(self.ibuf_packed_device, batch_size)
+            self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
+        else:
+            raise Exception("Unrecognized platform: %s" % self.platform)
+        # blocking behavior depends on asynch parameter
+        if asynch is False:
+            self.wait_until_finished()
+
+    def wait_until_finished(self):
+        "Block until the output DMA has finished writing."
+        if self.platform == "zynq-iodma":
+            # check if output IODMA is finished via register reads
+            status = self.odma.read(0x00)
+            while status & 0x2 == 0:
+                status = self.odma.read(0x00)
+        elif self.platform == "alveo":
+            assert self.odma_handle is not None, "No odma_handle to wait on"
+            self.odma_handle.wait()
+            self.odma_handle = None
+        else:
+            raise Exception("Unrecognized platform: %s" % self.platform)
+
+    def execute(self, input_npy):
+        """Given input numpy array, first perform necessary packing and copying
+        to device buffers, execute on accelerator, then unpack output and return
+        output numpy array from accelerator."""
+        ibuf_folded = self.fold_input(input_npy)
+        ibuf_packed = self.pack_input(ibuf_folded)
+        self.copy_input_data_to_device(ibuf_packed)
+        self.execute_on_buffers()
+        self.copy_output_data_from_device(self.obuf_packed)
+        obuf_folded = self.unpack_output(self.obuf_packed)
+        obuf_normal = self.unfold_output(obuf_folded)
+        return obuf_normal
+
+    def throughput_test(self):
+        """Run accelerator with empty inputs to measure throughput and other metrics.
+        Returns dictionary with various metrics."""
+        # dictionary for results of throughput test
+        res = {}
+        start = time.time()
+        self.execute_on_buffers()
+        end = time.time()
+        runtime = end - start
+        res["runtime[ms]"] = runtime * 1000
+        res["throughput[images/s]"] = self.batch_size / runtime
+        res["DRAM_in_bandwidth[Mb/s]"] = (
+            np.prod(self.ishape_packed) * 0.000001 / runtime
+        )
+        res["DRAM_out_bandwidth[Mb/s]"] = (
+            np.prod(self.oshape_packed) * 0.000001 / runtime
+        )
+        if self.platform == "zynq-iodma":
+            res["fclk[mhz]"] = Clocks.fclk0_mhz
+        elif self.platform == "alveo":
+            res["fclk[mhz]"] = self.clock_dict["clock0"]["frequency"]
+        res["batch_size"] = self.batch_size
+        # also benchmark driver-related overheads
+        input_npy = np.zeros(self.ishape_normal, dtype=self.idt.to_numpy_dt())
+        start = time.time()
+        ibuf_folded = self.fold_input(input_npy)
+        end = time.time()
+        runtime = end - start
+        res["fold_input[ms]"] = runtime
+
+        start = time.time()
+        ibuf_packed = self.pack_input(ibuf_folded)
+        end = time.time()
+        runtime = end - start
+        res["pack_input[ms]"] = runtime
+
+        start = time.time()
+        self.copy_input_data_to_device(ibuf_packed)
+        end = time.time()
+        runtime = end - start
+        res["copy_input_data_to_device[ms]"] = runtime
+
+        start = time.time()
+        self.copy_output_data_from_device(self.obuf_packed)
+        end = time.time()
+        runtime = end - start
+        res["copy_output_data_from_device[ms]"] = runtime
+
+        start = time.time()
+        obuf_folded = self.unpack_output(self.obuf_packed)
+        end = time.time()
+        runtime = end - start
+        res["unpack_output[ms]"] = runtime
+
+        start = time.time()
+        self.unfold_output(obuf_folded)
+        end = time.time()
+        runtime = end - start
+        res["unfold_output[ms]"] = runtime
+        return res
diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aa7d67aa162e91b878d387bee1457e4b477e635
--- /dev/null
+++ b/src/finn/qnn-data/templates/driver/validate.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+from driver import io_shape_dict
+from driver_base import FINNExampleOverlay
+import numpy as np
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Validate top-1 accuracy for FINN-generated accelerator"
+    )
+    parser.add_argument(
+        "--batchsize", help="number of samples for inference", type=int, default=100
+    )
+    parser.add_argument(
+        "--dataset", help="dataset to use (mnist of cifar10)", required=True
+    )
+    parser.add_argument(
+        "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma"
+    )
+    parser.add_argument(
+        "--bitfile", help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit"
+    )
+    parser.add_argument(
+        "--dataset_root", help="dataset root dir for download/reuse", default="/tmp"
+    )
+    # parse arguments
+    args = parser.parse_args()
+    bsize = args.batchsize
+    dataset = args.dataset
+    bitfile = args.bitfile
+    platform = args.platform
+    dataset_root = args.dataset_root
+
+    if dataset == "mnist":
+        from dataset_loading import mnist
+
+        trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(
+            dataset_root, download=True, one_hot=False
+        )
+    elif dataset == "cifar10":
+        from dataset_loading import cifar
+
+        trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(
+            dataset_root, download=True, one_hot=False
+        )
+    else:
+        raise Exception("Unrecognized dataset")
+
+    test_imgs = testx
+    test_labels = testy
+
+    ok = 0
+    nok = 0
+    total = test_imgs.shape[0]
+
+    driver = FINNExampleOverlay(
+        bitfile_name=bitfile,
+        platform=platform,
+        io_shape_dict=io_shape_dict,
+        batch_size=bsize,
+        runtime_weight_dir="runtime_weights/",
+    )
+
+    n_batches = int(total / bsize)
+
+    test_imgs = test_imgs.reshape(n_batches, bsize, -1)
+    test_labels = test_labels.reshape(n_batches, bsize)
+
+    for i in range(n_batches):
+        ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
+        exp = test_labels[i]
+        driver.copy_input_data_to_device(ibuf_normal)
+        driver.execute_on_buffers()
+        obuf_normal = np.empty_like(driver.obuf_packed_device)
+        driver.copy_output_data_from_device(obuf_normal)
+        # minlength=2 ensures ret[1] exists even if a batch has no correct results
+        ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2)
+        nok += ret[0]
+        ok += ret[1]
+        print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok))
+
+    acc = 100.0 * ok / (total)
+    print("Final accuracy: %f" % acc)
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 749cf6c91a975a2ffaffedefa77b2f3fcb793e32..94305b861cbe0c5e6b641c9dccee7976c73c236f 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -221,11 +221,13 @@ class InferPool_Batch(Transformation):
                 if n.op_type == "MaxPool":
                     k = get_by_name(n.attribute, "kernel_shape").ints[-1]
                     stride = get_by_name(n.attribute, "strides").ints[-1]
+                    # assumed data layout
+                    dlayout = "NCHW"
                 elif n.op_type == "QuantAvgPool2d":
                     inst = getCustomOp(n)
                     k = inst.get_nodeattr("kernel")
                     stride = inst.get_nodeattr("stride")
-
+                    dlayout = inst.get_nodeattr("data_layout")
                 try:
                     pad = get_by_name(n.attribute, "pads").ints[-1]
                 except AttributeError:
@@ -250,19 +252,35 @@ class InferPool_Batch(Transformation):
 
                 odt = model.get_tensor_datatype(node_output)
 
-                ifm_ch = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
+                if dlayout == "NCHW":
+                    ifm_ch = model.get_tensor_shape(n.input[0])[1]
+                else:
+                    ifm_ch = model.get_tensor_shape(n.input[0])[-1]
                 ofm_ch = ifm_ch
-                ifm_dim = model.get_tensor_shape(n.input[0])[-1]  # assume NCHW
-                ofm_dim = model.get_tensor_shape(n.output[0])[-1]  # assume NCHW
-                # create new intermediate values
-                inp_trans_out = helper.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    (1, ifm_dim, ifm_dim, ifm_ch),  # NHWC
-                )
-                graph.value_info.append(inp_trans_out)
-                inp_trans_out = inp_trans_out.name
-                model.set_tensor_datatype(inp_trans_out, idt)
+                ifm_dim = model.get_tensor_shape(n.input[0])[-2]
+                ofm_dim = model.get_tensor_shape(n.output[0])[-2]
+
+                # if data layout NCHW, we need transpose nodes surrounding
+                # the hls layer
+                if dlayout == "NCHW":
+                    # create new intermediate values
+                    inp_trans_out = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, ifm_dim, ifm_dim, ifm_ch),  # NHWC
+                    )
+                    graph.value_info.append(inp_trans_out)
+                    inp_trans_out = inp_trans_out.name
+                    model.set_tensor_datatype(inp_trans_out, idt)
+
+                    pool_output = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, ofm_dim, ofm_dim, ofm_ch),
+                    )
+                    graph.value_info.append(pool_output)
+                    pool_output = pool_output.name
+                    # model.set_tensor_datatype(pool_output, odt)
 
                 im2col_out = helper.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
@@ -273,20 +291,16 @@ class InferPool_Batch(Transformation):
                 im2col_out = im2col_out.name
                 model.set_tensor_datatype(im2col_out, idt)
 
-                pool_output = helper.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    (1, ofm_dim, ofm_dim, ofm_ch),
-                )
-                graph.value_info.append(pool_output)
-                pool_output = pool_output.name
-                # model.set_tensor_datatype(pool_output, odt)
-
                 # create new nodes
-                # NCHW -> NHWC
-                inp_trans_node = helper.make_node(
-                    "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
-                )
+                if dlayout == "NCHW":
+                    # NCHW -> NHWC
+                    inp_trans_node = helper.make_node(
+                        "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
+                    )
+                    im2col_in = inp_trans_out
+                else:
+                    im2col_in = node_input
+                    pool_output = node_output
 
                 accum_bits = 0
                 pool_size_param = k
@@ -312,7 +326,7 @@ class InferPool_Batch(Transformation):
                 # format input tensor
                 im2col_node = helper.make_node(
                     "Im2Col",
-                    [inp_trans_out],
+                    [im2col_in],
                     [im2col_out],
                     domain="finn.custom_op.general",
                     stride=stride,
@@ -345,16 +359,21 @@ class InferPool_Batch(Transformation):
                     BatchSize=1,
                 )
 
-                # NHWC -> NCHW
-                out_trans_node = helper.make_node(
-                    "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2]
-                )
+                if dlayout == "NCHW":
+                    # NHWC -> NCHW
+                    out_trans_node = helper.make_node(
+                        "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2]
+                    )
 
                 # insert nodes where the pool is to preserve topological ordering
-                graph.node.insert(node_ind, inp_trans_node)
-                graph.node.insert(node_ind + 1, im2col_node)
-                graph.node.insert(node_ind + 2, pool_node)
-                graph.node.insert(node_ind + 3, out_trans_node)
+                if dlayout == "NCHW":
+                    graph.node.insert(node_ind, inp_trans_node)
+                    graph.node.insert(node_ind + 1, im2col_node)
+                    graph.node.insert(node_ind + 2, pool_node)
+                    graph.node.insert(node_ind + 3, out_trans_node)
+                else:
+                    graph.node.insert(node_ind, im2col_node)
+                    graph.node.insert(node_ind + 1, pool_node)
                 # remove old node
                 graph.node.remove(n)
                 graph_modified = True
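
A minimal sketch, outside the diff, of the layout handling introduced above: for NCHW graphs the HLS pool path is wrapped in a Transpose pair using the same perm values the transformation inserts, while NHWC graphs feed Im2Col directly. The tensor below is illustrative only.

    import numpy as np

    # hypothetical 1x3x8x8 activation in NCHW layout
    x_nchw = np.random.rand(1, 3, 8, 8)

    # NCHW -> NHWC, matching the inserted Transpose with perm=[0, 2, 3, 1]
    x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1))  # shape (1, 8, 8, 3)

    # ... Im2Col and Pool_Batch consume the NHWC tensor here ...

    # NHWC -> NCHW, matching the trailing Transpose with perm=[0, 3, 1, 2]
    x_back = np.transpose(x_nhwc, (0, 3, 1, 2))  # shape (1, 3, 8, 8)
    assert np.array_equal(x_back, x_nchw)
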
@@ -555,8 +574,9 @@ class InferQuantizedStreamingFCLayer(Transformation):
                         thresholds neither 1 nor MH."""
                         odt = model.get_tensor_datatype(mt_output)
                         scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        bipolar_ok = odt == DataType.BIPOLAR and scale == 2.0
                         assert (
-                            scale == 1.0
+                            scale == 1.0 or bipolar_ok
                         ), "out_scale must be equal to 1.0 for HLS conversion."
                         actval = getCustomOp(consumer).get_nodeattr("out_bias")
                         assert (
@@ -768,6 +788,7 @@ class InferVVAU(Transformation):
                         graph.node.remove(n)
                         graph_modified = True
         if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
@@ -1204,6 +1225,7 @@ class InferLabelSelectLayer(Transformation):
                     continue
 
                 num_labels = int(fc_in_shape[-1])
+                num_inp_vecs = list(fc_in_shape[:-1])
                 # create node with no parallelization first
                 pe = 1
                 assert (
@@ -1223,6 +1245,7 @@ class InferLabelSelectLayer(Transformation):
                     PE=pe,
                     K=k,
                     inputDataType=idt.name,
+                    numInputVectors=num_inp_vecs,
                 )
                 graph.node.insert(node_ind, new_node)
                 # remove old node
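
A short worked check of the relaxed out_scale assertion in InferQuantizedStreamingFCLayer above, assuming MultiThreshold applies out = out_scale * t + out_bias to the raw threshold count t: with out_scale == 2.0 and out_bias == -1, the 1-bit counts {0, 1} land exactly on the BIPOLAR levels {-1, +1}, so this case is as convertible as the plain out_scale == 1.0 case.

    # assumed MultiThreshold post-processing: out = out_scale * t + out_bias
    out_scale, out_bias = 2.0, -1
    for t in (0, 1):                              # raw 1-bit threshold outputs
        print(t, "->", out_scale * t + out_bias)  # 0 -> -1.0, 1 -> 1.0
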
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index 6df9e6d1e62270b13f31560a99109c9b108f8025..56bfb4306e555c716a9156d6f0949c339193eb38 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -29,7 +29,7 @@
 import copy
 
 from onnx import helper
-
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.basic import get_by_name, make_build_dir
 
@@ -117,6 +117,28 @@ class CreateDataflowPartition(Transformation):
                 # remove all dataflow nodes from the non-dataflow model
                 # keep track of where the dataflow part starts
                 df_start_ind = all_nodes.index(df_nodes[0])
+
+                # get and check floorplan
+                inst = getCustomOp(df_nodes[0])
+                slr = inst.get_nodeattr("slr")
+                for node in df_nodes[1:]:
+                    inst = getCustomOp(node)
+                    assert slr == inst.get_nodeattr(
+                        "slr"
+                    ), """all nodes with
+                same partition_id must have the same slr id"""
+
+                # check that there is only one non-null mem_port per partition
+                nmemports = 0
+                mem_port = ""
+                for node in df_nodes:
+                    inst = getCustomOp(node)
+                    port = inst.get_nodeattr("mem_port")
+                    if port is not None and port != "":
+                        nmemports += 1
+                        mem_port = port
+                assert nmemports <= 1, """too many memory ports per partition"""
+
                 for node_to_remove in df_nodes:
                     non_df_model.graph.node.remove(node_to_remove)
                 # create StreamingDataflow node with df_in/df_out io
@@ -127,6 +149,9 @@ class CreateDataflowPartition(Transformation):
                     # use the model attribute to mark the df model
                     model=df_model_filename,
                     domain="finn.custom_op.general",
+                    partition_id=target_partition_id,
+                    slr=slr,
+                    mem_port=mem_port,
                 )
                 non_df_model.graph.node.insert(df_start_ind, df_node)
                 model = non_df_model
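
A hedged usage sketch of the per-partition checks added above (attribute names are taken from the patch; the partition and SLR values are made up, and model is assumed to be a finn ModelWrapper): every node that shares a partition_id must agree on slr, and at most one node per partition may set a non-empty mem_port.

    from finn.custom_op.registry import getCustomOp
    from finn.util.basic import get_by_name

    # hypothetical manual assignment prior to CreateDataflowPartition
    for node in model.graph.node:
        if get_by_name(node.attribute, "backend") is None:
            continue  # only fpgadataflow nodes carry these attributes
        inst = getCustomOp(node)
        if inst.get_nodeattr("partition_id") == 0:
            inst.set_nodeattr("slr", 1)  # all of partition 0 goes to SLR 1

    # CreateDataflowPartition then copies partition_id/slr/mem_port onto the
    # StreamingDataflowPartition node it creates, asserting on inconsistencies.
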
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index cbd353e4ad9099d13f10deadb4c99c290713d370..aed5792a63ff95803b4d7ccc80cf2c94ac732ad7 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -319,6 +319,7 @@ class CreateStitchedIP(Transformation):
             tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name)
             tcl.append("write_checkpoint %s.dcp" % block_name)
             tcl.append("write_xdc %s.xdc" % block_name)
+            tcl.append("report_utilization -file %s_partition_util.rpt" % block_name)
         # export block design itself as an IP core
         block_vendor = "xilinx_finn"
         block_library = "finn"
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
index eaade4a335258bb2ffd7ce0f87da8ae7c3fd00b3..c6bedd466e31efb622640cbd203d344ff9b3d88f 100644
--- a/src/finn/transformation/fpgadataflow/floorplan.py
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -29,26 +29,90 @@
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.basic import get_by_name
+from finn.analysis.fpgadataflow.floorplan_params import floorplan_params
+from finn.util.basic import make_build_dir
+from finn.transformation.general import ApplyConfig
+import warnings
+import json
 
 
 class Floorplan(Transformation):
-    """Perform Floorplanning of the dataflow design. Separate DMAs into their own
-    partitions IDs, and TODO: split the design into sections of defined size"""
+    """Perform Floorplanning of the dataflow design:
 
-    def __init__(self, limits=None):
+    floorplan: path to a JSON containing a dictionary with SLR assignments
+               for each node in the ONNX graph. Must be parseable by
+               the ApplyConfig transform.
+
+    The transform applies the properties in the supplied JSON, then:
+    - Separates DMAs into their own partition IDs,
+    - If not explicitly assigned, assigns DWCs to SLRs so as to minimize the
+      number of SLLs required,
+    - If not explicitly assigned, assigns FIFOs to the SLR of the upstream node.
+
+    """
+
+    def __init__(self, floorplan=None):
         super().__init__()
-        self.resource_limits = limits
+        self.user_floorplan = floorplan
 
     def apply(self, model):
-        target_partition_id = 0
-        # we currently assume that all dataflow nodes belonging to the same partition
-        # are connected to each other and there is a single input/output to/from each.
+
+        # read in a user-specified floorplan or generate a default one
+        if self.user_floorplan is None:
+            floorplan = model.analysis(floorplan_params)
+            json_dir = make_build_dir(prefix="vitis_floorplan_")
+            json_file = json_dir + "/floorplan.json"
+            model.set_metadata_prop("floorplan_json", json_file)
+            with open(json_file, "w") as f:
+                json.dump(floorplan, f, indent=4)
+        else:
+            model.set_metadata_prop("floorplan_json", self.user_floorplan)
+            model = model.transform(ApplyConfig(self.user_floorplan))
+
+        # perform DWC and FIFO specific adjustments
+        unassigned_nodes = 0
+        for node in model.graph.node:
+            node_inst = getCustomOp(node)
+            node_slr = node_inst.get_nodeattr("slr")
+            if node_slr == -1:
+                unassigned_nodes += 1
+            if node.op_type == "StreamingDataWidthConverter_Batch":
+                # if we already have an SLR assignment, use that
+                if node_slr != -1:
+                    continue
+                # optimize for possible SLR crossing
+                in_width = node_inst.get_nodeattr("inWidth")
+                out_width = node_inst.get_nodeattr("outWidth")
+                # find neighbour with narrowest bus
+                if in_width > out_width:
+                    narrow_neighbour = model.find_consumer(node.output[0])
+                else:
+                    narrow_neighbour = model.find_producer(node.input[0])
+                node_slr = getCustomOp(narrow_neighbour).get_nodeattr("slr")
+                node_inst.set_nodeattr("slr", node_slr)
+            if node.op_type == "StreamingFIFO":
+                # if we already have an SLR assignment, use that
+                if node_slr != -1:
+                    continue
+                srcnode = model.find_producer(node.input[0])
+                node_slr = getCustomOp(srcnode).get_nodeattr("slr")
+                node_inst.set_nodeattr("slr", node_slr)
+
+        if unassigned_nodes > 0:
+            warnings.warn(
+                str(unassigned_nodes)
+                + " nodes have no entry in the provided floorplan "
+                + "and no default value was set"
+            )
+
+        # partition id generation
+        partition_cnt = 0
+
+        # Assign IODMAs to their own partitions
         all_nodes = list(model.graph.node)
         df_nodes = list(
             filter(lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes)
         )
         dma_nodes = list(filter(lambda x: x.op_type == "IODMA", df_nodes))
-
         non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes))
         dyn_tlastmarker_nodes = list(
             filter(
@@ -57,24 +121,53 @@ class Floorplan(Transformation):
                 non_dma_nodes,
             )
         )
-
         non_dma_nodes = list(
             filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)
         )
 
         for node in dma_nodes:
             node_inst = getCustomOp(node)
-            node_inst.set_nodeattr("partition_id", target_partition_id)
-            target_partition_id += 1
+            node_inst.set_nodeattr("partition_id", partition_cnt)
+            partition_cnt += 1
 
         for node in dyn_tlastmarker_nodes:
             node_inst = getCustomOp(node)
-            node_inst.set_nodeattr("partition_id", target_partition_id)
-            target_partition_id += 1
+            node_inst.set_nodeattr("partition_id", partition_cnt)
+            partition_cnt += 1
 
         for node in non_dma_nodes:
-            # TODO: implement proper floorplanning; for now just a single partition
+            pre_node = model.find_producer(node.input[0])
             node_inst = getCustomOp(node)
-            node_inst.set_nodeattr("partition_id", target_partition_id)
+            if pre_node not in non_dma_nodes:
+                # input node
+                node_inst.set_nodeattr("partition_id", partition_cnt)
+                partition_cnt += 1
+                continue
+            elif not (
+                node.op_type == "StreamingFCLayer_Batch"
+                and node_inst.get_nodeattr("mem_mode") is not None
+                and node_inst.get_nodeattr("mem_mode") == "external"
+            ):
+                pre_nodes = model.find_direct_predecessors(node)
+            else:
+                pre_nodes = [pre_node]
+
+            node_slr = node_inst.get_nodeattr("slr")
+            for pre_node in pre_nodes:
+                pre_inst = getCustomOp(pre_node)
+                pre_slr = pre_inst.get_nodeattr("slr")
+                if node_slr == pre_slr:
+                    partition_id = pre_inst.get_nodeattr("partition_id")
+                    node_inst.set_nodeattr("partition_id", partition_id)
+                    break
+            else:
+                # no matching predecessor, start a new partition
+                node_inst.set_nodeattr("partition_id", partition_cnt)
+                partition_cnt += 1
+
+        # save the updated floorplan
+        floorplan = model.analysis(floorplan_params)
+        with open(model.get_metadata_prop("floorplan_json"), "w") as f:
+            json.dump(floorplan, f, indent=4)
 
         return (model, False)
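
As a rough illustration of the new floorplan argument (the JSON structure mirrors what the floorplan_params analysis writes and ApplyConfig consumes; the node names and values below are assumptions), a user-specified floorplan and its application might look like:

    import json
    from finn.transformation.fpgadataflow.floorplan import Floorplan

    # per-node attribute overrides keyed by node name, plus a "Defaults"
    # section of the assumed form {attr: [value, [op_types]]}
    user_floorplan = {
        "Defaults": {"slr": [-1, ["all"]]},
        "StreamingFCLayer_Batch_0": {"slr": 0},
        "StreamingFCLayer_Batch_1": {"slr": 1},
    }
    with open("floorplan.json", "w") as f:
        json.dump(user_floorplan, f, indent=4)

    # model is assumed to be a finn ModelWrapper containing HLS layers
    model = model.transform(Floorplan(floorplan="floorplan.json"))
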
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index aef793477168028d398c697f958c4cc729ba4ec0..e7bf29da36e9978911c5bfc64665dba4d2edca4e 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -17,10 +17,14 @@ def _is_dwc_node(node):
 def _suitable_node(node):
     if node is not None:
         if is_fpgadataflow_node(node) is True:
-            if _is_dwc_node(node) is False:
-                return True
-            else:
+            if _is_dwc_node(node):
+                # no DWC for DWCs
+                return False
+            elif node.op_type == "IODMA":
+                # IODMA data shapes/widths need special handling
                 return False
+            else:
+                return True
         else:
             return False
     else:
@@ -28,8 +32,7 @@ def _suitable_node(node):
 
 
 class InsertDWC(Transformation):
-    """Ensure that the graph is terminated with a TLastMarker node, inserting
-    one if necessary."""
+    """Add data width converters between layers where necessary."""
 
     def __init__(self):
         super().__init__()
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index def6babf82f8fb4bc290daa19efb4aeec074541c..c0ac1319dd520794afd66f187b35e529739e5cd7 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -44,11 +44,16 @@ class InsertFIFO(Transformation):
     node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth'
     of the subsequent node. max() of these two values sets the FIFO depth.
 
-    The other node attributes necessary to create a FIFO node are taking from the
+    Normally, shallow-depth (<=2) FIFOs won't be created since HLS streaming
+    interfaces already have a degree of buffering. You can set
+    create_shallow_fifos=True to override this default behavior.
+
+    The other node attributes necessary to create a FIFO node are taken from the
     node the FIFO node is inserted after: 'folded_shape' and 'dtype'"""
 
-    def __init__(self):
+    def __init__(self, create_shallow_fifos=False):
         super().__init__()
+        self.create_shallow_fifos = create_shallow_fifos
 
     def apply(self, model):
         graph = model.graph
@@ -98,12 +103,12 @@ class InsertFIFO(Transformation):
                         elif n0_depth != n1_depth:
                             fifo_depth = max(n0_depth, n1_depth)
 
-                        if fifo_depth > 2:
+                        if fifo_depth > 2 or self.create_shallow_fifos:
                             # assumption: HLS streaming components already have
                             # depth-2 FIFOs on inputs and outputs, so no point
                             # creating additional small FIFOs in between --
                             # we only create the larger FIFOs specified
-                            # create fifo node
+                            # unless create_shallow_fifos is set
                             fifo_output_tensor = oh.make_tensor_value_info(
                                 model.make_new_valueinfo_name(),
                                 TensorProto.FLOAT,
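
A brief usage sketch for the new create_shallow_fifos option (model is assumed to be a finn ModelWrapper whose nodes are all fpgadataflow nodes):

    from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO

    # default behavior: FIFOs that would only need depth <= 2 are skipped
    model = model.transform(InsertFIFO())

    # alternatively, also materialize the shallow depth-2 FIFOs
    # model = model.transform(InsertFIFO(create_shallow_fifos=True))
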
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index fe53bd39639462b8cebcdf5febe3b11e7eda96dc..3f08a672d2b3c00bfb3764202a7c9fc84448f586 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -113,14 +113,19 @@ class InsertIODMA(Transformation):
             if final_node.op_type != "IODMA":
                 out_shape = model.get_tensor_shape(graph_out_name)
                 out_dtype = model.get_tensor_datatype(graph_out_name)
+                final_node_inst = getCustomOp(final_node)
+                out_folded_shape = final_node_inst.get_folded_output_shape()
+                # take advantage of AXI stream width padding for DMA alignment
+                # (AXI streams are always padded to 8 bits)
+                # this is the width of stream input to DMA
+                padded_outstream_width = final_node_inst.get_outstream_width_padded()
+                padded_outstream_bytes = padded_outstream_width // 8
                 # determine the feasible interface width
-                transfer_bits = np.prod(out_shape) * out_dtype.bitwidth()
+                transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
                 intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                 assert (
                     intfwidth % 8 == 0
                 ), "No feasible interface width for transfer size"
-                # get width of stream input to DMA
-                streamWidth = getCustomOp(final_node).get_outstream_width()
                 # make new buffer
                 final_node_out = oh.make_tensor_value_info(
                     model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
@@ -129,15 +134,17 @@ class InsertIODMA(Transformation):
                 model.set_tensor_datatype(final_node_out.name, out_dtype)
                 # reroute final node output to final_node_out_name
                 final_node.output[0] = final_node_out.name
+                # FIXME: currently always using 8-bit dtypes to work around the
+                # padding problems for i/o DMA
                 dma_node = oh.make_node(
                     "IODMA",
                     [final_node_out.name],
                     [graph_out_name],
-                    numInputVectors=out_shape[:-1],
-                    NumChannels=out_shape[-1],
-                    dataType=str(out_dtype.name),
+                    numInputVectors=out_folded_shape[:-1],
+                    NumChannels=padded_outstream_bytes,
+                    dataType="UINT8",
                     intfWidth=intfwidth,
-                    streamWidth=streamWidth,
+                    streamWidth=padded_outstream_width,
                     direction="out",
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
@@ -146,31 +153,38 @@ class InsertIODMA(Transformation):
             if first_node.op_type != "IODMA":
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
+                first_node_inst = getCustomOp(first_node)
+                in_folded_shape = first_node_inst.get_folded_input_shape()
+                # take advantage of AXI stream width padding for DMA alignment
+                # (AXI streams are always padded to 8 bits)
+                # this is the width of stream output expected from the DMA
+                padded_instream_width = first_node_inst.get_instream_width_padded()
+                padded_instream_bytes = padded_instream_width // 8
                 # determine the feasible interface width
-                transfer_bits = np.prod(in_shape) * in_dtype.bitwidth()
+                transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1])
                 intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                 assert (
                     intfwidth % 8 == 0
                 ), "No feasible interface width for transfer size"
-                # get width of stream output from DMA
-                streamWidth = getCustomOp(first_node).get_instream_width()
                 # make new buffer
                 first_node_in = oh.make_tensor_value_info(
                     model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
                 )
                 model.graph.value_info.append(first_node_in)
                 model.set_tensor_datatype(first_node_in.name, in_dtype)
-                # reroute final node output to final_node_out_name
+                # reroute first node input
+                # FIXME: currently always using 8-bit dtypes to work around the
+                # padding problems for i/o DMA
                 first_node.input[0] = first_node_in.name
                 dma_node = oh.make_node(
                     "IODMA",
                     [graph_in_name],
                     [first_node_in.name],
-                    numInputVectors=in_shape[:-1],
-                    NumChannels=in_shape[-1],
-                    dataType=str(in_dtype.name),
+                    numInputVectors=in_folded_shape[:-1],
+                    NumChannels=padded_instream_bytes,
+                    dataType="UINT8",
                     intfWidth=intfwidth,
-                    streamWidth=streamWidth,
+                    streamWidth=padded_instream_width,
                     direction="in",
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
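
A hedged arithmetic sketch of the interface-width logic above, using illustrative numbers (the 64-bit max_intfwidth is an assumption): the per-beat stream width is padded to a byte multiple, the transfer size follows from the folded shape, and the DMA interface width is the gcd of that size and the maximum allowed width.

    import math
    import numpy as np

    instream_width = 42                          # bits, e.g. get_instream_width()
    padded_width = -(-instream_width // 8) * 8   # 48 bits, byte-aligned padding
    in_folded_shape = (1, 28, 28, 3, 1)          # last dim folds elements per beat
    transfer_bits = padded_width * int(np.prod(in_folded_shape[:-1]))
    intfwidth = math.gcd(transfer_bits, 64)      # assumed max_intfwidth of 64
    assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
    print(padded_width, transfer_bits, intfwidth)  # 48 112896 64
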
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index e8e3059240556296ca635ffcc36665fb18beda3b..84dc01e536e96298ecb57e133610d800fcd2eb5c 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -32,20 +32,25 @@ from finn.transformation.base import Transformation
 from finn.util.basic import gen_finn_dt_tensor, make_build_dir
 import finn.util.data_packing as dpk
 import finn.core.datatype as dtp
-
-from . import templates
+from finn.custom_op.registry import getCustomOp
+import os
+import warnings
+import pkg_resources as pk
+from . import template_driver
 
 
 class MakePYNQDriver(Transformation):
     """Create PYNQ Python code to correctly interface the generated
-    accelerator, including data packing/unpacking. The MakePYNQProject
-    transformation must have been already applied.
+    accelerator, including data packing/unpacking. Should be called
+    after conversion to HLS layers and folding, but prior to the creation of
+    dataflow partitions for correct operation.
 
     platform: one of ["zynq-iodma", "alveo"]
 
     Outcome if successful: sets the pynq_driver_dir attribute in the ONNX
     ModelProto's metadata_props field, with the created driver dir as the
-    value.
+    value. If any layers use runtime-writable parameters, those will be gathered
+    under the runtime_weights/ subfolder of the pynq_driver_dir.
     """
 
     def __init__(self, platform):
@@ -57,8 +62,15 @@ class MakePYNQDriver(Transformation):
         pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
         model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
 
+        # create the base FINN driver -- same for all accels
+        driver_base_template = pk.resource_filename(
+            "finn.qnn-data", "templates/driver/driver_base.py"
+        )
+        driver_base_py = pynq_driver_dir + "/driver_base.py"
+        shutil.copy(driver_base_template, driver_base_py)
+
         # extract input-output shapes from the graph
-        # TODO convert this to an analysis pass
+        # TODO convert this to an analysis pass?
         i_tensor_name = model.graph.input[0].name
         o_tensor_name = model.graph.output[0].name
         i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
@@ -89,9 +101,9 @@ class MakePYNQDriver(Transformation):
 
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
-        driver = templates.pynq_driver_template
+        driver = template_driver.pynq_driver_template
 
-        def mss(x, batch_var_name="N"):
+        def mss(x, batch_var_name="1"):
             # "make shape string"
             # for a shape like (1, ...) emit a string (N, ...)
             # where N is the default value for batch_var_name
@@ -111,26 +123,15 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
 
-        # clock settings for driver
-        clk_ns = model.get_metadata_prop("clk_ns")
-        # default to 10ns / 100 MHz if property not set
-        if clk_ns is None:
-            clk_ns = 10.0
-        else:
-            clk_ns = float(clk_ns)
-        fclk_mhz = 1 / (clk_ns * 0.001)
-        # TODO change according to PYNQ board?
-        driver = driver.replace("$CLK_NAME$", "fclk0_mhz")
-        driver = driver.replace("$CLOCK_FREQ_MHZ$", str(fclk_mhz))
-
         with open(driver_py, "w") as f:
             f.write(driver)
 
         # add validate.py to run full top-1 test (only for suitable networks)
         validate_py = pynq_driver_dir + "/validate.py"
-        validate_src = templates.pynq_validation_template
-        with open(validate_py, "w") as f:
-            f.write(validate_src)
+        validate_template = pk.resource_filename(
+            "finn.qnn-data", "templates/driver/validate.py"
+        )
+        shutil.copy(validate_template, validate_py)
 
         # copy all the dependencies into the driver folder
         # driver imports utils/data_packing and core/datatype
@@ -146,4 +147,26 @@ class MakePYNQDriver(Transformation):
         shutil.copytree(dpk_root, pynq_driver_dir + "/finn/util")
         shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
 
+        # generate weight files for runtime-writable layers
+        weights_dir = pynq_driver_dir + "/runtime_weights"
+        rt_layer_ind = 0
+        os.makedirs(weights_dir)
+        for node in model.graph.node:
+            if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+                node_inst = getCustomOp(node)
+                is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
+                if is_rt_weights == 1:
+                    fcl_w = model.get_initializer(node.input[1])
+                    w_filename = weights_dir + "/%d_%s.dat" % (rt_layer_ind, node.name)
+                    node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename)
+                    rt_layer_ind += 1
+            elif node.op_type == "StreamingDataflowPartition":
+                warnings.warn(
+                    """Please call MakePYNQDriver prior to
+                CreateDataflowPartition. Can only extract runtime-writable
+                weights from HLSCustomOp instances and not StreamingDataflowPartition.
+                """
+                )
+            else:
+                continue
         return (model, False)
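
A short hedged sketch of how the runtime_weights/ export above is triggered (model is assumed to be a finn ModelWrapper with HLS layers; whether a layer truly supports runtime reloading also depends on its mem_mode configuration):

    from finn.custom_op.registry import getCustomOp
    from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver

    # mark eligible layers as runtime-writable so their weights are exported
    # as .dat files under <pynq_driver_dir>/runtime_weights
    for node in model.graph.node:
        if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
            getCustomOp(node).set_nodeattr("runtime_writeable_weights", 1)

    # call before CreateDataflowPartition, as the warning above recommends
    model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
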
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 7baa27757abd91c2602f15f739555014b24f559d..99c08dc0593a928c534f1dc2a0313e0c85680144 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -33,7 +33,6 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.core.modelwrapper import ModelWrapper
 from finn.util.basic import get_by_name, make_build_dir
-from finn.util.basic import get_num_default_workers
 from finn.util.basic import pynq_part_map
 
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
@@ -137,13 +136,6 @@ class MakeZYNQProject(Transformation):
                 global_clk_ns = clk_ns
 
             ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames"))
-            assert (
-                len(ifnames["axilite"]) <= 1
-            ), "MakeZYNQProject supports max 1 AXI lite interface"
-            if len(ifnames["axilite"]) == 1:
-                axilite_intf_name = ifnames["axilite"][0]
-            else:
-                axilite_intf_name = None
 
             # gather info on connectivity
             # assume each node connected to outputs/inputs is DMA:
@@ -171,6 +163,10 @@ class MakeZYNQProject(Transformation):
                     "[get_bd_intf_pins smartconnect_0/S%02d_AXI]"
                     % (instance_names[node.name], aximm_idx)
                 )
+                assert (
+                    len(ifnames["axilite"]) == 1
+                ), "Must have 1 AXI lite interface on IODMA nodes"
+                axilite_intf_name = ifnames["axilite"][0]
                 assert axilite_intf_name is not None
                 config.append(
                     "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
@@ -186,6 +182,14 @@ class MakeZYNQProject(Transformation):
                     "create_bd_cell -type ip -vlnv %s %s"
                     % (vivado_stitch_vlnv, instance_names[node.name])
                 )
+                for axilite_intf_name in ifnames["axilite"]:
+                    config.append(
+                        "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                        "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]"
+                        % (instance_names[node.name], axilite_intf_name, axilite_idx)
+                    )
+                    axilite_idx += 1
+
             config.append(
                 "connect_bd_net [get_bd_pins %s/ap_clk] "
                 "[get_bd_pins smartconnect_0/aclk]" % instance_names[node.name]
@@ -231,7 +235,6 @@ class MakeZYNQProject(Transformation):
                     pynq_part_map[self.platform],
                     config,
                     self.enable_debug,
-                    get_num_default_workers(),
                 )
             )
 
@@ -241,7 +244,7 @@ class MakeZYNQProject(Transformation):
         with open(synth_project_sh, "w") as f:
             f.write("#!/bin/bash \n")
             f.write("cd {}\n".format(vivado_pynq_proj_dir))
-            f.write("vivado -mode tcl -source %s\n" % ipcfg)
+            f.write("vivado -mode batch -source %s\n" % ipcfg)
             f.write("cd {}\n".format(working_dir))
 
         # call the synthesis script
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 30fa8b089ae6bdfc3249b2a725e1b97c2ba9c1f0..f7d59978d8f8866aefb3028d570bb6b434df33b4 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -187,6 +187,8 @@ class InsertAndSetFIFODepths(Transformation):
     - max_depth : how deep the "max"-sized FIFOs initially inserted will be
     - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
                         smaller where appropriate
+    - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
+                          large FIFOs implemented by Vivado
 
     Assumed input graph properties:
     - all nodes are fpgadataflow nodes
@@ -216,6 +218,7 @@ class InsertAndSetFIFODepths(Transformation):
         max_qsrl_depth=256,
         max_depth=2 ** 14,
         swg_exception=True,
+        vivado_ram_style="auto",
     ):
         super().__init__()
         self.fpgapart = fpgapart
@@ -223,6 +226,7 @@ class InsertAndSetFIFODepths(Transformation):
         self.max_qsrl_depth = max_qsrl_depth
         self.max_depth = max_depth
         self.swg_exception = swg_exception
+        self.vivado_ram_style = vivado_ram_style
 
     def apply(self, model):
         # change external to decoupled and warn user
@@ -356,7 +360,7 @@ class InsertAndSetFIFODepths(Transformation):
                 # Set FIFO implementation/ram styles
                 if depth > self.max_qsrl_depth:
                     node_inst.set_nodeattr("impl_style", "vivado")
-                    node_inst.set_nodeattr("ram_style", "auto")
+                    node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
                 else:
                     node_inst.set_nodeattr("impl_style", "rtl")
                 # reset implementation
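
Usage sketch for the new vivado_ram_style knob (the FPGA part string and ram_style value are illustrative, and the remaining constructor arguments are assumed to keep their defaults):

    from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths

    # FIFOs deeper than max_qsrl_depth are implemented via Vivado IP; this
    # selects the RAM primitive they use instead of the former hardcoded "auto"
    model = model.transform(
        InsertAndSetFIFODepths("xc7z020clg400-1", vivado_ram_style="block")
    )
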
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb4e0e1db51d331400e7a294890eb998c2aa4e1d
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.transformation.general import GiveUniqueNodeNames
+import warnings
+
+
+def divisors(num):
+    for x in range(1, num + 1):
+        if (num % x) == 0:
+            yield x
+
+
+class SetFolding(Transformation):
+    """Attempt to set parallelism attributes in all nodes to meet a specific
+    target expressed as cycles per frame target_cycles_per_frame. For each
+    HLSCustomOp node type, the attribute may vary but is typically one of {PE, SIMD},
+    and has a certain allowed-maximum value and divisibility constraints,
+    which SetFolding will take into account. Note that the algorithm implemented
+    by SetFolding is very simple and it is often possible to hand-tune the returned
+    parallelism configuration for better results.
+
+    In the returned model, each node's
+    cycles_estimate attribute will be set to its estimated number of cycles.
+
+    If two_pass_relaxation is enabled,
+    SetFolding will internally run a second time if the target cycles from the
+    first pass could not be achieved, instead using the achievable target (which
+    may be constrained by a single node) to obtain a balanced pipeline.
+
+    Notable exceptions and special behavior:
+
+    * When folding dense convolution/FC compute engines (StreamingFCLayer_Batch),
+    which have two attributes (PE and SIMD):
+        * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max
+          (configurable in the SetFolding initializer, defaults to 36)
+        * then increases PE until the target is met or max PE reached
+
+    * When folding depthwise convolutions ("VVAU"/Vector_Vector_Activate_Batch)
+    or spatial reduction ops (Pool_Batch):
+        * the producer of the node is expected to be a ConvolutionInputGenerator
+        with depthwise=1, whose SIMD value will be set equal to the PE value of
+        its consumer node
+    """
+
+    def __init__(
+        self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True
+    ):
+        super().__init__()
+        self.target_cycles_per_frame = target_cycles_per_frame
+        self.mvau_wwidth_max = mvau_wwidth_max
+        self.two_pass_relaxation = two_pass_relaxation
+
+    def optimize_attribute_val(self, node_inst, max_val, attr_name):
+        node_inst.set_nodeattr(attr_name, 1)
+        for val in divisors(max_val):
+            node_inst.set_nodeattr(attr_name, val)
+            cyc = node_inst.get_exp_cycles()
+            if cyc < self.target_cycles_per_frame:
+                # finish if target met
+                break
+
+    def apply(self, model):
+        graph = model.graph
+        # these ops use PE parallelism, up to a max value of NumChannels
+        pe_ops = [
+            "AddStreams_Batch",
+            "ChannelwiseOp_Batch",
+            "DuplicateStreams_Batch",
+            "GlobalAccPool_Batch",
+            "Thresholding_Batch",
+        ]
+        # these ops use SIMD parallelism, up to a max value of NumChannels
+        # ConvolutionInputGenerator has a special case when depthwise=1
+        simd_ops = ["DownSampler", "FMPadding_Batch", "ConvolutionInputGenerator"]
+        # these ops are preceded by depthwise SWG and have special behavior,
+        # as explained in the SetFolding docstring
+        depthwise_op_exceptions = ["Vector_Vector_Activate_Batch", "Pool_Batch"]
+        for node in graph.node:
+            if not is_fpgadataflow_node(node):
+                continue
+            op_type = node.op_type
+            node_inst = getCustomOp(node)
+            if op_type == "StreamingFCLayer_Batch":
+                max_simd = node_inst.get_nodeattr("MW")
+                max_pe = node_inst.get_nodeattr("MH")
+                node_inst.set_nodeattr("PE", 1)
+                node_inst.set_nodeattr("SIMD", 1)
+                # increase SIMD until either we meet
+                # the target or weight stream becomes
+                # too wide
+                for simd_val in divisors(max_simd):
+                    prev_simd_val = node_inst.get_nodeattr("SIMD")
+                    node_inst.set_nodeattr("SIMD", simd_val)
+                    cyc = node_inst.get_exp_cycles()
+                    if cyc < self.target_cycles_per_frame:
+                        # finish if target met
+                        break
+                    if (
+                        node_inst.get_weight_datatype().bitwidth()
+                        * node_inst.get_nodeattr("SIMD")
+                        > self.mvau_wwidth_max
+                    ):
+                        # revert if we've gone above width threshold
+                        node_inst.set_nodeattr("SIMD", prev_simd_val)
+                        break
+                # increase PE until target met or reached max_pe
+                self.optimize_attribute_val(node_inst, max_pe, "PE")
+            elif op_type in pe_ops:
+                max_pe = node_inst.get_nodeattr("NumChannels")
+                self.optimize_attribute_val(node_inst, max_pe, "PE")
+            elif op_type == "LabelSelect_Batch":
+                max_pe = node_inst.get_nodeattr("Labels")
+                self.optimize_attribute_val(node_inst, max_pe, "PE")
+            elif op_type in depthwise_op_exceptions:
+                max_pe = node_inst.get_nodeattr("Channels")
+                self.optimize_attribute_val(node_inst, max_pe, "PE")
+                # also set the folding of the upstream DW SWU,
+                # whose SIMD must be identical to this node's PE
+                swu_node = model.find_producer(node.input[0])
+                if swu_node.op_type == "ConvolutionInputGenerator":
+                    swu_node_inst = getCustomOp(swu_node)
+                    pe = node_inst.get_nodeattr("PE")
+                    swu_node_inst.set_nodeattr("SIMD", pe)
+                else:
+                    raise Exception(
+                        "Expected SWU on DW op input, found " + swu_node.op_type
+                    )
+            elif op_type in simd_ops:
+                if op_type == "ConvolutionInputGenerator":
+                    depthwise = node_inst.get_nodeattr("depthwise")
+                    if depthwise == 0:
+                        max_simd = node_inst.get_nodeattr("IFMChannels")
+                        self.optimize_attribute_val(node_inst, max_simd, "SIMD")
+                    else:
+                        # depthwise SWGs are handled separately
+                        continue
+                else:
+                    max_simd = node_inst.get_nodeattr("NumChannels")
+                    self.optimize_attribute_val(node_inst, max_simd, "SIMD")
+            else:
+                warnings.warn(
+                    "SetFolding doesn't know how to handle op_type " + op_type
+                )
+
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(AnnotateCycles())
+        if self.two_pass_relaxation:
+            perf_dict = model.analysis(dataflow_performance)
+            if perf_dict["max_cycles"] > self.target_cycles_per_frame:
+                # run again, but with the lower target we actually achieved -- this
+                # may be coming from a single node's constraints, but we want
+                # to balance the entire dataflow pipeline instead
+                # no two_pass_relaxation this time -- no guarantee we'll
+                # converge otherwise
+                warnings.warn(
+                    "Node %s is bottleneck with %d cycles, running second pass"
+                    % (perf_dict["max_cycles_node_name"], perf_dict["max_cycles"])
+                )
+                model = model.transform(
+                    SetFolding(
+                        target_cycles_per_frame=perf_dict["max_cycles"],
+                        mvau_wwidth_max=self.mvau_wwidth_max,
+                        two_pass_relaxation=False,
+                    )
+                )
+
+        return (model, False)
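
Usage sketch for SetFolding (the cycle target is arbitrary; as the docstring notes, the result is a starting point that can be hand-tuned):

    from finn.custom_op.registry import getCustomOp
    from finn.transformation.fpgadataflow.set_folding import SetFolding
    from finn.util.fpgadataflow import is_fpgadataflow_node

    # aim for roughly 10000 cycles per frame; two_pass_relaxation (default True)
    # re-runs with the achievable target if a single node becomes the bottleneck
    model = model.transform(SetFolding(target_cycles_per_frame=10000))

    # inspect the per-node estimates written by the AnnotateCycles step
    for node in model.graph.node:
        if is_fpgadataflow_node(node):
            print(node.name, getCustomOp(node).get_nodeattr("cycles_estimate"))
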
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b595205714d8cb630816d2b42fe96640e49e506e
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# flake8: noqa
+
+pynq_driver_template = """
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import numpy as np
+import os
+from finn.core.datatype import DataType
+from driver_base import FINNExampleOverlay
+
+# dictionary describing the I/O of the FINN-generated accelerator
+io_shape_dict = {
+    # FINN DataType for input and output tensors
+    "idt" : $INPUT_FINN_DATATYPE$,
+    "odt" : $OUTPUT_FINN_DATATYPE$,
+    # shapes for input and output tensors (NHWC layout)
+    "ishape_normal" : $INPUT_SHAPE_NORMAL$,
+    "oshape_normal" : $OUTPUT_SHAPE_NORMAL$,
+    # folded / packed shapes below depend on idt/odt and input/output
+    # PE/SIMD parallelization settings -- these are calculated by the
+    # FINN compiler.
+    "ishape_folded" : $INPUT_SHAPE_FOLDED$,
+    "oshape_folded" : $OUTPUT_SHAPE_FOLDED$,
+    "ishape_packed" : $INPUT_SHAPE_PACKED$,
+    "oshape_packed" : $OUTPUT_SHAPE_PACKED$
+}
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Execute FINN-generated accelerator on numpy inputs, or run throughput test')
+    parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute")
+    parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="$PLATFORM$")
+    parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1)
+    parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
+    parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy")
+    parser.add_argument('--outputfile', help='name of output npy file (i.e. "output.npy")', default="output.npy")
+    parser.add_argument('--runtime_weight_dir', help='path to folder containing runtime-writable .dat weights', default="runtime_weights/")
+    # parse arguments
+    args = parser.parse_args()
+    exec_mode = args.exec_mode
+    platform = args.platform
+    batch_size = args.batchsize
+    bitfile = args.bitfile
+    inputfile = args.inputfile
+    outputfile = args.outputfile
+    runtime_weight_dir = args.runtime_weight_dir
+
+    # instantiate FINN accelerator driver and pass batchsize and bitfile
+    accel = FINNExampleOverlay(
+        bitfile_name = bitfile, platform = platform,
+        io_shape_dict = io_shape_dict, batch_size = batch_size,
+        runtime_weight_dir = runtime_weight_dir
+    )
+
+    # in execute mode, load the input from the .npy file and run it
+    # through the accelerator driver
+    if exec_mode == "execute":
+        # remove old output file to prevent reusing old output
+        # in case execution fails
+        try:
+            os.remove(outputfile)
+        except FileNotFoundError:
+            pass
+        # load desired input .npy file
+        ibuf_normal = np.load(inputfile)
+        obuf_normal = accel.execute(ibuf_normal)
+        np.save(outputfile, obuf_normal)
+    elif exec_mode == "throughput_test":
+        # remove old metrics file
+        try:
+            os.remove("nw_metrics.txt")
+        except FileNotFoundError:
+            pass
+        res = accel.throughput_test()
+        file = open("nw_metrics.txt", "w")
+        file.write(str(res))
+        file.close()
+        print("Results written to nw_metrics.txt")
+    else:
+        raise Exception("Exec mode has to be set to execute or throughput_test")
+"""
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 253fd462fa6601bcc34dd3d2aa2f42463c0ae308..2d1c680338eec199908c305a42988403cb3645aa 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -89,208 +89,6 @@ make %s
 cd %s
 """
 
-pynq_driver_template = """
-import argparse
-import os
-from pynq import Overlay
-import numpy as np
-from pynq import allocate
-import time
-from finn.util.data_packing import (
-    finnpy_to_packed_bytearray,
-    packed_bytearray_to_finnpy
-)
-from finn.core.datatype import DataType
-from pynq.ps import Clocks
-
-class FINNAccelDriver():
-    def __init__(self, N, bitfile, platform="$PLATFORM$"):
-        \"\"\"Instantiate the FINN accelerator driver.
-        Gets batchsize (N) as integer and path to bitfile as string.\"\"\"
-        self.platform = platform
-        self.N = N
-        # input FINN DataType
-        self.idt = $INPUT_FINN_DATATYPE$
-        # output FINN DataType
-        self.odt = $OUTPUT_FINN_DATATYPE$
-        # input and output shapes
-        self.ishape_normal = $INPUT_SHAPE_NORMAL$
-        self.oshape_normal = $OUTPUT_SHAPE_NORMAL$
-        self.ishape_folded = $INPUT_SHAPE_FOLDED$
-        self.oshape_folded = $OUTPUT_SHAPE_FOLDED$
-        self.ishape_packed = $INPUT_SHAPE_PACKED$   # datatype np.uint8
-        self.oshape_packed = $OUTPUT_SHAPE_PACKED$  # datatype np.uint8
-        # load bitfile and set up accelerator
-        self.ol = Overlay(bitfile)
-        # neuron folding factor of output = iterations per sample
-        self.itersPerSample = self.oshape_packed[-2]
-        # clock frequency as specified by user
-        self.fclk_mhz = $CLOCK_FREQ_MHZ$
-        if self.platform == "alveo":
-            self.idma = self.ol.idma0
-            self.odma = self.ol.odma0
-        elif self.platform == "zynq-iodma":
-            self.idma = self.ol.idma0
-            self.odma = self.ol.odma0
-            # set the clock frequency as specified by user during transformations
-            if self.fclk_mhz > 0:
-                Clocks.$CLK_NAME$ = self.fclk_mhz
-        else:
-            raise ValueError("Supported platforms are zynq-iodma alveo")
-
-        # allocate a PYNQ buffer for the packed input and buffer
-        if self.platform == "alveo":
-            self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
-            self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
-        else:
-            self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8, cacheable=True)
-            self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8, cacheable=True)
-
-    def fold_input(self, ibuf_normal):
-        \"\"\"Reshapes input in desired shape.
-        Gets input data (ibuf_normal), checks if data is in expected normal shape.
-        Returns folded input.\"\"\"
-        # ensure that shape is as expected
-        assert ibuf_normal.shape == self.ishape_normal
-        # convert to folded form
-        ibuf_folded = ibuf_normal.reshape(self.ishape_folded)
-        return ibuf_folded
-
-    def pack_input(self, ibuf_folded):
-        \"\"\"Packs folded input and reverses both SIMD dim and endianness.
-        Gets input data in folded shape and returns packed input data.\"\"\"
-        ibuf_packed = finnpy_to_packed_bytearray(
-            ibuf_folded, self.idt, reverse_endian=True, reverse_inner=True
-        )
-        return ibuf_packed
-
-    def unpack_output(self, obuf_packed):
-        \"\"\"Unpacks the packed output buffer from accelerator.
-        Gets packed output and returns output data in folded shape.\"\"\"
-        obuf_folded = packed_bytearray_to_finnpy(
-            obuf_packed, self.odt, self.oshape_folded, reverse_endian=True, reverse_inner=True
-        )
-        return obuf_folded
-
-    def unfold_output(self, obuf_folded):
-        \"\"\"Unfolds output data to normal shape.
-        Gets folded output data and returns output data in normal shape.\"\"\"
-        obuf_normal = obuf_folded.reshape(self.oshape_normal)
-        return obuf_normal
-
-    def copy_input_data_to_device(self, data):
-        \"\"\"Copies given input data to PYNQ buffer.\"\"\"
-        np.copyto(self.ibuf_packed_device, data)
-        self.ibuf_packed_device.flush()
-
-    def copy_output_data_from_device(self, data):
-        \"\"\"Copies PYNQ output buffer from device.\"\"\"
-        self.obuf_packed_device.invalidate()
-        np.copyto(data, self.obuf_packed_device)
-
-    def execute(self):
-        \"\"\"Executes accelerator by setting up the DMA(s) and
-        waiting until all transfers/calls complete. Uses only member variables and
-        returns nothing.\"\"\"
-        if self.platform == "zynq-iodma":
-            # manually launch IODMAs since signatures are missing
-            self.idma.write(0x10, self.ibuf_packed_device.device_address)
-            self.idma.write(0x1c, self.N)
-            self.odma.write(0x10, self.obuf_packed_device.device_address)
-            self.odma.write(0x1c, self.N)
-            self.idma.write(0x00, 1)
-            self.odma.write(0x00, 1)
-            # wait until output IODMA is finished
-            status = self.odma.read(0x00)
-            while status & 0x2 == 0:
-                status = self.odma.read(0x00)
-        elif self.platform == "alveo":
-            idma_handle = self.idma.start_sw(self.ibuf_packed_device, self.N)
-            odma_handle = self.odma.start_sw(self.obuf_packed_device, self.N)
-            odma_handle.wait()
-
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name')
-    parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute")
-    parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="$PLATFORM$")
-    parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1)
-    parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
-    parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy")
-    parser.add_argument('--outputfile', help='name of output npy file (i.e. "output.npy")', default="output.npy")
-    # parse arguments
-    args = parser.parse_args()
-    exec_mode = args.exec_mode
-    platform = args.platform
-    N = args.batchsize
-    bitfile = args.bitfile
-    inputfile = args.inputfile
-    outputfile = args.outputfile
-
-    # instantiate FINN accelerator driver and pass batchsize and bitfile
-    finnDriver = FINNAccelDriver(N, bitfile, platform)
-
-    # for the remote execution the data from the input npy file has to be loaded,
-    # packed and copied to the PYNQ buffer
-    if exec_mode == "execute":
-        # remove old output file to prevent reusing old output
-        # in case execution fails
-        try:
-            os.remove(outputfile)
-        except FileNotFoundError:
-            pass
-        # load desired input .npy file
-        ibuf_normal = np.load(inputfile)
-        ibuf_folded = finnDriver.fold_input(ibuf_normal)
-        ibuf_packed = finnDriver.pack_input(ibuf_folded)
-        finnDriver.copy_input_data_to_device(ibuf_packed)
-    elif exec_mode != "throughput_test":
-        raise Exception("Exec mode has to be set to remote_pynq or throughput_test")
-
-    # for the throughput test the runtime of the network has to be measured
-    if exec_mode == "throughput_test":
-        # remove old metrics file
-        try:
-            os.remove("nw_metrics.txt")
-        except FileNotFoundError:
-            pass
-        # dictionary for results of throughput test
-        res={}
-        # measure runtime of network
-        start = time.time()
-
-    # execute accelerator
-    finnDriver.execute()
-
-    # measure run time and fill dictionary with results of the throughput test
-    if exec_mode == "throughput_test":
-        end = time.time()
-        runtime = end - start
-        res["runtime[ms]"] = runtime*1000
-        res["throughput[images/s]"] = N / runtime
-        res["DRAM_in_bandwidth[Mb/s]"] = np.prod(finnDriver.ishape_packed)*0.000001 / runtime
-        res["DRAM_out_bandwidth[Mb/s]"] = np.prod(finnDriver.oshape_packed)*0.000001 / runtime
-        if platform != "alveo":
-            res["fclk[mhz]"] = Clocks.fclk0_mhz
-        else:
-            res["fclk[mhz]"] = finnDriver.fclk_mhz
-        res["N"] = N
-        file = open("nw_metrics.txt", "w")
-        file.write(str(res))
-        file.close()
-
-    # if execution is selected unpack, unfold and save output to output npy file
-    else:
-        obuf_packed = np.empty_like(finnDriver.obuf_packed_device)
-        finnDriver.copy_output_data_from_device(obuf_packed)
-        obuf_folded = finnDriver.unpack_output(obuf_packed)
-        obuf_normal = finnDriver.unfold_output(obuf_folded)
-        np.save(outputfile, obuf_normal)
-
-
-"""
-
 custom_zynq_shell_template = """
 set FREQ_MHZ %s
 set NUM_AXILITE %d
@@ -347,8 +145,8 @@ if {$ZYNQ_TYPE == "zynq_us+"} {
 create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0
 create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0
 #set number of axilite interfaces, and number of axi master interfaces
-set_property -dict [list CONFIG.NUM_SI $NUM_AXILITE] [get_bd_cells smartconnect_0]
-set_property -dict [list CONFIG.NUM_MI $NUM_AXIMM] [get_bd_cells axi_interconnect_0]
+set_property -dict [list CONFIG.NUM_SI $NUM_AXIMM] [get_bd_cells smartconnect_0]
+set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] [get_bd_cells axi_interconnect_0]
 
 #create reset controller and connect interconnects to PS
 if {$ZYNQ_TYPE == "zynq_us+"} {
@@ -407,12 +205,13 @@ set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED true [get_runs impl_1]
 
 # out-of-context synth can't be used for bitstream generation
 # set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} -value {-mode out_of_context} -objects [get_runs synth_1]
-launch_runs -to_step write_bitstream impl_1 -jobs %d
+launch_runs -to_step write_bitstream impl_1
 wait_on_run [get_runs impl_1]
 
 # generate synthesis report
-open_run synth_1 -name synth_1
+open_run impl_1
 report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -format xml
+close_project
 """
 
 alveo_run_sh_template = """#!/bin/bash
@@ -436,55 +235,3 @@ open_project $VITIS_PROJ_PATH$/_x/link/vivado/vpl/prj/prj.xpr
 open_run impl_1
 report_utilization -hierarchical -hierarchical_depth 5 -file $VITIS_PROJ_PATH$/synth_report.xml -format xml
 """
-
-pynq_validation_template = """
-import argparse
-from driver import FINNAccelDriver
-import numpy as np
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser(description='Validate top-1 accuracy for FINN accelerator')
-  parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=100)
-  parser.add_argument('--dataset', help='dataset to use (mnist of cifar10)', required=True)
-  # parse arguments
-  args = parser.parse_args()
-  bsize = args.batchsize
-  dataset = args.dataset
-
-  if dataset == "mnist":
-    from dataset_loading import mnist
-    trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data("/tmp", download=True, one_hot=False)
-  elif dataset == "cifar10":
-    from dataset_loading import cifar
-    trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data("/tmp", download=True, one_hot=False)
-  else:
-    raise Exception("Unrecognized dataset")
-
-  test_imgs = testx
-  test_labels = testy
-
-  ok = 0
-  nok = 0
-  total = test_imgs.shape[0]
-  driver = FINNAccelDriver(bsize, "resizer.bit", "zynq-iodma")
-
-  n_batches = int(total / bsize)
-
-  test_imgs = test_imgs.reshape(n_batches, bsize, -1)
-  test_labels = test_labels.reshape(n_batches, bsize)
-
-  for i in range(n_batches):
-    ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
-    exp = test_labels[i]
-    driver.copy_input_data_to_device(ibuf_normal)
-    driver.execute()
-    obuf_normal = np.empty_like(driver.obuf_packed_device)
-    driver.copy_output_data_from_device(obuf_normal)
-    ret = np.bincount(obuf_normal.flatten() == exp.flatten())
-    nok += ret[0]
-    ok += ret[1]
-    print("batch %d / %d : total OK %d NOK %d" % (i, n_batches, ok, nok))
-
-  acc = 100.0 * ok / (total)
-  print("Final accuracy: %f" % acc)
-"""
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index 5afb48637b8ad3e7198cdf5e7ac2f30afd3866b4..11cf46bb6e2233156c34b00f0d2126fc92255238 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -252,6 +252,19 @@ class VitisLink(Transformation):
         link_dir = make_build_dir(prefix="vitis_link_proj_")
         model.set_metadata_prop("vitis_link_proj", link_dir)
 
+        # add Vivado physopt directives if desired
+        if self.strategy == VitisOptStrategy.PERFORMANCE_BEST:
+            config.append("[vivado]")
+            config.append(
+                "prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=ExploreWithRemap"
+            )
+            config.append("prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=Explore")
+            config.append("prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=true")
+            config.append(
+                "prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore"
+            )
+            config.append("prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore")
+
         config = "\n".join(config) + "\n"
         with open(link_dir + "/config.txt", "w") as f:
             f.write(config)
@@ -303,7 +316,7 @@ class VitisLink(Transformation):
             f.write("#!/bin/bash \n")
             f.write("cd {}\n".format(link_dir))
             f.write(
-                "vivado -mode tcl -source %s\n" % (link_dir + "/gen_report_xml.tcl")
+                "vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl")
             )
             f.write("cd {}\n".format(working_dir))
         bash_command = ["bash", gen_rep_xml_sh]
@@ -316,7 +329,18 @@ class VitisLink(Transformation):
 
 
 class VitisBuild(Transformation):
-    """Best-effort attempt at building the accelerator with Vitis."""
+    """Best-effort attempt at building the accelerator with Vitis.
+
+    fpga_part: string identifying the target FPGA
+    period_ns: target clock period
+    platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"]
+    strategy: Vitis optimization strategy
+    enable_debug: add Chipscope to all AXI interfaces
+    floorplan_file: path to a JSON containing a dictionary with SLR assignments
+                    for each node in the ONNX graph. Must be parse-able by
+                    the ApplyConfig transform.
+
+    """
 
     def __init__(
         self,
@@ -325,6 +349,7 @@ class VitisBuild(Transformation):
         platform,
         strategy=VitisOptStrategy.PERFORMANCE,
         enable_debug=False,
+        floorplan_file=None,
     ):
         super().__init__()
         self.fpga_part = fpga_part
@@ -332,6 +357,7 @@ class VitisBuild(Transformation):
         self.platform = platform
         self.strategy = strategy
         self.enable_debug = enable_debug
+        self.floorplan_file = floorplan_file
 
     def apply(self, model):
         _check_vitis_envvars()
@@ -342,13 +368,18 @@ class VitisBuild(Transformation):
             MakePYNQDriver(platform="alveo"),
             InsertIODMA(512),
             InsertDWC(),
-            Floorplan(),
-            CreateDataflowPartition(),
         ]
         for trn in prep_transforms:
             model = model.transform(trn)
             model = model.transform(GiveUniqueNodeNames())
             model = model.transform(GiveReadableTensorNames())
+
+        model = model.transform(Floorplan(floorplan=self.floorplan_file))
+
+        model = model.transform(CreateDataflowPartition())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveReadableTensorNames())
+
         # Build each kernel individually
         sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition")
         for sdp_node in sdp_nodes:
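
Editorial note (not part of this diff): a minimal sketch of how the new `floorplan_file` argument documented in the `VitisBuild` docstring above might be used. The ONNX filename, FPGA part string, node name and JSON contents below are illustrative assumptions only.

```python
import json
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy

# assumed floorplan: per-node SLR assignments in a format the ApplyConfig transform can parse
floorplan = {"StreamingFCLayer_Batch_0": {"slr": 0}}
with open("floorplan.json", "w") as f:
    json.dump(floorplan, f)

model = ModelWrapper("model_hw_layers.onnx")  # assumed, already-prepared model
model = model.transform(
    VitisBuild(
        fpga_part="xcu250-figd2104-2L-e",  # assumed Alveo U250 part
        period_ns=5.0,
        platform="U250",
        strategy=VitisOptStrategy.PERFORMANCE_BEST,  # triggers the physopt directives added above
        floorplan_file="floorplan.json",
    )
)
```
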
diff --git a/src/finn/util/imagenet.py b/src/finn/util/imagenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..71ed9d9d260e2b38b5d9ec47f728ad401e526ca8
--- /dev/null
+++ b/src/finn/util/imagenet.py
@@ -0,0 +1,1150 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import numpy as np
+from PIL import Image
+from finn.core.data_layout import NCHW, NHWC
+from finn.util.test import resize_smaller_side, crop_center
+
+
+def get_val_images(n_images=100, interleave_classes=False):
+    """Returns generator over (path_to_jpeg, imagenet_class_id) for the first n_images
+    in the ILSVRC2012 validation dataset. The IMAGENET_VAL_PATH environment variable
+    must point to the validation dataset folder, which contains 1000 subfolders (one
+    per ImageNet-1K class), each of which in turn contains 50 validation images.
+
+    interleave_classes controls the ordering of the picked images. If False
+    (default), consecutive images will have the same class until that class has
+    no more images. Otherwise, consecutive images will be from classes 0, 1, 2...
+    and back to class 0 after the first 1000 images.
+
+    For more information on how to prepare the ILSVRC2012 validation dataset,
+    please see:
+    https://github.com/Xilinx/brevitas/blob/dev/brevitas_examples/imagenet_classification/README.md
+    """
+    try:
+        val_path = os.environ["IMAGENET_VAL_PATH"]
+        val_folders = sorted(os.listdir(val_path))
+        assert len(val_folders) == 1000, "Expected 1000 subfolders in ILSVRC2012 val"
+        assert n_images <= 50000, "ILSVRC2012 validation dataset has 50k images"
+        n_current_folder = 0
+        n_current_file = 0
+        total = 0
+        while total != n_images:
+            current_folder = os.path.join(val_path, val_folders[n_current_folder])
+            current_files = sorted(os.listdir(current_folder))
+            current_file = os.path.join(current_folder, current_files[n_current_file])
+            yield (current_file, n_current_folder)
+            total += 1
+            if interleave_classes:
+                n_current_folder += 1
+                if n_current_folder == 1000:
+                    n_current_file += 1
+                    n_current_folder = 0
+            else:
+                n_current_file += 1
+                if n_current_file == 50:
+                    n_current_folder += 1
+                    n_current_file = 0
+    except KeyError:
+        return None
+
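
Editorial note (not part of this diff): a small, hypothetical illustration of the two orderings produced by `get_val_images`, assuming IMAGENET_VAL_PATH points at a prepared validation set; the folder and file names in the comments are made up.

```python
from finn.util.imagenet import get_val_images

# default ordering: exhaust all 50 images of class 0 before moving on to class 1,
# e.g. (".../class_0000/img_00.JPEG", 0), (".../class_0000/img_01.JPEG", 0), ...
for path, class_id in get_val_images(n_images=4):
    print(path, class_id)

# interleaved ordering: one image per class, cycling 0, 1, 2, ... and wrapping after class 999,
# e.g. (".../class_0000/img_00.JPEG", 0), (".../class_0001/img_00.JPEG", 1), ...
for path, class_id in get_val_images(n_images=4, interleave_classes=True):
    print(path, class_id)
```
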
+
+def load_resize_crop(img_path, layout=NCHW, dtype=np.float32):
+    """Load, resize and center crop given image for standard ImageNet preprocessing,
+    return a numpy array."""
+    # get single image as input and prepare image
+    img = Image.open(img_path).convert("RGB")
+    # resize smallest side of the image to 256 pixels and resize larger side
+    # with same ratio
+    img = resize_smaller_side(256, img)
+    # crop central 224*224 window
+    img = crop_center(224, img)
+    if layout == NCHW:
+        # convert image to a numpy array and transpose from PIL's (H, W, C)
+        # layout to (C, H, W) for the NCHW case
+        img_np = np.asarray(img).copy().astype(dtype).transpose(2, 0, 1)
+        img_np = img_np.reshape(1, 3, 224, 224)
+        return img_np
+    elif layout == NHWC:
+        img_np = np.asarray(img).copy().astype(dtype)
+        img_np = img_np.reshape(1, 224, 224, 3)
+        return img_np
+    else:
+        raise Exception("Unknown data layout for load_resize_crop")
+
+
+def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5):
+    "Do top-k accuracy measurement on ILSVRC2012 with given functions."
+
+    workload = get_val_images(n_images)
+    top1_ok = 0.0
+    top1_nok = 0.0
+    topk_ok = 0.0
+    topk_nok = 0.0
+    for i, (img_path, target_id) in enumerate(workload):
+        img_np = load_resize_crop(img_path)
+        inp = fxn_pre(img_np)
+        ret = fxn_exec(inp)
+        res = fxn_post(ret)
+        res = res.flatten()
+        res = np.argsort(res)[-k:]
+        res = np.flip(res)
+        if target_id == res[0]:
+            top1_ok += 1.0
+        else:
+            top1_nok += 1.0
+        if target_id in res:
+            topk_ok += 1.0
+        else:
+            topk_nok += 1.0
+        cnt = i + 1
+        print(
+            "[%d/%d] Top-1: %f Top-%d: %f"
+            % (cnt, n_images, 100 * top1_ok / cnt, k, 100 * topk_ok / cnt)
+        )
+    return ((top1_ok, top1_nok), (topk_ok, topk_nok))
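
Editorial note (not part of this diff): a minimal sketch of plugging a FINN model into `measure_topk`. The ONNX filename and the pre/post steps (including the /255 scaling) are assumptions for illustration, not prescribed by this change.

```python
import finn.core.onnx_exec as oxe
from finn.core.modelwrapper import ModelWrapper
from finn.util.imagenet import measure_topk

model = ModelWrapper("quant_mobilenet_v1_4b.onnx")  # assumed exported model
iname = model.graph.input[0].name
oname = model.graph.output[0].name

def pre(img_np):
    # assumed preprocessing; real models may expect different normalization
    return {iname: img_np / 255.0}

def execute(input_dict):
    return oxe.execute_onnx(model, input_dict)

def post(ret):
    return ret[oname]

(top1_ok, top1_nok), (topk_ok, topk_nok) = measure_topk(10, pre, execute, post, k=5)
print("top-1: %f, top-5: %f" % (100 * top1_ok / 10, 100 * topk_ok / 10))
```

The `class_names` dictionary below can then turn a predicted index into a human-readable label, e.g. `class_names[0]` is "tench, Tinca tinca".
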
+
+
+# human-readable names for ImageNet classes
+class_names = {
+    0: "tench, Tinca tinca",
+    1: "goldfish, Carassius auratus",
+    2: "great white shark, white shark, man-eater, man-eating shark, "
+    "Carcharodon carcharias",
+    3: "tiger shark, Galeocerdo cuvieri",
+    4: "hammerhead, hammerhead shark",
+    5: "electric ray, crampfish, numbfish, torpedo",
+    6: "stingray",
+    7: "cock",
+    8: "hen",
+    9: "ostrich, Struthio camelus",
+    10: "brambling, Fringilla montifringilla",
+    11: "goldfinch, Carduelis carduelis",
+    12: "house finch, linnet, Carpodacus mexicanus",
+    13: "junco, snowbird",
+    14: "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
+    15: "robin, American robin, Turdus migratorius",
+    16: "bulbul",
+    17: "jay",
+    18: "magpie",
+    19: "chickadee",
+    20: "water ouzel, dipper",
+    21: "kite",
+    22: "bald eagle, American eagle, Haliaeetus leucocephalus",
+    23: "vulture",
+    24: "great grey owl, great gray owl, Strix nebulosa",
+    25: "European fire salamander, Salamandra salamandra",
+    26: "common newt, Triturus vulgaris",
+    27: "eft",
+    28: "spotted salamander, Ambystoma maculatum",
+    29: "axolotl, mud puppy, Ambystoma mexicanum",
+    30: "bullfrog, Rana catesbeiana",
+    31: "tree frog, tree-frog",
+    32: "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
+    33: "loggerhead, loggerhead turtle, Caretta caretta",
+    34: "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
+    35: "mud turtle",
+    36: "terrapin",
+    37: "box turtle, box tortoise",
+    38: "banded gecko",
+    39: "common iguana, iguana, Iguana iguana",
+    40: "American chameleon, anole, Anolis carolinensis",
+    41: "whiptail, whiptail lizard",
+    42: "agama",
+    43: "frilled lizard, Chlamydosaurus kingi",
+    44: "alligator lizard",
+    45: "Gila monster, Heloderma suspectum",
+    46: "green lizard, Lacerta viridis",
+    47: "African chameleon, Chamaeleo chamaeleon",
+    48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, "
+    "Varanus komodoensis",
+    49: "African crocodile, Nile crocodile, Crocodylus niloticus",
+    50: "American alligator, Alligator mississipiensis",
+    51: "triceratops",
+    52: "thunder snake, worm snake, Carphophis amoenus",
+    53: "ringneck snake, ring-necked snake, ring snake",
+    54: "hognose snake, puff adder, sand viper",
+    55: "green snake, grass snake",
+    56: "king snake, kingsnake",
+    57: "garter snake, grass snake",
+    58: "water snake",
+    59: "vine snake",
+    60: "night snake, Hypsiglena torquata",
+    61: "boa constrictor, Constrictor constrictor",
+    62: "rock python, rock snake, Python sebae",
+    63: "Indian cobra, Naja naja",
+    64: "green mamba",
+    65: "sea snake",
+    66: "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
+    67: "diamondback, diamondback rattlesnake, Crotalus adamanteus",
+    68: "sidewinder, horned rattlesnake, Crotalus cerastes",
+    69: "trilobite",
+    70: "harvestman, daddy longlegs, Phalangium opilio",
+    71: "scorpion",
+    72: "black and gold garden spider, Argiope aurantia",
+    73: "barn spider, Araneus cavaticus",
+    74: "garden spider, Aranea diademata",
+    75: "black widow, Latrodectus mactans",
+    76: "tarantula",
+    77: "wolf spider, hunting spider",
+    78: "tick",
+    79: "centipede",
+    80: "black grouse",
+    81: "ptarmigan",
+    82: "ruffed grouse, partridge, Bonasa umbellus",
+    83: "prairie chicken, prairie grouse, prairie fowl",
+    84: "peacock",
+    85: "quail",
+    86: "partridge",
+    87: "African grey, African gray, Psittacus erithacus",
+    88: "macaw",
+    89: "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
+    90: "lorikeet",
+    91: "coucal",
+    92: "bee eater",
+    93: "hornbill",
+    94: "hummingbird",
+    95: "jacamar",
+    96: "toucan",
+    97: "drake",
+    98: "red-breasted merganser, Mergus serrator",
+    99: "goose",
+    100: "black swan, Cygnus atratus",
+    101: "tusker",
+    102: "echidna, spiny anteater, anteater",
+    103: "platypus, duckbill, duckbilled platypus, duck-billed platypus, "
+    "Ornithorhynchus anatinus",
+    104: "wallaby, brush kangaroo",
+    105: "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
+    106: "wombat",
+    107: "jellyfish",
+    108: "sea anemone, anemone",
+    109: "brain coral",
+    110: "flatworm, platyhelminth",
+    111: "nematode, nematode worm, roundworm",
+    112: "conch",
+    113: "snail",
+    114: "slug",
+    115: "sea slug, nudibranch",
+    116: "chiton, coat-of-mail shell, sea cradle, polyplacophore",
+    117: "chambered nautilus, pearly nautilus, nautilus",
+    118: "Dungeness crab, Cancer magister",
+    119: "rock crab, Cancer irroratus",
+    120: "fiddler crab",
+    121: "king crab, Alaska crab, Alaskan king crab, Alaska king crab, "
+    "Paralithodes camtschatica",
+    122: "American lobster, Northern lobster, Maine lobster, Homarus americanus",
+    123: "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
+    124: "crayfish, crawfish, crawdad, crawdaddy",
+    125: "hermit crab",
+    126: "isopod",
+    127: "white stork, Ciconia ciconia",
+    128: "black stork, Ciconia nigra",
+    129: "spoonbill",
+    130: "flamingo",
+    131: "little blue heron, Egretta caerulea",
+    132: "American egret, great white heron, Egretta albus",
+    133: "bittern",
+    134: "crane",
+    135: "limpkin, Aramus pictus",
+    136: "European gallinule, Porphyrio porphyrio",
+    137: "American coot, marsh hen, mud hen, water hen, Fulica americana",
+    138: "bustard",
+    139: "ruddy turnstone, Arenaria interpres",
+    140: "red-backed sandpiper, dunlin, Erolia alpina",
+    141: "redshank, Tringa totanus",
+    142: "dowitcher",
+    143: "oystercatcher, oyster catcher",
+    144: "pelican",
+    145: "king penguin, Aptenodytes patagonica",
+    146: "albatross, mollymawk",
+    147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, "
+    "Eschrichtius robustus",
+    148: "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
+    149: "dugong, Dugong dugon",
+    150: "sea lion",
+    151: "Chihuahua",
+    152: "Japanese spaniel",
+    153: "Maltese dog, Maltese terrier, Maltese",
+    154: "Pekinese, Pekingese, Peke",
+    155: "Shih-Tzu",
+    156: "Blenheim spaniel",
+    157: "papillon",
+    158: "toy terrier",
+    159: "Rhodesian ridgeback",
+    160: "Afghan hound, Afghan",
+    161: "basset, basset hound",
+    162: "beagle",
+    163: "bloodhound, sleuthhound",
+    164: "bluetick",
+    165: "black-and-tan coonhound",
+    166: "Walker hound, Walker foxhound",
+    167: "English foxhound",
+    168: "redbone",
+    169: "borzoi, Russian wolfhound",
+    170: "Irish wolfhound",
+    171: "Italian greyhound",
+    172: "whippet",
+    173: "Ibizan hound, Ibizan Podenco",
+    174: "Norwegian elkhound, elkhound",
+    175: "otterhound, otter hound",
+    176: "Saluki, gazelle hound",
+    177: "Scottish deerhound, deerhound",
+    178: "Weimaraner",
+    179: "Staffordshire bullterrier, Staffordshire bull terrier",
+    180: "American Staffordshire terrier, Staffordshire terrier, American "
+    "pit bull terrier, pit bull terrier",
+    181: "Bedlington terrier",
+    182: "Border terrier",
+    183: "Kerry blue terrier",
+    184: "Irish terrier",
+    185: "Norfolk terrier",
+    186: "Norwich terrier",
+    187: "Yorkshire terrier",
+    188: "wire-haired fox terrier",
+    189: "Lakeland terrier",
+    190: "Sealyham terrier, Sealyham",
+    191: "Airedale, Airedale terrier",
+    192: "cairn, cairn terrier",
+    193: "Australian terrier",
+    194: "Dandie Dinmont, Dandie Dinmont terrier",
+    195: "Boston bull, Boston terrier",
+    196: "miniature schnauzer",
+    197: "giant schnauzer",
+    198: "standard schnauzer",
+    199: "Scotch terrier, Scottish terrier, Scottie",
+    200: "Tibetan terrier, chrysanthemum dog",
+    201: "silky terrier, Sydney silky",
+    202: "soft-coated wheaten terrier",
+    203: "West Highland white terrier",
+    204: "Lhasa, Lhasa apso",
+    205: "flat-coated retriever",
+    206: "curly-coated retriever",
+    207: "golden retriever",
+    208: "Labrador retriever",
+    209: "Chesapeake Bay retriever",
+    210: "German short-haired pointer",
+    211: "vizsla, Hungarian pointer",
+    212: "English setter",
+    213: "Irish setter, red setter",
+    214: "Gordon setter",
+    215: "Brittany spaniel",
+    216: "clumber, clumber spaniel",
+    217: "English springer, English springer spaniel",
+    218: "Welsh springer spaniel",
+    219: "cocker spaniel, English cocker spaniel, cocker",
+    220: "Sussex spaniel",
+    221: "Irish water spaniel",
+    222: "kuvasz",
+    223: "schipperke",
+    224: "groenendael",
+    225: "malinois",
+    226: "briard",
+    227: "kelpie",
+    228: "komondor",
+    229: "Old English sheepdog, bobtail",
+    230: "Shetland sheepdog, Shetland sheep dog, Shetland",
+    231: "collie",
+    232: "Border collie",
+    233: "Bouvier des Flandres, Bouviers des Flandres",
+    234: "Rottweiler",
+    235: "German shepherd, German shepherd dog, German police dog, alsatian",
+    236: "Doberman, Doberman pinscher",
+    237: "miniature pinscher",
+    238: "Greater Swiss Mountain dog",
+    239: "Bernese mountain dog",
+    240: "Appenzeller",
+    241: "EntleBucher",
+    242: "boxer",
+    243: "bull mastiff",
+    244: "Tibetan mastiff",
+    245: "French bulldog",
+    246: "Great Dane",
+    247: "Saint Bernard, St Bernard",
+    248: "Eskimo dog, husky",
+    249: "malamute, malemute, Alaskan malamute",
+    250: "Siberian husky",
+    251: "dalmatian, coach dog, carriage dog",
+    252: "affenpinscher, monkey pinscher, monkey dog",
+    253: "basenji",
+    254: "pug, pug-dog",
+    255: "Leonberg",
+    256: "Newfoundland, Newfoundland dog",
+    257: "Great Pyrenees",
+    258: "Samoyed, Samoyede",
+    259: "Pomeranian",
+    260: "chow, chow chow",
+    261: "keeshond",
+    262: "Brabancon griffon",
+    263: "Pembroke, Pembroke Welsh corgi",
+    264: "Cardigan, Cardigan Welsh corgi",
+    265: "toy poodle",
+    266: "miniature poodle",
+    267: "standard poodle",
+    268: "Mexican hairless",
+    269: "timber wolf, grey wolf, gray wolf, Canis lupus",
+    270: "white wolf, Arctic wolf, Canis lupus tundrarum",
+    271: "red wolf, maned wolf, Canis rufus, Canis niger",
+    272: "coyote, prairie wolf, brush wolf, Canis latrans",
+    273: "dingo, warrigal, warragal, Canis dingo",
+    274: "dhole, Cuon alpinus",
+    275: "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
+    276: "hyena, hyaena",
+    277: "red fox, Vulpes vulpes",
+    278: "kit fox, Vulpes macrotis",
+    279: "Arctic fox, white fox, Alopex lagopus",
+    280: "grey fox, gray fox, Urocyon cinereoargenteus",
+    281: "tabby, tabby cat",
+    282: "tiger cat",
+    283: "Persian cat",
+    284: "Siamese cat, Siamese",
+    285: "Egyptian cat",
+    286: "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
+    287: "lynx, catamount",
+    288: "leopard, Panthera pardus",
+    289: "snow leopard, ounce, Panthera uncia",
+    290: "jaguar, panther, Panthera onca, Felis onca",
+    291: "lion, king of beasts, Panthera leo",
+    292: "tiger, Panthera tigris",
+    293: "cheetah, chetah, Acinonyx jubatus",
+    294: "brown bear, bruin, Ursus arctos",
+    295: "American black bear, black bear, Ursus americanus, Euarctos americanus",
+    296: "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
+    297: "sloth bear, Melursus ursinus, Ursus ursinus",
+    298: "mongoose",
+    299: "meerkat, mierkat",
+    300: "tiger beetle",
+    301: "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
+    302: "ground beetle, carabid beetle",
+    303: "long-horned beetle, longicorn, longicorn beetle",
+    304: "leaf beetle, chrysomelid",
+    305: "dung beetle",
+    306: "rhinoceros beetle",
+    307: "weevil",
+    308: "fly",
+    309: "bee",
+    310: "ant, emmet, pismire",
+    311: "grasshopper, hopper",
+    312: "cricket",
+    313: "walking stick, walkingstick, stick insect",
+    314: "cockroach, roach",
+    315: "mantis, mantid",
+    316: "cicada, cicala",
+    317: "leafhopper",
+    318: "lacewing, lacewing fly",
+    319: "dragonfly, darning needle, devil's darning needle, sewing needle, "
+    "snake feeder, snake doctor, mosquito hawk, skeeter hawk",
+    320: "damselfly",
+    321: "admiral",
+    322: "ringlet, ringlet butterfly",
+    323: "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
+    324: "cabbage butterfly",
+    325: "sulphur butterfly, sulfur butterfly",
+    326: "lycaenid, lycaenid butterfly",
+    327: "starfish, sea star",
+    328: "sea urchin",
+    329: "sea cucumber, holothurian",
+    330: "wood rabbit, cottontail, cottontail rabbit",
+    331: "hare",
+    332: "Angora, Angora rabbit",
+    333: "hamster",
+    334: "porcupine, hedgehog",
+    335: "fox squirrel, eastern fox squirrel, Sciurus niger",
+    336: "marmot",
+    337: "beaver",
+    338: "guinea pig, Cavia cobaya",
+    339: "sorrel",
+    340: "zebra",
+    341: "hog, pig, grunter, squealer, Sus scrofa",
+    342: "wild boar, boar, Sus scrofa",
+    343: "warthog",
+    344: "hippopotamus, hippo, river horse, Hippopotamus amphibius",
+    345: "ox",
+    346: "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
+    347: "bison",
+    348: "ram, tup",
+    349: "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, "
+    "Rocky Mountain sheep, Ovis canadensis",
+    350: "ibex, Capra ibex",
+    351: "hartebeest",
+    352: "impala, Aepyceros melampus",
+    353: "gazelle",
+    354: "Arabian camel, dromedary, Camelus dromedarius",
+    355: "llama",
+    356: "weasel",
+    357: "mink",
+    358: "polecat, fitch, foulmart, foumart, Mustela putorius",
+    359: "black-footed ferret, ferret, Mustela nigripes",
+    360: "otter",
+    361: "skunk, polecat, wood pussy",
+    362: "badger",
+    363: "armadillo",
+    364: "three-toed sloth, ai, Bradypus tridactylus",
+    365: "orangutan, orang, orangutang, Pongo pygmaeus",
+    366: "gorilla, Gorilla gorilla",
+    367: "chimpanzee, chimp, Pan troglodytes",
+    368: "gibbon, Hylobates lar",
+    369: "siamang, Hylobates syndactylus, Symphalangus syndactylus",
+    370: "guenon, guenon monkey",
+    371: "patas, hussar monkey, Erythrocebus patas",
+    372: "baboon",
+    373: "macaque",
+    374: "langur",
+    375: "colobus, colobus monkey",
+    376: "proboscis monkey, Nasalis larvatus",
+    377: "marmoset",
+    378: "capuchin, ringtail, Cebus capucinus",
+    379: "howler monkey, howler",
+    380: "titi, titi monkey",
+    381: "spider monkey, Ateles geoffroyi",
+    382: "squirrel monkey, Saimiri sciureus",
+    383: "Madagascar cat, ring-tailed lemur, Lemur catta",
+    384: "indri, indris, Indri indri, Indri brevicaudatus",
+    385: "Indian elephant, Elephas maximus",
+    386: "African elephant, Loxodonta africana",
+    387: "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
+    388: "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
+    389: "barracouta, snoek",
+    390: "eel",
+    391: "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
+    392: "rock beauty, Holocanthus tricolor",
+    393: "anemone fish",
+    394: "sturgeon",
+    395: "gar, garfish, garpike, billfish, Lepisosteus osseus",
+    396: "lionfish",
+    397: "puffer, pufferfish, blowfish, globefish",
+    398: "abacus",
+    399: "abaya",
+    400: "academic gown, academic robe, judge's robe",
+    401: "accordion, piano accordion, squeeze box",
+    402: "acoustic guitar",
+    403: "aircraft carrier, carrier, flattop, attack aircraft carrier",
+    404: "airliner",
+    405: "airship, dirigible",
+    406: "altar",
+    407: "ambulance",
+    408: "amphibian, amphibious vehicle",
+    409: "analog clock",
+    410: "apiary, bee house",
+    411: "apron",
+    412: "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, "
+    "dustbin, trash barrel, trash bin",
+    413: "assault rifle, assault gun",
+    414: "backpack, back pack, knapsack, packsack, rucksack, haversack",
+    415: "bakery, bakeshop, bakehouse",
+    416: "balance beam, beam",
+    417: "balloon",
+    418: "ballpoint, ballpoint pen, ballpen, Biro",
+    419: "Band Aid",
+    420: "banjo",
+    421: "bannister, banister, balustrade, balusters, handrail",
+    422: "barbell",
+    423: "barber chair",
+    424: "barbershop",
+    425: "barn",
+    426: "barometer",
+    427: "barrel, cask",
+    428: "barrow, garden cart, lawn cart, wheelbarrow",
+    429: "baseball",
+    430: "basketball",
+    431: "bassinet",
+    432: "bassoon",
+    433: "bathing cap, swimming cap",
+    434: "bath towel",
+    435: "bathtub, bathing tub, bath, tub",
+    436: "beach wagon, station wagon, wagon, estate car, beach waggon, "
+    "station waggon, waggon",
+    437: "beacon, lighthouse, beacon light, pharos",
+    438: "beaker",
+    439: "bearskin, busby, shako",
+    440: "beer bottle",
+    441: "beer glass",
+    442: "bell cote, bell cot",
+    443: "bib",
+    444: "bicycle-built-for-two, tandem bicycle, tandem",
+    445: "bikini, two-piece",
+    446: "binder, ring-binder",
+    447: "binoculars, field glasses, opera glasses",
+    448: "birdhouse",
+    449: "boathouse",
+    450: "bobsled, bobsleigh, bob",
+    451: "bolo tie, bolo, bola tie, bola",
+    452: "bonnet, poke bonnet",
+    453: "bookcase",
+    454: "bookshop, bookstore, bookstall",
+    455: "bottlecap",
+    456: "bow",
+    457: "bow tie, bow-tie, bowtie",
+    458: "brass, memorial tablet, plaque",
+    459: "brassiere, bra, bandeau",
+    460: "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
+    461: "breastplate, aegis, egis",
+    462: "broom",
+    463: "bucket, pail",
+    464: "buckle",
+    465: "bulletproof vest",
+    466: "bullet train, bullet",
+    467: "butcher shop, meat market",
+    468: "cab, hack, taxi, taxicab",
+    469: "caldron, cauldron",
+    470: "candle, taper, wax light",
+    471: "cannon",
+    472: "canoe",
+    473: "can opener, tin opener",
+    474: "cardigan",
+    475: "car mirror",
+    476: "carousel, carrousel, merry-go-round, roundabout, whirligig",
+    477: "carpenter's kit, tool kit",
+    478: "carton",
+    479: "car wheel",
+    480: "cash machine, cash dispenser, automated teller machine, automatic "
+    "teller machine, automated teller, automatic teller, ATM",
+    481: "cassette",
+    482: "cassette player",
+    483: "castle",
+    484: "catamaran",
+    485: "CD player",
+    486: "cello, violoncello",
+    487: "cellular telephone, cellular phone, cellphone, cell, mobile phone",
+    488: "chain",
+    489: "chainlink fence",
+    490: "chain mail, ring mail, mail, chain armor, chain armour, ring armor, "
+    "ring armour",
+    491: "chain saw, chainsaw",
+    492: "chest",
+    493: "chiffonier, commode",
+    494: "chime, bell, gong",
+    495: "china cabinet, china closet",
+    496: "Christmas stocking",
+    497: "church, church building",
+    498: "cinema, movie theater, movie theatre, movie house, picture palace",
+    499: "cleaver, meat cleaver, chopper",
+    500: "cliff dwelling",
+    501: "cloak",
+    502: "clog, geta, patten, sabot",
+    503: "cocktail shaker",
+    504: "coffee mug",
+    505: "coffeepot",
+    506: "coil, spiral, volute, whorl, helix",
+    507: "combination lock",
+    508: "computer keyboard, keypad",
+    509: "confectionery, confectionary, candy store",
+    510: "container ship, containership, container vessel",
+    511: "convertible",
+    512: "corkscrew, bottle screw",
+    513: "cornet, horn, trumpet, trump",
+    514: "cowboy boot",
+    515: "cowboy hat, ten-gallon hat",
+    516: "cradle",
+    517: "crane",
+    518: "crash helmet",
+    519: "crate",
+    520: "crib, cot",
+    521: "Crock Pot",
+    522: "croquet ball",
+    523: "crutch",
+    524: "cuirass",
+    525: "dam, dike, dyke",
+    526: "desk",
+    527: "desktop computer",
+    528: "dial telephone, dial phone",
+    529: "diaper, nappy, napkin",
+    530: "digital clock",
+    531: "digital watch",
+    532: "dining table, board",
+    533: "dishrag, dishcloth",
+    534: "dishwasher, dish washer, dishwashing machine",
+    535: "disk brake, disc brake",
+    536: "dock, dockage, docking facility",
+    537: "dogsled, dog sled, dog sleigh",
+    538: "dome",
+    539: "doormat, welcome mat",
+    540: "drilling platform, offshore rig",
+    541: "drum, membranophone, tympan",
+    542: "drumstick",
+    543: "dumbbell",
+    544: "Dutch oven",
+    545: "electric fan, blower",
+    546: "electric guitar",
+    547: "electric locomotive",
+    548: "entertainment center",
+    549: "envelope",
+    550: "espresso maker",
+    551: "face powder",
+    552: "feather boa, boa",
+    553: "file, file cabinet, filing cabinet",
+    554: "fireboat",
+    555: "fire engine, fire truck",
+    556: "fire screen, fireguard",
+    557: "flagpole, flagstaff",
+    558: "flute, transverse flute",
+    559: "folding chair",
+    560: "football helmet",
+    561: "forklift",
+    562: "fountain",
+    563: "fountain pen",
+    564: "four-poster",
+    565: "freight car",
+    566: "French horn, horn",
+    567: "frying pan, frypan, skillet",
+    568: "fur coat",
+    569: "garbage truck, dustcart",
+    570: "gasmask, respirator, gas helmet",
+    571: "gas pump, gasoline pump, petrol pump, island dispenser",
+    572: "goblet",
+    573: "go-kart",
+    574: "golf ball",
+    575: "golfcart, golf cart",
+    576: "gondola",
+    577: "gong, tam-tam",
+    578: "gown",
+    579: "grand piano, grand",
+    580: "greenhouse, nursery, glasshouse",
+    581: "grille, radiator grille",
+    582: "grocery store, grocery, food market, market",
+    583: "guillotine",
+    584: "hair slide",
+    585: "hair spray",
+    586: "half track",
+    587: "hammer",
+    588: "hamper",
+    589: "hand blower, blow dryer, blow drier, hair dryer, hair drier",
+    590: "hand-held computer, hand-held microcomputer",
+    591: "handkerchief, hankie, hanky, hankey",
+    592: "hard disc, hard disk, fixed disk",
+    593: "harmonica, mouth organ, harp, mouth harp",
+    594: "harp",
+    595: "harvester, reaper",
+    596: "hatchet",
+    597: "holster",
+    598: "home theater, home theatre",
+    599: "honeycomb",
+    600: "hook, claw",
+    601: "hoopskirt, crinoline",
+    602: "horizontal bar, high bar",
+    603: "horse cart, horse-cart",
+    604: "hourglass",
+    605: "iPod",
+    606: "iron, smoothing iron",
+    607: "jack-o'-lantern",
+    608: "jean, blue jean, denim",
+    609: "jeep, landrover",
+    610: "jersey, T-shirt, tee shirt",
+    611: "jigsaw puzzle",
+    612: "jinrikisha, ricksha, rickshaw",
+    613: "joystick",
+    614: "kimono",
+    615: "knee pad",
+    616: "knot",
+    617: "lab coat, laboratory coat",
+    618: "ladle",
+    619: "lampshade, lamp shade",
+    620: "laptop, laptop computer",
+    621: "lawn mower, mower",
+    622: "lens cap, lens cover",
+    623: "letter opener, paper knife, paperknife",
+    624: "library",
+    625: "lifeboat",
+    626: "lighter, light, igniter, ignitor",
+    627: "limousine, limo",
+    628: "liner, ocean liner",
+    629: "lipstick, lip rouge",
+    630: "Loafer",
+    631: "lotion",
+    632: "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
+    633: "loupe, jeweler's loupe",
+    634: "lumbermill, sawmill",
+    635: "magnetic compass",
+    636: "mailbag, postbag",
+    637: "mailbox, letter box",
+    638: "maillot",
+    639: "maillot, tank suit",
+    640: "manhole cover",
+    641: "maraca",
+    642: "marimba, xylophone",
+    643: "mask",
+    644: "matchstick",
+    645: "maypole",
+    646: "maze, labyrinth",
+    647: "measuring cup",
+    648: "medicine chest, medicine cabinet",
+    649: "megalith, megalithic structure",
+    650: "microphone, mike",
+    651: "microwave, microwave oven",
+    652: "military uniform",
+    653: "milk can",
+    654: "minibus",
+    655: "miniskirt, mini",
+    656: "minivan",
+    657: "missile",
+    658: "mitten",
+    659: "mixing bowl",
+    660: "mobile home, manufactured home",
+    661: "Model T",
+    662: "modem",
+    663: "monastery",
+    664: "monitor",
+    665: "moped",
+    666: "mortar",
+    667: "mortarboard",
+    668: "mosque",
+    669: "mosquito net",
+    670: "motor scooter, scooter",
+    671: "mountain bike, all-terrain bike, off-roader",
+    672: "mountain tent",
+    673: "mouse, computer mouse",
+    674: "mousetrap",
+    675: "moving van",
+    676: "muzzle",
+    677: "nail",
+    678: "neck brace",
+    679: "necklace",
+    680: "nipple",
+    681: "notebook, notebook computer",
+    682: "obelisk",
+    683: "oboe, hautboy, hautbois",
+    684: "ocarina, sweet potato",
+    685: "odometer, hodometer, mileometer, milometer",
+    686: "oil filter",
+    687: "organ, pipe organ",
+    688: "oscilloscope, scope, cathode-ray oscilloscope, CRO",
+    689: "overskirt",
+    690: "oxcart",
+    691: "oxygen mask",
+    692: "packet",
+    693: "paddle, boat paddle",
+    694: "paddlewheel, paddle wheel",
+    695: "padlock",
+    696: "paintbrush",
+    697: "pajama, pyjama, pj's, jammies",
+    698: "palace",
+    699: "panpipe, pandean pipe, syrinx",
+    700: "paper towel",
+    701: "parachute, chute",
+    702: "parallel bars, bars",
+    703: "park bench",
+    704: "parking meter",
+    705: "passenger car, coach, carriage",
+    706: "patio, terrace",
+    707: "pay-phone, pay-station",
+    708: "pedestal, plinth, footstall",
+    709: "pencil box, pencil case",
+    710: "pencil sharpener",
+    711: "perfume, essence",
+    712: "Petri dish",
+    713: "photocopier",
+    714: "pick, plectrum, plectron",
+    715: "pickelhaube",
+    716: "picket fence, paling",
+    717: "pickup, pickup truck",
+    718: "pier",
+    719: "piggy bank, penny bank",
+    720: "pill bottle",
+    721: "pillow",
+    722: "ping-pong ball",
+    723: "pinwheel",
+    724: "pirate, pirate ship",
+    725: "pitcher, ewer",
+    726: "plane, carpenter's plane, woodworking plane",
+    727: "planetarium",
+    728: "plastic bag",
+    729: "plate rack",
+    730: "plow, plough",
+    731: "plunger, plumber's helper",
+    732: "Polaroid camera, Polaroid Land camera",
+    733: "pole",
+    734: "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
+    735: "poncho",
+    736: "pool table, billiard table, snooker table",
+    737: "pop bottle, soda bottle",
+    738: "pot, flowerpot",
+    739: "potter's wheel",
+    740: "power drill",
+    741: "prayer rug, prayer mat",
+    742: "printer",
+    743: "prison, prison house",
+    744: "projectile, missile",
+    745: "projector",
+    746: "puck, hockey puck",
+    747: "punching bag, punch bag, punching ball, punchball",
+    748: "purse",
+    749: "quill, quill pen",
+    750: "quilt, comforter, comfort, puff",
+    751: "racer, race car, racing car",
+    752: "racket, racquet",
+    753: "radiator",
+    754: "radio, wireless",
+    755: "radio telescope, radio reflector",
+    756: "rain barrel",
+    757: "recreational vehicle, RV, R.V.",
+    758: "reel",
+    759: "reflex camera",
+    760: "refrigerator, icebox",
+    761: "remote control, remote",
+    762: "restaurant, eating house, eating place, eatery",
+    763: "revolver, six-gun, six-shooter",
+    764: "rifle",
+    765: "rocking chair, rocker",
+    766: "rotisserie",
+    767: "rubber eraser, rubber, pencil eraser",
+    768: "rugby ball",
+    769: "rule, ruler",
+    770: "running shoe",
+    771: "safe",
+    772: "safety pin",
+    773: "saltshaker, salt shaker",
+    774: "sandal",
+    775: "sarong",
+    776: "sax, saxophone",
+    777: "scabbard",
+    778: "scale, weighing machine",
+    779: "school bus",
+    780: "schooner",
+    781: "scoreboard",
+    782: "screen, CRT screen",
+    783: "screw",
+    784: "screwdriver",
+    785: "seat belt, seatbelt",
+    786: "sewing machine",
+    787: "shield, buckler",
+    788: "shoe shop, shoe-shop, shoe store",
+    789: "shoji",
+    790: "shopping basket",
+    791: "shopping cart",
+    792: "shovel",
+    793: "shower cap",
+    794: "shower curtain",
+    795: "ski",
+    796: "ski mask",
+    797: "sleeping bag",
+    798: "slide rule, slipstick",
+    799: "sliding door",
+    800: "slot, one-armed bandit",
+    801: "snorkel",
+    802: "snowmobile",
+    803: "snowplow, snowplough",
+    804: "soap dispenser",
+    805: "soccer ball",
+    806: "sock",
+    807: "solar dish, solar collector, solar furnace",
+    808: "sombrero",
+    809: "soup bowl",
+    810: "space bar",
+    811: "space heater",
+    812: "space shuttle",
+    813: "spatula",
+    814: "speedboat",
+    815: "spider web, spider's web",
+    816: "spindle",
+    817: "sports car, sport car",
+    818: "spotlight, spot",
+    819: "stage",
+    820: "steam locomotive",
+    821: "steel arch bridge",
+    822: "steel drum",
+    823: "stethoscope",
+    824: "stole",
+    825: "stone wall",
+    826: "stopwatch, stop watch",
+    827: "stove",
+    828: "strainer",
+    829: "streetcar, tram, tramcar, trolley, trolley car",
+    830: "stretcher",
+    831: "studio couch, day bed",
+    832: "stupa, tope",
+    833: "submarine, pigboat, sub, U-boat",
+    834: "suit, suit of clothes",
+    835: "sundial",
+    836: "sunglass",
+    837: "sunglasses, dark glasses, shades",
+    838: "sunscreen, sunblock, sun blocker",
+    839: "suspension bridge",
+    840: "swab, swob, mop",
+    841: "sweatshirt",
+    842: "swimming trunks, bathing trunks",
+    843: "swing",
+    844: "switch, electric switch, electrical switch",
+    845: "syringe",
+    846: "table lamp",
+    847: "tank, army tank, armored combat vehicle, armoured combat vehicle",
+    848: "tape player",
+    849: "teapot",
+    850: "teddy, teddy bear",
+    851: "television, television system",
+    852: "tennis ball",
+    853: "thatch, thatched roof",
+    854: "theater curtain, theatre curtain",
+    855: "thimble",
+    856: "thresher, thrasher, threshing machine",
+    857: "throne",
+    858: "tile roof",
+    859: "toaster",
+    860: "tobacco shop, tobacconist shop, tobacconist",
+    861: "toilet seat",
+    862: "torch",
+    863: "totem pole",
+    864: "tow truck, tow car, wrecker",
+    865: "toyshop",
+    866: "tractor",
+    867: "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
+    868: "tray",
+    869: "trench coat",
+    870: "tricycle, trike, velocipede",
+    871: "trimaran",
+    872: "tripod",
+    873: "triumphal arch",
+    874: "trolleybus, trolley coach, trackless trolley",
+    875: "trombone",
+    876: "tub, vat",
+    877: "turnstile",
+    878: "typewriter keyboard",
+    879: "umbrella",
+    880: "unicycle, monocycle",
+    881: "upright, upright piano",
+    882: "vacuum, vacuum cleaner",
+    883: "vase",
+    884: "vault",
+    885: "velvet",
+    886: "vending machine",
+    887: "vestment",
+    888: "viaduct",
+    889: "violin, fiddle",
+    890: "volleyball",
+    891: "waffle iron",
+    892: "wall clock",
+    893: "wallet, billfold, notecase, pocketbook",
+    894: "wardrobe, closet, press",
+    895: "warplane, military plane",
+    896: "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
+    897: "washer, automatic washer, washing machine",
+    898: "water bottle",
+    899: "water jug",
+    900: "water tower",
+    901: "whiskey jug",
+    902: "whistle",
+    903: "wig",
+    904: "window screen",
+    905: "window shade",
+    906: "Windsor tie",
+    907: "wine bottle",
+    908: "wing",
+    909: "wok",
+    910: "wooden spoon",
+    911: "wool, woolen, woollen",
+    912: "worm fence, snake fence, snake-rail fence, Virginia fence",
+    913: "wreck",
+    914: "yawl",
+    915: "yurt",
+    916: "web site, website, internet site, site",
+    917: "comic book",
+    918: "crossword puzzle, crossword",
+    919: "street sign",
+    920: "traffic light, traffic signal, stoplight",
+    921: "book jacket, dust cover, dust jacket, dust wrapper",
+    922: "menu",
+    923: "plate",
+    924: "guacamole",
+    925: "consomme",
+    926: "hot pot, hotpot",
+    927: "trifle",
+    928: "ice cream, icecream",
+    929: "ice lolly, lolly, lollipop, popsicle",
+    930: "French loaf",
+    931: "bagel, beigel",
+    932: "pretzel",
+    933: "cheeseburger",
+    934: "hotdog, hot dog, red hot",
+    935: "mashed potato",
+    936: "head cabbage",
+    937: "broccoli",
+    938: "cauliflower",
+    939: "zucchini, courgette",
+    940: "spaghetti squash",
+    941: "acorn squash",
+    942: "butternut squash",
+    943: "cucumber, cuke",
+    944: "artichoke, globe artichoke",
+    945: "bell pepper",
+    946: "cardoon",
+    947: "mushroom",
+    948: "Granny Smith",
+    949: "strawberry",
+    950: "orange",
+    951: "lemon",
+    952: "fig",
+    953: "pineapple, ananas",
+    954: "banana",
+    955: "jackfruit, jak, jack",
+    956: "custard apple",
+    957: "pomegranate",
+    958: "hay",
+    959: "carbonara",
+    960: "chocolate sauce, chocolate syrup",
+    961: "dough",
+    962: "meat loaf, meatloaf",
+    963: "pizza, pizza pie",
+    964: "potpie",
+    965: "burrito",
+    966: "red wine",
+    967: "espresso",
+    968: "cup",
+    969: "eggnog",
+    970: "alp",
+    971: "bubble",
+    972: "cliff, drop, drop-off",
+    973: "coral reef",
+    974: "geyser",
+    975: "lakeside, lakeshore",
+    976: "promontory, headland, head, foreland",
+    977: "sandbar, sand bar",
+    978: "seashore, coast, seacoast, sea-coast",
+    979: "valley, vale",
+    980: "volcano",
+    981: "ballplayer, baseball player",
+    982: "groom, bridegroom",
+    983: "scuba diver",
+    984: "rapeseed",
+    985: "daisy",
+    986: "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, "
+    "Cypripedium parviflorum",
+    987: "corn",
+    988: "acorn",
+    989: "hip, rose hip, rosehip",
+    990: "buckeye, horse chestnut, conker",
+    991: "coral fungus",
+    992: "agaric",
+    993: "gyromitra",
+    994: "stinkhorn, carrion fungus",
+    995: "earthstar",
+    996: "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
+    997: "bolete",
+    998: "ear, spike, capitulum",
+    999: "toilet tissue, toilet paper, bathroom tissue",
+}
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index 03a9d435a6d04659b2891815b33172586a7f0a96..0a34751786170a03361d6a17a24c7250c5ce49fd 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -30,7 +30,7 @@ import onnx
 import onnx.numpy_helper as nph
 import pkg_resources as pk
 from pkgutil import get_data
-from brevitas_examples import bnn_pynq
+from brevitas_examples import bnn_pynq, imagenet_classification
 import numpy as np
 import pytest
 import warnings
@@ -41,6 +41,7 @@ from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy
 from finn.custom_op.registry import getCustomOp
 from finn.core.onnx_exec import execute_onnx
+import torchvision.transforms.functional as torchvision_util
 
 # map of (wbits,abits) -> model
 example_map = {
@@ -55,6 +56,7 @@ example_map = {
     ("TFC", 1, 1): bnn_pynq.tfc_1w1a,
     ("TFC", 1, 2): bnn_pynq.tfc_1w2a,
     ("TFC", 2, 2): bnn_pynq.tfc_2w2a,
+    ("mobilenet", 4, 4): imagenet_classification.quant_mobilenet_v1_4b,
 }
 
 
@@ -157,6 +159,7 @@ def get_trained_network_and_ishape(topology, wbits, abits):
 
     topology_to_ishape = {
         "tfc": (1, 1, 28, 28),
+        "lfc": (1, 1, 28, 28),
         "cnv": (1, 3, 32, 32),
     }
     ishape = topology_to_ishape[topology]
@@ -179,3 +182,14 @@ def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=Fa
         return ret
     else:
         return ret[oname]
+
+
+def resize_smaller_side(target_pixels, img):
+    """Resizes smallest side of image to target pixels and resizes larger side with
+    same ratio. Expects a PIL image."""
+    return torchvision_util.resize(img, target_pixels)
+
+
+def crop_center(size, img):
+    """Crop central size*size window out of a PIL image."""
+    return torchvision_util.center_crop(img, size)
diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py
deleted file mode 100644
index 6b6df3940cfeeed292345382471719c49f725de6..0000000000000000000000000000000000000000
--- a/src/finn/util/vivado.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import os
-import subprocess
-import stat
-from finn.util.basic import get_remote_vivado
-
-
-def which(program):
-    "Python equivalent of the shell cmd 'which'."
-
-    # source:
-    # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
-    def is_exe(fpath):
-        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
-
-    fpath, fname = os.path.split(program)
-    if fpath:
-        if is_exe(program):
-            return program
-    else:
-        for path in os.environ["PATH"].split(os.pathsep):
-            exe_file = os.path.join(path, program)
-            if is_exe(exe_file):
-                return exe_file
-
-    return None
-
-
-def out_of_context_synth(
-    verilog_dir,
-    top_name,
-    fpga_part="xczu3eg-sbva484-1-e",
-    clk_name="ap_clk_0",
-    clk_period_ns=5.0,
-    remote_server=get_remote_vivado(),
-):
-    "Run out-of-context Vivado synthesis, return resources and slack."
-
-    # ensure that the OH_MY_XILINX envvar is set
-    if "OHMYXILINX" not in os.environ:
-        raise Exception("The environment variable OHMYXILINX is not defined.")
-    # ensure that vivado is in PATH: source $VIVADO_PATH/settings64.sh
-    if which("vivado") is None:
-        raise Exception("vivado is not in PATH, ensure settings64.sh is sourced.")
-    omx_path = os.environ["OHMYXILINX"]
-    if remote_server is None:
-        script = "vivadocompile.sh"
-    else:
-        script = "vivadoprojgen.sh"
-    # vivadocompile.sh <top-level-entity> <clock-name (optional)> <fpga-part (optional)>
-    call_omx = "zsh %s/%s %s %s %s %f" % (
-        omx_path,
-        script,
-        top_name,
-        clk_name,
-        fpga_part,
-        float(clk_period_ns),
-    )
-    call_omx = call_omx.split()
-    proc = subprocess.Popen(
-        call_omx, cwd=verilog_dir, stdout=subprocess.PIPE, env=os.environ
-    )
-    proc.communicate()
-
-    vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name)
-    res_counts_path = vivado_proj_folder + "/res.txt"
-    if remote_server is not None:
-        print("Using remote Vivado OOC synth, remote server %s" % remote_server)
-        run_synth = """
-#!/bin/bash
-which vivado;
-cd %s;
-vivado -mode tcl -source %s.tcl -tclargs %s;
-cat %s
-        """ % (
-            vivado_proj_folder,
-            top_name,
-            top_name,
-            res_counts_path,
-        )
-        with open(vivado_proj_folder + "/run.sh", "w") as f:
-            f.write(run_synth)
-        st = os.stat(vivado_proj_folder + "/run.sh")
-        os.chmod(vivado_proj_folder + "/run.sh", st.st_mode | stat.S_IEXEC)
-        # note that this assumes the same temp folder can be created on the
-        # remote server
-        # note we set target path as / due to use of -R (relative)
-        remote_server_uri = remote_server + ":/"
-        copy_files = "rsync -avzR %s %s" % (verilog_dir + "/", remote_server_uri)
-        copy_files = copy_files.split()
-        proc = subprocess.Popen(copy_files, cwd=verilog_dir, env=os.environ)
-        proc.communicate()
-        vivado_cmd = "bash -ic %s/run.sh" % vivado_proj_folder
-        run_vivado = ["ssh", "-t", remote_server, vivado_cmd]
-        proc = subprocess.Popen(run_vivado, cwd=verilog_dir, env=os.environ)
-        proc.communicate()
-        remote_server_result = remote_server + ":" + res_counts_path
-        copy_results = "rsync -avz %s %s" % (remote_server_result, res_counts_path)
-        copy_results = copy_results.split()
-        proc = subprocess.Popen(copy_results, cwd=verilog_dir, env=os.environ)
-        proc.communicate()
-
-    with open(res_counts_path, "r") as myfile:
-        res_data = myfile.read().split("\n")
-    ret = {}
-    ret["vivado_proj_folder"] = vivado_proj_folder
-    for res_line in res_data:
-        res_fields = res_line.split("=")
-        print(res_fields)
-        try:
-            ret[res_fields[0]] = float(res_fields[1])
-        except ValueError:
-            ret[res_fields[0]] = 0
-        except IndexError:
-            ret[res_fields[0]] = 0
-    if ret["WNS"] == 0:
-        ret["fmax_mhz"] = 0
-    else:
-        ret["fmax_mhz"] = 1000.0 / (clk_period_ns - ret["WNS"])
-    return ret
diff --git a/tests/brevitas/king_charles.jpg b/tests/brevitas/king_charles.jpg
new file mode 100755
index 0000000000000000000000000000000000000000..c1400a484e686c3efe045c56e4fe02f3e0f8d17b
Binary files /dev/null and b/tests/brevitas/king_charles.jpg differ
diff --git a/tests/brevitas/test_brevitas_mobilenet.py b/tests/brevitas/test_brevitas_mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f937ef2afc9eb86665e26d703be9f01e2163a0
--- /dev/null
+++ b/tests/brevitas/test_brevitas_mobilenet.py
@@ -0,0 +1,91 @@
+from PIL import Image
+import numpy as np
+import brevitas.onnx as bo
+import pytest
+import torch
+from finn.util.basic import make_build_dir
+from finn.util.pytorch import NormalizePreProc
+from finn.util.test import get_test_model_trained, resize_smaller_side, crop_center
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    GiveUniqueParameterTensors,
+)
+from finn.transformation.merge_onnx_models import MergeONNXModels
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.insert_topk import InsertTopK
+import finn.core.onnx_exec as oxe
+
+
+@pytest.mark.xfail
+def test_brevitas_mobilenet():
+    # get single image as input and prepare image
+    img = Image.open("/workspace/finn/tests/brevitas/king_charles.jpg")
+    # resize the smaller side of the image to 256 pixels, scaling the larger
+    # side by the same ratio
+    img = resize_smaller_side(256, img)
+    # crop central 224*224 window
+    img = crop_center(224, img)
+    # save the image as a numpy array and as a torch tensor to enable testing
+    # in brevitas/pytorch and finn; transpose from (H, W, C) to (C, H, W)
+    img_np = np.asarray(img).copy().astype(np.float32).transpose(2, 0, 1)
+    img_np = img_np.reshape(1, 3, 224, 224)
+    img_torch = torch.from_numpy(img_np).float()
+
+    # export preprocess
+    export_onnx_path = make_build_dir("test_brevitas_mobilenet-v1_")
+    preproc_onnx = export_onnx_path + "/quant_mobilenet_v1_4b_preproc.onnx"
+    mean = [0.485, 0.456, 0.406]
+    std = 0.226
+    ch = 3
+    preproc = NormalizePreProc(mean, std, ch)
+    bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx)
+    preproc_model = ModelWrapper(preproc_onnx)
+    # set input finn datatype to UINT8
+    preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType.UINT8)
+    preproc_model = preproc_model.transform(InferShapes())
+    preproc_model = preproc_model.transform(GiveUniqueNodeNames())
+    preproc_model = preproc_model.transform(GiveUniqueParameterTensors())
+    preproc_model = preproc_model.transform(GiveReadableTensorNames())
+
+    finn_onnx = export_onnx_path + "/quant_mobilenet_v1_4b_exported.onnx"
+    mobilenet = get_test_model_trained("mobilenet", 4, 4)
+    bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx)
+
+    # do forward pass in PyTorch/Brevitas
+    input_tensor = preproc.forward(img_torch)
+    expected = mobilenet.forward(input_tensor).detach().numpy()
+    expected_topk = expected.flatten()
+    expected_top5 = np.argsort(expected_topk)[-5:]
+    expected_top5 = np.flip(expected_top5)
+    expected_top5_prob = []
+    for index in expected_top5:
+        expected_top5_prob.append(expected_topk[index])
+    model = ModelWrapper(finn_onnx)
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(InsertTopK())
+    # get initializer from Mul that will be absorbed into topk
+    a0 = model.get_initializer(model.graph.node[-2].input[1])
+    model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveUniqueParameterTensors())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(export_onnx_path + "/quant_mobilenet_v1_4b_wo_preproc.onnx")
+    model = model.transform(MergeONNXModels(preproc_model))
+    model.save(export_onnx_path + "/quant_mobilenet_v1_4b.onnx")
+    idict = {model.graph.input[0].name: img_np}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
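+    # TopK_0_out0 holds the top-5 values before the Mul scale that was absorbed
+    # into TopK; re-apply a0 to compare against the Brevitas scores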
+    produced_prob = odict["TopK_0_out0"] * a0
+    assert (produced.flatten() == expected_top5).all()
+    assert np.isclose(produced_prob.flatten(), expected_top5_prob).all()
diff --git a/tests/brevitas/test_brevitas_validate_mobilenet.py b/tests/brevitas/test_brevitas_validate_mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..42bc3942d1a4f5fbdf70dbb1f1b5e853357abff8
--- /dev/null
+++ b/tests/brevitas/test_brevitas_validate_mobilenet.py
@@ -0,0 +1,180 @@
+import os
+import csv
+import numpy as np
+import brevitas.onnx as bo
+import torch
+from finn.util.basic import make_build_dir
+from finn.util.pytorch import NormalizePreProc
+from finn.util.test import get_test_model_trained
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import RemoveStaticGraphInputs
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    GiveUniqueParameterTensors,
+)
+from finn.transformation.merge_onnx_models import MergeONNXModels
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.insert_topk import InsertTopK
+import finn.core.onnx_exec as oxe
+import finn.util.imagenet as imagenet_util
+import pytest
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+
+# normalization (preprocessing) settings for MobileNet-v1 w4a4
+mean = [0.485, 0.456, 0.406]
+std = 0.226
+ch = 3
+
+
+def test_brevitas_mobilenet_preproc():
+    if "IMAGENET_VAL_PATH" not in os.environ.keys():
+        pytest.skip("Can't do validation without IMAGENET_VAL_PATH")
+    n_images = 1000
+    # Brevitas-style: use torchvision pipeline
+    std_arr = [std, std, std]
+    normalize = transforms.Normalize(mean=mean, std=std_arr)
+    val_loader = torch.utils.data.DataLoader(
+        datasets.ImageFolder(
+            os.environ["IMAGENET_VAL_PATH"] + "/../",
+            transforms.Compose(
+                [
+                    transforms.Resize(256),
+                    transforms.CenterCrop(224),
+                    transforms.ToTensor(),
+                    normalize,
+                ]
+            ),
+        ),
+        batch_size=1,
+        shuffle=False,
+        num_workers=0,
+    )
+    # FINN-style: load_resize_crop then normalization as PyTorch graph
+    preproc = NormalizePreProc(mean, std, ch)
+    finn_loader = imagenet_util.get_val_images(n_images)
+    val_loader = iter(val_loader)
+    for i in range(n_images):
+        (img_path, finn_target) = next(finn_loader)
+        finn_img = imagenet_util.load_resize_crop(img_path)
+        finn_img = preproc.forward(torch.from_numpy(finn_img).float())
+        (pyt_img, pyt_target) = next(val_loader)
+        assert finn_img.shape == pyt_img.shape
+        assert (finn_img == pyt_img).all()
+
+
+@pytest.mark.slow
+# marked as XFAIL until Brevitas export issues are resolved:
+# https://github.com/Xilinx/brevitas/issues/173
+@pytest.mark.xfail
+def test_brevitas_compare_exported_mobilenet():
+    if "IMAGENET_VAL_PATH" not in os.environ.keys():
+        pytest.skip("Can't do validation without IMAGENET_VAL_PATH")
+    n_images = 10
+    debug_mode = False
+    export_onnx_path = make_build_dir("test_brevitas_mobilenet-v1_")
+    # export preprocessing
+    preproc_onnx = export_onnx_path + "/quant_mobilenet_v1_4b_preproc.onnx"
+    preproc = NormalizePreProc(mean, std, ch)
+    bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx)
+    preproc_model = ModelWrapper(preproc_onnx)
+    preproc_model = preproc_model.transform(InferShapes())
+    preproc_model = preproc_model.transform(GiveUniqueNodeNames())
+    preproc_model = preproc_model.transform(GiveUniqueParameterTensors())
+    preproc_model = preproc_model.transform(GiveReadableTensorNames())
+    # export the actual MobileNet-v1
+    finn_onnx = export_onnx_path + "/quant_mobilenet_v1_4b.onnx"
+    mobilenet = get_test_model_trained("mobilenet", 4, 4)
+    if debug_mode:
+        dbg_hook = bo.enable_debug(mobilenet)
+    bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx)
+    model = ModelWrapper(finn_onnx)
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(RemoveStaticGraphInputs())
+    model = model.transform(InsertTopK())
+    # get initializer from Mul that will be absorbed into topk
+
+    a0 = model.get_initializer(model.get_nodes_by_op_type("Mul")[-1].input[1])
+    model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveUniqueParameterTensors())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(export_onnx_path + "/quant_mobilenet_v1_4b_wo_preproc.onnx")
+    # create merged preprocessing + MobileNet-v1 model
+    model = model.transform(MergeONNXModels(preproc_model))
+    model.save(export_onnx_path + "/quant_mobilenet_v1_4b.onnx")
+
+    with open(
+        export_onnx_path + "/mobilenet_validation.csv", "w", newline=""
+    ) as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(
+            [
+                "goldenID",
+                "brevitasTop5",
+                "brevitasTop5[%]",
+                "finnTop5",
+                "finnTop5[%]",
+                "top5equal",
+                "top5%equal",
+            ]
+        )
+        csvfile.flush()
+        workload = imagenet_util.get_val_images(n_images, interleave_classes=True)
+        all_inds_ok = True
+        all_probs_ok = True
+        for (img_path, target_id) in workload:
+            img_np = imagenet_util.load_resize_crop(img_path)
+            img_torch = torch.from_numpy(img_np).float()
+            # do forward pass in PyTorch/Brevitas
+            input_tensor = preproc.forward(img_torch)
+            expected = mobilenet.forward(input_tensor).detach().numpy()
+            expected_topk = expected.flatten()
+            expected_top5 = np.argsort(expected_topk)[-5:]
+            expected_top5 = np.flip(expected_top5)
+            expected_top5_prob = []
+            for index in expected_top5:
+                expected_top5_prob.append(expected_topk[index])
+            idict = {model.graph.input[0].name: img_np}
+            odict = oxe.execute_onnx(model, idict, return_full_exec_context=True)
+            produced = odict[model.graph.output[0].name]
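+            # re-apply the absorbed Mul scale (a0) to compare against the Brevitas scores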
+            produced_prob = odict["TopK_0_out0"] * a0
+            inds_ok = (produced.flatten() == expected_top5).all()
+            probs_ok = np.isclose(produced_prob.flatten(), expected_top5_prob).all()
+            all_inds_ok = all_inds_ok and inds_ok
+            all_probs_ok = all_probs_ok and probs_ok
+            writer.writerow(
+                [
+                    str(target_id),
+                    str(expected_top5),
+                    str(expected_top5_prob),
+                    str(produced.flatten()),
+                    str(produced_prob.flatten()),
+                    str(inds_ok),
+                    str(probs_ok),
+                ]
+            )
+            csvfile.flush()
+            if ((not inds_ok) or (not probs_ok)) and debug_mode:
+                print("Results differ for %s" % img_path)
+                # check all tensors at debug markers
+                names_brevitas = set(dbg_hook.values.keys())
+                names_finn = set(odict.keys())
+                names_common = names_brevitas.intersection(names_finn)
+                for dbg_name in names_common:
+                    if not np.isclose(
+                        dbg_hook.values[dbg_name].detach().numpy(),
+                        odict[dbg_name],
+                        atol=1e-3,
+                    ).all():
+                        print("Tensor %s differs between Brevitas and FINN" % dbg_name)
+        assert all_inds_ok and all_probs_ok
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index b1d0422801c6ad49b413ea0a4c937ac0ac03c37d..0fe64d1a46403cf0276654bca0c14f461d1c163b 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -98,7 +98,7 @@ import subprocess
 from finn.util.gdrive import upload_to_end2end_dashboard
 from collections import OrderedDict
 
-build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+build_dir = os.environ["FINN_BUILD_DIR"]
 target_clk_ns = 10
 mem_mode = "decoupled"
 rtlsim_trace = False
@@ -135,6 +135,30 @@ def fold_tfc(model):
         (8, 8, "auto"),
         (10, 8, "distributed"),
     ]
+    for fcl, (pe, simd, ramstyle) in zip(fc_layers, config):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+        fcl_inst.set_nodeattr("runtime_writeable_weights", 1)
+    # set parallelism for the input quantizer to be the same as the first layer's SIMD
+    inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
+    inp_qnt = getCustomOp(inp_qnt_node)
+    inp_qnt.set_nodeattr("PE", 49)
+    inp_qnt.set_nodeattr("mem_mode", "decoupled")
+    inp_qnt.set_nodeattr("runtime_writeable_weights", 1)
+    return model
+
+
+def fold_lfc(model):
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # (PE, SIMD, ramstyle) for each layer
+    config = [
+        (32, 49, "block"),
+        (64, 32, "auto"),
+        (32, 64, "auto"),
+        (10, 8, "distributed"),
+    ]
     for fcl, (pe, simd, ramstyle) in zip(fc_layers, config):
         fcl_inst = getCustomOp(fcl)
         fcl_inst.set_nodeattr("PE", pe)
@@ -205,6 +229,8 @@ def fold_cnv_small(model):
 def get_folding_function(topology, wbits, abits):
     if "tfc" in topology:
         return fold_tfc
+    elif "lfc" in topology:
+        return fold_lfc
     elif "cnv" in topology:
         if wbits == 1 and abits == 1:
             return fold_cnv_large
@@ -281,11 +307,13 @@ def topology2dataset(topology):
 
 @pytest.mark.parametrize("wbits", [1, 2])
 @pytest.mark.parametrize("abits", [1, 2])
-@pytest.mark.parametrize("topology", ["tfc", "cnv"])
+@pytest.mark.parametrize("topology", ["lfc", "tfc", "cnv"])
 class TestEnd2End:
     def test_export(self, topology, wbits, abits):
         if wbits > abits:
             pytest.skip("No wbits > abits end2end network configs for now")
+        if topology == "lfc" and wbits > 1:
+            pytest.skip("Skipping non-existing lfc configs")
         (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
         chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
         bo.export_finn_onnx(model, ishape, chkpt_name)
@@ -362,6 +390,9 @@ class TestEnd2End:
     def test_convert_to_hls_layers(self, topology, wbits, abits):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "streamline")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        if topology == "tfc" and wbits == 1 and abits == 1:
+            # use standalone thresholds for tfc-w1a1 to also exercise that option
+            model = model.transform(to_hls.InferThresholdingLayer())
         # needed for bipolar MatMul layers
         model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
         # needed for non-bipolar MatMul layers
diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..c23749829a9d75c9a9519663a872aa1281bd46d3
--- /dev/null
+++ b/tests/end2end/test_end2end_mobilenet_v1.py
@@ -0,0 +1,434 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import time
+import pytest
+
+from PIL import Image
+import os
+import numpy as np
+import brevitas.onnx as bo
+import torch
+
+from finn.custom_op.registry import getCustomOp
+from finn.util.pytorch import NormalizePreProc
+from finn.util.test import (
+    get_test_model_trained,
+    load_test_checkpoint_or_skip,
+    resize_smaller_side,
+    crop_center,
+)
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    GiveUniqueParameterTensors,
+    RemoveUnusedTensors,
+)
+from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.transformation.insert_topk import InsertTopK
+import finn.transformation.streamline.absorb as absorb
+import finn.transformation.streamline.reorder as reorder
+from finn.transformation.streamline import Streamline
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+from finn.transformation.streamline.remove import RemoveIdentityOps
+from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul
+from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
+from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+from finn.core.onnx_exec import execute_onnx
+from finn.util.basic import alveo_part_map, alveo_default_platform
+from finn.util.config import extract_model_config_to_json
+from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy
+
+build_dir = os.environ["FINN_BUILD_DIR"]
+
+test_board = "U250"
+test_platform = alveo_default_platform[test_board]
+test_fpga_part = alveo_part_map[test_board]
+target_clk_ns = 3
+mem_mode = "decoupled"
+large_fifo_ram_style = "ultra"
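+# extra_fold > 1 divides the PE values chosen in the folding step to use fewer resources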
+extra_fold = 1
+first_layer_res_type = "dsp"
+
+
+def test_end2end_mobilenet_export():
+    # export preprocessing
+    preproc_onnx = build_dir + "/end2end_mobilenet_preproc.onnx"
+    mean = [0.485, 0.456, 0.406]
+    std = 0.226
+    ch = 3
+    preproc = NormalizePreProc(mean, std, ch)
+    bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx)
+    preproc_model = ModelWrapper(preproc_onnx)
+    # set input finn datatype to UINT8
+    preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType.UINT8)
+    preproc_model = preproc_model.transform(InferShapes())
+    preproc_model = preproc_model.transform(GiveUniqueNodeNames())
+    preproc_model = preproc_model.transform(GiveUniqueParameterTensors())
+    preproc_model = preproc_model.transform(GiveReadableTensorNames())
+    preproc_model.save(build_dir + "/end2end_mobilenet_preproc.onnx")
+
+    # export mobilenet
+    finn_onnx = build_dir + "/end2end_mobilenet_export.onnx"
+    mobilenet = get_test_model_trained("mobilenet", 4, 4)
+    bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx)
+
+    # calculate golden output with pytorch/brevitas and save as .npy
+    # get single image as input and prepare image
+    img = Image.open("/workspace/finn/tests/brevitas/king_charles.jpg")
+    # resize the smaller side of the image to 256 pixels, scaling the larger
+    # side by the same ratio
+    img = resize_smaller_side(256, img)
+    # crop central 224*224 window
+    img = crop_center(224, img)
+    # save the image as a numpy array and as a torch tensor to enable testing
+    # in brevitas/pytorch and finn; transpose from (H, W, C) to (C, H, W)
+    img_np = np.asarray(img).copy().astype(np.float32).transpose(2, 0, 1)
+    img_np = img_np.reshape(1, 3, 224, 224)
+    np.save(build_dir + "/end2end_mobilenet_input.npy", img_np)
+    img_torch = torch.from_numpy(img_np).float()
+    # do forward pass in PyTorch/Brevitas
+    input_tensor = preproc.forward(img_torch)
+    golden = mobilenet.forward(input_tensor).detach().numpy()
+    golden_topk = golden.flatten()
+    golden_top5 = np.argsort(golden_topk)[-5:]
+    golden_top5 = np.flip(golden_top5)
+    golden_top5_prob = []
+    for index in golden_top5:
+        golden_top5_prob.append(golden_topk[index])
+    # save golden output values
+    np.save(build_dir + "/end2end_mobilenet_golden_top5.npy", golden_top5)
+    np.save(build_dir + "/end2end_mobilenet_golden_top5_prob.npy", golden_top5_prob)
+    assert os.path.isfile(finn_onnx)
+    assert os.path.isfile(build_dir + "/end2end_mobilenet_preproc.onnx")
+
+
+def test_end2end_mobilenet_tidy_and_merge_with_preproc():
+    preproc_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_mobilenet_preproc.onnx"
+    )
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_export.onnx")
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(InsertTopK())
+    # get initializer from Mul that will be absorbed into topk
+    a0 = model.get_initializer(model.graph.node[-2].input[1])
+    np.save(build_dir + "/end2end_mobilenet_topk_scale.npy", a0)
+    model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveUniqueParameterTensors())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(MergeONNXModels(preproc_model))
+    model.save(build_dir + "/end2end_mobilenet_tidy.onnx")
+
+
+def test_end2end_mobilenet_streamline():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_tidy.onnx")
+    model = model.transform(Streamline())
+    additional_streamline_transformations = [
+        DoubleToSingleFloat(),
+        reorder.MoveMulPastDWConv(),
+        absorb.AbsorbMulIntoMultiThreshold(),
+        ChangeDataLayoutQuantAvgPool2d(),
+        InferDataLayouts(),
+        reorder.MoveTransposePastScalarMul(),
+        absorb.AbsorbTransposeIntoFlatten(),
+        reorder.MoveFlattenPastAffine(),
+        reorder.MoveFlattenPastTopK(),
+        reorder.MoveScalarMulPastMatMul(),
+        CollapseRepeatedMul(),
+        RemoveIdentityOps(),
+        RoundAndClipThresholds(),
+    ]
+    for trn in additional_streamline_transformations:
+        model = model.transform(trn)
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveReadableTensorNames())
+        model = model.transform(InferDataTypes())
+    model.save(build_dir + "/end2end_mobilenet_streamlined.onnx")
+
+
+def test_end2end_mobilenet_lowering():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_mobilenet_streamlined.onnx"
+    )
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+    model = model.transform(RoundAndClipThresholds())
+    model.save(build_dir + "/end2end_mobilenet_lowered.onnx")
+
+
+def test_end2end_mobilenet_convert_to_hls_layers():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx")
+    model = model.transform(to_hls.InferPool_Batch())
+    model = model.transform(to_hls.InferConvInpGen())
+    model = model.transform(to_hls.InferVVAU())
+    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferChannelwiseLinearLayer())
+    model = model.transform(to_hls.InferLabelSelectLayer())
+    model = model.transform(InferShapes())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(build_dir + "/end2end_mobilenet_hls_layers.onnx")
+
+
+def test_end2end_mobilenet_folding():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_mobilenet_hls_layers.onnx"
+    )
+    # optional extra folding to use fewer resources
+    # applied while setting the attributes on each node
+    assert extra_fold in [1, 2, 4]
+    # set up folding for the convolution and FC layers implemented as
+    # StreamingFCLayer_Batch nodes (the depthwise convs are handled below)
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # each tuple is (PE, SIMD, ram_style) for a layer
+    folding = [
+        (32, 3, "block"),
+        (16, 16, "block"),
+        (16, 16, "block"),
+        (32, 16, "block"),
+        (16, 16, "block"),
+        (32, 16, "block"),
+        (16, 16, "block"),
+        (32, 16, "block"),
+        (32, 16, "block"),
+        (32, 16, "block"),
+        (32, 16, "block"),
+        (32, 16, "block"),
+        (16, 16, "block"),
+        (32, 16, "block"),
+        (4, 4, "block"),
+    ]
+    for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe // extra_fold)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+    # first layer uses 8-bit weights & activations
+    # control its compute resource type explicitly
+    getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type)
+    # set up folding for the depthwise conv layers impl'd by VVAUs
+    # each value is PE for a layer
+    vvau_layers = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")
+    folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8]
+    for vvau, pe in zip(vvau_layers, folding):
+        vvau_inst = getCustomOp(vvau)
+        vvau_inst.set_nodeattr("PE", pe // extra_fold)
+        # set SIMD in the preceding ConvInputGen to the same value
+        convinputgen = model.find_direct_predecessors(vvau)[0]
+        convinputgen_inst = getCustomOp(convinputgen)
+        convinputgen_inst.set_nodeattr("SIMD", pe // extra_fold)
+        # set SIMD in the preceding FMPadding to the same value
+        padding = model.find_direct_predecessors(convinputgen)[0]
+        if padding.op_type == "FMPadding_Batch":
+            padding_inst = getCustomOp(padding)
+            padding_inst.set_nodeattr("SIMD", pe // extra_fold)
+    # adjust final pooling layer + its inpgen
+    pool_node = model.get_nodes_by_op_type("Pool_Batch")[0]
+    pool_inst = getCustomOp(pool_node)
+    pool_inst.set_nodeattr("PE", 4 // extra_fold)
+    pool_inpgen = model.find_direct_predecessors(pool_node)[0]
+    pool_inpgen_inst = getCustomOp(pool_inpgen)
+    pool_inpgen_inst.set_nodeattr("SIMD", 4 // extra_fold)
+    model = model.transform(InferDataLayouts())
+    model.save(build_dir + "/end2end_mobilenet_folded.onnx")
+
+
+def test_end2end_mobilenet_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_mobilenet_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+    dataflow_model = dataflow_model.transform(RemoveUnusedTensors())
+    dataflow_model.save(build_dir + "/end2end_mobilenet_dataflow_model.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.xfail
+def test_end2end_mobilenet_cppsim():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
+    x = np.load(build_dir + "/end2end_mobilenet_input.npy")
+    inp_name = model.graph.input[0].name
+    out_name = model.graph.output[0].name
+    inp_dict = {inp_name: x}
+    start = time.time()
+    # cppsim
+    model = model.transform(PrepareCppSim())
+    model = model.transform(CompileCppSim())
+    model = model.transform(SetExecMode("cppsim"))
+    end = time.time()
+    elapsed_time = end - start
+    f = open(build_dir + "/end2end_mobilenet_compile_time.txt", "w+")
+    f.write("Execution time in seconds: " + str(elapsed_time))
+    f.close()
+    model.save(build_dir + "/end2end_mobilenet_cppsim.onnx")
+    ret_cppsim = execute_onnx(model, inp_dict, True)
+    res_cppsim = ret_cppsim[out_name]
+    np.save(build_dir + "/end2end_mobilenet_result_cppsim.npy", res_cppsim)
+    a0 = np.load(build_dir + "/end2end_mobilenet_topk_scale.npy")
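+    # re-apply the saved TopK scale (a0) so the scores are comparable to the golden values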
+    res_cppsim_prob = ret_cppsim[model.graph.node[-2].output[0]] * a0
+    np.save(build_dir + "/end2end_mobilenet_result_cppsim_prob.npy", res_cppsim_prob)
+
+    # check result with golden values
+    golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy")
+    golden_prob = np.load(build_dir + "/end2end_mobilenet_golden_top5_prob.npy")
+
+    assert (golden == res_cppsim).all()
+    assert np.isclose(golden_prob, res_cppsim_prob).all()
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_mobilenet_gen_hls_ip():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_mobilenet_dataflow_model.onnx"
+    )
+    start = time.time()
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(ReplaceVerilogRelPaths())
+    end = time.time()
+    elapsed_time = end - start
+    f = open(build_dir + "/end2end_mobilenet_ipgen_time.txt", "w+")
+    f.write("Execution time in seconds: " + str(elapsed_time))
+    f.close()
+
+    model = model.transform(AnnotateResources("hls"))
+    model.save(build_dir + "/end2end_mobilenet_ipgen.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.xfail
+def test_end2end_mobilenet_rtlsim():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_ipgen.onnx")
+    x = np.load(build_dir + "/end2end_mobilenet_input.npy")
+    inp_name = model.graph.input[0].name
+    out_name = model.graph.output[0].name
+    inp_dict = {inp_name: x}
+    # node-by-node rtlsim
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareRTLSim())
+    model.save(build_dir + "/end2end_mobilenet_ipgen_nodebynode_rtlsim.onnx")
+    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
+    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
+    np.save(
+        build_dir + "/end2end_mobilenet_result_rtlsim_nodebynode.npy",
+        res_rtlsim_nodebynode,
+    )
+    a0 = np.load(build_dir + "/end2end_mobilenet_topk_scale.npy")
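+    # re-apply the saved TopK scale (a0) to the raw TopK values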
+    res_rtlsim_nodebynode_prob = (
+        ret_rtlsim_nodebynode[model.graph.node[-2].output[0]] * a0
+    )
+    np.save(
+        build_dir + "/end2end_mobilenet_result_rtlsim_nodebynode_prob.npy",
+        res_rtlsim_nodebynode_prob,
+    )
+
+    # check result with golden values
+    golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy")
+    golden_prob = np.load(build_dir + "/end2end_mobilenet_golden_top5_prob.npy")
+
+    assert (golden == res_rtlsim_nodebynode).all()
+    assert np.isclose(golden_prob, res_rtlsim_nodebynode_prob).all()
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_mobilenet_set_fifo_depths():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_ipgen.onnx")
+    start = time.time()
+    model = model.transform(
+        InsertAndSetFIFODepths(
+            test_fpga_part, target_clk_ns, vivado_ram_style=large_fifo_ram_style
+        )
+    )
+    end = time.time()
+    elapsed_time = end - start
+    f = open(build_dir + "/end2end_mobilenet_fifoset_time.txt", "w+")
+    f.write("Execution time in seconds: " + str(elapsed_time))
+    f.close()
+    extract_model_config_to_json(
+        model,
+        build_dir + "/end2end_mobilenet_folded_and_fifo_config.json",
+        ["PE", "SIMD", "impl_style", "ram_style", "depth"],
+    )
+    model.save(build_dir + "/end2end_mobilenet_fifodepth.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vitis
+def test_end2end_mobilenet_build():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_mobilenet_fifodepth.onnx"
+    )
+    model = model.transform(
+        VitisBuild(
+            test_fpga_part,
+            target_clk_ns,
+            test_platform,
+            strategy=VitisOptStrategy.PERFORMANCE_BEST,
+        )
+    )
+    model.save(build_dir + "/end2end_mobilenet_build.onnx")
+    model = model.transform(AnnotateResources("synth"))
+    model.save(build_dir + "/end2end_mobilenet_final.onnx")
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index 1f1c9936139df4160bd08a0e168d1f4b7e639077..a603fc0664b78c00354514fbdff62c94aa7b7ef3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -41,7 +41,7 @@ from finn.util.basic import gen_finn_dt_tensor
 import finn.core.onnx_exec as oxe
 
 
-build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+build_dir = os.environ["FINN_BUILD_DIR"]
 test_fpga_part = "xc7z020clg400-1"
 target_clk_ns = 10
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 306844c7ef3828d8483d3b0006491864f1525e21..23d7610dfdf434602f326e1117b072f312962295 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -65,7 +65,7 @@ from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
 
-ip_stitch_model_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+ip_stitch_model_dir = os.environ["FINN_BUILD_DIR"]
 
 
 def create_one_fc_model(mem_mode="const"):
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 06ebd90000e7466b2781d3284c5a0a0e56733dea..9def746c1c872a8b99b5bab48e8d0bd20798cedd 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -99,6 +99,7 @@ def test_res_estimate():
             "BRAM_efficiency": 1,
             "LUT": 357,
             "DSP": 0,
+            "URAM_efficiency": 1,
             "URAM": 0,
         }
     }
@@ -111,8 +112,22 @@ def test_res_estimate():
     prod_resource_estimation = model.analysis(res_estimation_complete)
     expect_resource_estimation = {
         "StreamingFCLayer_Batch_0": [
-            {"BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 352, "DSP": 1, "URAM": 0},
-            {"BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 357, "DSP": 0, "URAM": 0},
+            {
+                "BRAM_18K": 0,
+                "BRAM_efficiency": 1,
+                "LUT": 352,
+                "DSP": 1,
+                "URAM": 0,
+                "URAM_efficiency": 1,
+            },
+            {
+                "BRAM_18K": 0,
+                "BRAM_efficiency": 1,
+                "LUT": 357,
+                "DSP": 0,
+                "URAM": 0,
+                "URAM_efficiency": 1,
+            },
         ]
     }
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 5d46f4c3db35c159458dfc9e0eb8aae8ee89cb20..bbc7e8227d80fb9d064f484dafe91ecdcdc47144 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -122,6 +122,10 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     odt = act
     n_steps = act.get_num_possible_values() - 1
     T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
+    # trigger the known vivado_hls threshold bug (incorrect rtlsim result when
+    # the first threshold of the first channel is zero and the output is BIPOLAR)
+    if act == DataType.BIPOLAR:
+        T[0][0] = 0
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)
 
diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe3a1db8a476e33bfc0d76996917fab9ae6ed98b
--- /dev/null
+++ b/tests/fpgadataflow/test_set_folding.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.custom_op.registry import getCustomOp
+from finn.core.datatype import DataType
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.set_folding import SetFolding
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.util.test import load_test_checkpoint_or_skip
+
+
+def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes):
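+    # builds a chain of nnodes StreamingFCLayer_Batch nodes, each ch wide,
+    # with random weights (wdt) and random thresholds (tdt)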
+
+    W = np.random.randint(wdt.min(), wdt.max() + 1, size=(ch, ch))
+    W = W.astype(np.float32)
+
+    T = np.random.randint(tdt.min(), tdt.max() + 1, size=(ch, 2 ** adt.bitwidth() - 1))
+    T = T.astype(np.float32)
+
+    tensors = []
+    tensors.append(helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ch]))
+    for i in range(1, nnodes):
+        inter = helper.make_tensor_value_info(
+            "inter_" + str(i), TensorProto.FLOAT, [1, ch]
+        )
+        tensors.append(inter)
+    tensors.append(helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]))
+
+    FCLayer_nodes = []
+    for i in range(nnodes):
+        pe = 1
+        simd = 1
+        FCLayer_nodes += [
+            helper.make_node(
+                "StreamingFCLayer_Batch",
+                [tensors[i].name, "weights_" + str(i), "thresh_" + str(i)],
+                [tensors[i + 1].name],
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+                MW=ch,
+                MH=ch,
+                SIMD=simd,
+                PE=pe,
+                inputDataType=adt.name,
+                weightDataType=wdt.name,
+                outputDataType=adt.name,
+                ActVal=0,
+                binaryXnorMode=0,
+                noActivation=0,
+            )
+        ]
+
+    graph = helper.make_graph(
+        nodes=FCLayer_nodes,
+        name="fclayer_graph",
+        inputs=[tensors[0]],
+        outputs=[tensors[-1]],
+    )
+
+    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", adt)
+    model.set_tensor_datatype("outp", adt)
+
+    for i in range(1, nnodes + 1):
+        model.graph.value_info.append(tensors[i])
+        model.set_initializer("weights_" + str(i - 1), W)
+        model.set_initializer("thresh_" + str(i - 1), T)
+        model.set_tensor_datatype("weights_" + str(i - 1), wdt)
+        model.set_tensor_datatype("thresh_" + str(i - 1), tdt)
+
+    return model
+
+
+# desired frames per second
+@pytest.mark.parametrize("target_fps", [30, 10 ** 5, 10 ** 7])
+# target chip or board
+@pytest.mark.parametrize("platform", ["Pynq-Z1", "Ultra96", "U200"])
+def test_set_folding(target_fps, platform):
+
+    model = make_multi_fclayer_model(
+        128, DataType.INT4, DataType.INT2, DataType.INT16, 5
+    )
+
+    model = model.transform(GiveUniqueNodeNames())
+    parent_model = model.transform(CreateDataflowPartition())
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+
+    clk_ns = 5
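+    # convert the target fps into a cycles-per-frame budget at the given clock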
+    target_cycles_per_frame = int((10 ** 9 / clk_ns) / target_fps)
+    dataflow_model = dataflow_model.transform(SetFolding(target_cycles_per_frame))
+
+    exp_cycles_dict = dataflow_model.analysis(exp_cycles_per_layer)
+    achieved_cycles_per_frame = max(exp_cycles_dict.values())
+
+    min_cycles = dict()
+    min_cycles["Pynq-Z1"] = 128
+    min_cycles["Ultra96"] = 64
+    min_cycles["U200"] = 1
+
+    assert achieved_cycles_per_frame <= max(
+        min_cycles[platform], target_cycles_per_frame
+    ), "Folding target not met"
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8e886ddb047e932e5f03ce7d460d538c95a25f2
--- /dev/null
+++ b/tests/util/test_build_dataflow.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import pkg_resources as pk
+from shutil import copytree
+from finn.util.basic import make_build_dir
+from finn.builder.build_dataflow import build_dataflow_directory
+import os
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_build_dataflow_directory():
+    test_dir = make_build_dir("test_build_dataflow_directory_")
+    target_dir = test_dir + "/build_dataflow"
+    example_data_dir = pk.resource_filename("finn.qnn-data", "build_dataflow/")
+    copytree(example_data_dir, target_dir)
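+    # the example config in finn.qnn-data targets tfc-w1a1 on Pynq-Z1 with verification enabled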
+    build_dataflow_directory(target_dir)
+    # check the generated files
+    output_dir = target_dir + "/output_tfc_w1a1_Pynq-Z1"
+    assert os.path.isfile(output_dir + "/time_per_step.json")
+    assert os.path.isfile(output_dir + "/final_hw_config.json")
+    assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml")
+    assert os.path.isfile(output_dir + "/driver/driver.py")
+    assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")
+    assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json")
+    assert os.path.isfile(
+        output_dir + "/report/estimate_layer_config_alternatives.json"
+    )
+    assert os.path.isfile(output_dir + "/report/estimate_network_performance.json")
+    assert os.path.isfile(output_dir + "/report/ooc_synth_and_timing.json")
+    assert os.path.isfile(output_dir + "/report/rtlsim_performance.json")
+    assert os.path.isfile(output_dir + "/bitfile/finn-accel.bit")
+    assert os.path.isfile(output_dir + "/bitfile/finn-accel.hwh")
+    assert os.path.isfile(output_dir + "/report/post_synth_resources.xml")
+    assert os.path.isfile(output_dir + "/report/post_route_timing.rpt")
+    # verification outputs
+    verify_out_dir = output_dir + "/verification_output"
+    assert os.path.isfile(verify_out_dir + "/verify_initial_python_SUCCESS.npy")
+    assert os.path.isfile(verify_out_dir + "/verify_streamlined_python_SUCCESS.npy")
+    assert os.path.isfile(verify_out_dir + "/verify_folded_hls_cppsim_SUCCESS.npy")
+    assert os.path.isfile(verify_out_dir + "/verify_stitched_ip_rtlsim_SUCCESS.npy")