diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index cd59a629405c748187cdf478c0bdb0694c58c79f..0233a81ba06dc701a3a4579b9a5bd3ce17e47d04 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -5,7 +5,7 @@ on:
     branches: [ dev ]
   push:
     branches: [ dev ]
-  
+
 
 jobs:
 
@@ -18,4 +18,6 @@ jobs:
         uses: actions/checkout@v2
 
       - name: DockerRunQuicktest
-        run: sh run-docker.sh quicktest
+        run: |
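+          # build the CI image, then run the quicktest suite inside it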
+          docker build -t finn_gha -f docker/Dockerfile.finn_ci --build-arg BUILD_PATH=/tmp/finn_gha .
+          docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
diff --git a/.gitignore b/.gitignore
index f838c1695130d232ac6a2b888aed0cea31aafaa7..d7ee7e014a0c175a8a88060f2aa320efeb501ddc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,3 +78,9 @@ MANIFEST
 
 # Jenkins cfg dir
 /docker/jenkins_home
+
+# SSH key dir mounted into Docker
+/ssh_keys/
+
+# PYNQ board files
+/board_files/
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 2668927602ebb8de5fdc3d7c25b20a0c8c4a2e55..0d122133a6446cb77160c9447e16ff13d4d4b9c5 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -30,15 +30,14 @@ FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
 MAINTAINER Yaman Umuroglu <yamanu@xilinx.com>
 ARG PYTHON_VERSION=3.6
 ARG BUILD_PATH
-ARG FINN_CI_BRANCH
 
 WORKDIR /workspace
 
 RUN apt-get update
 RUN apt-get -y upgrade
 RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
-RUN apt install verilator
-RUN apt-get -y install sshpass
+RUN apt-get install -y verilator zsh
+RUN apt-get -y install sshpass wget unzip
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 
 # cloning dependency repos
@@ -52,18 +51,23 @@ RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
 RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
+# oh-my-xilinx
+RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
 
-# checkout desired FINN branch for testing
-RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace/finn
-
-RUN pip install -r /workspace/finn/requirements.txt
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+RUN rm requirements.txt
 RUN apt update; apt install nano
 RUN pip install pytest-dependency
+RUN pip install pytest-xdist
+RUN pip install pytest-parallel
 
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
 ENV VIVADO_IP_CACHE "$BUILD_PATH/vivado_ip_cache"
+ENV PATH "${PATH}:/workspace/oh-my-xilinx"
+ENV OHMYXILINX "/workspace/oh-my-xilinx"
 
 # colorful terminal output
 RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /root/.bashrc
@@ -72,8 +76,8 @@ RUN mkdir -p $VIVADO_IP_CACHE
 
 WORKDIR /workspace/finn
 
-COPY finn_entrypoint.sh /usr/local/bin/
-COPY quicktest.sh /usr/local/bin/
+COPY docker/finn_entrypoint.sh /usr/local/bin/
+COPY docker/quicktest.sh /usr/local/bin/
 RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
 RUN chmod 755 /usr/local/bin/quicktest.sh
 ENTRYPOINT ["finn_entrypoint.sh"]
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 1200c7d5d15bbd62e15f19f84e70d5fe0b8aca28..f8e15f34fb4da3dc4ee353a29d26866b68879144 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -37,28 +37,25 @@ ARG PASSWD
 ARG JUPYTER_PORT
 ARG NETRON_PORT
 
-EXPOSE $JUPYTER_PORT
-EXPOSE $NETRON_PORT
-
 WORKDIR /workspace
 
 RUN apt-get update
 RUN apt-get -y upgrade
 RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
-RUN apt-get install verilator
-RUN apt-get install nano
-RUN apt-get -y install sshpass
+RUN apt-get install -y verilator nano zsh rsync
+RUN apt-get -y install sshpass wget unzip
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 RUN rm requirements.txt
 RUN pip install jupyter
-RUN pip install netron
 RUN pip install matplotlib
 RUN pip install pytest-dependency
 RUN pip install sphinx
 RUN pip install sphinx_rtd_theme
+RUN pip install pytest-xdist
+RUN pip install pytest-parallel
 
 # switch user
 RUN groupadd -g $GID $GNAME
@@ -81,12 +78,29 @@ RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
 RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
+# oh-my-xilinx
+RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
+# netron
+RUN git clone https://github.com/lutzroeder/netron.git /workspace/netron
+
+# build and install netron
+USER root
+RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
+RUN apt-get install -y nodejs
+WORKDIR /workspace/netron
+RUN git checkout 376e9d33733a3eacfe3c432808fd46e6cd1460cb
+RUN npm install
+RUN python setup.py build
+RUN pip install /workspace/netron
+USER $UNAME
 
 # for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host
 # at /workspace/finn -- see run-docker.sh for an example of how to do this.
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
+ENV PATH "${PATH}:/workspace/oh-my-xilinx:/home/$UNAME/.local/bin"
+ENV OHMYXILINX "/workspace/oh-my-xilinx"
 
 WORKDIR /home/$UNAME/finn
 RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /home/$UNAME/.bashrc
@@ -100,5 +114,8 @@ RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
 RUN chmod 755 /usr/local/bin/quicktest.sh
 USER $UNAME
 
+EXPOSE $JUPYTER_PORT
+EXPOSE $NETRON_PORT
+
 ENTRYPOINT ["finn_entrypoint.sh"]
 CMD ["bash"]
diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile
index 80be261fb3da057186259598f84d915176577a5d..b2d3102bd4aa3c00620f41c102af5a8b385cede7 100644
--- a/docker/Jenkinsfile
+++ b/docker/Jenkinsfile
@@ -9,12 +9,19 @@ pipeline {
         string(name: 'PYNQ_PASSWORD', defaultValue: 'xilinx', description: 'PYNQ board password')
         string(name: 'PYNQ_TARGET_DIR', defaultValue: '/home/xilinx/finn', description: 'PYNQ board target deployment directory')
         string(name: 'NUM_DEFAULT_WORKERS', defaultValue: '1', description: 'Number of cores for parallel transformations')
-        string(name: 'DOCKER_CMD', defaultValue: """python setup.py test""", description: 'Command to run')
+        // main test: everything except rtlsim and end2end tests, parallel run with xdist, no parallel transformations to save on memory
+        string(name: 'DOCKER_CMD_MAIN', defaultValue: """python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n auto" """, description: 'Main test command')
+        // rtlsim tests: parallel run with pytest-parallel, no parallel transformations to save on memory
+        string(name: 'DOCKER_CMD_RTLSIM', defaultValue: """python setup.py test --addopts "-k rtlsim --workers auto" """, description: 'rtlsim test command')
+        // end2end tests: no parallel testing, use NUM_DEFAULT_WORKERS for parallel transformations
+        string(name: 'DOCKER_CMD_END2END', defaultValue: """python setup.py test --addopts "-k end2end" """, description: 'end2end test command')
+        // allow specifying where to mount the cloned folder from, since Jenkins and FINN may be running in separate containers
+        string(name: 'WORKSPACE_MOUNT', defaultValue: '/var/jenkins_home/workspace/finn', description: 'Path to Jenkins workspace mount')
     }
     environment {
         DOCKER_TAG='finn_ci:$BUILD_ID'
-        DOCKER_INST_NAME='finn_ci_$BUILD_ID'
-        BUILD_PATH='/tmp/finn_ci_$BUILD_ID'
+        DOCKER_INST_NAME='finn_ci'
+        BUILD_PATH='/tmp/finn_ci'
     }
     stages {
         stage("Clone") {
@@ -27,16 +34,57 @@ pipeline {
                 sh """
                 docker build -t $DOCKER_TAG -f docker/Dockerfile.finn_ci \
                 --build-arg BUILD_PATH=$BUILD_PATH \
-                --build-arg FINN_CI_BRANCH=${params.FINN_CI_BRANCH} \
-                docker/
+                .
                 """
             }
         }
-        stage('Test') {
+        stage('test-main') {
             steps {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                 sh """
-                docker run --name $DOCKER_INST_NAME --init \
+                docker run --init \
                 --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
+                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -e NUM_DEFAULT_WORKERS=1 \
+                -e FINN_INST_NAME=$DOCKER_INST_NAME \
+                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e PYNQ_BOARD=${params.PYNQ_BOARD} \
+                -e PYNQ_IP=${params.PYNQ_IP} \
+                -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
+                -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
+                -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
+                $DOCKER_TAG ${params.DOCKER_CMD_MAIN}
+                """}
+            }
+        }
+        stage('test-rtlsim') {
+            steps {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
+                sh """
+                docker run --init \
+                --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
+                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -e NUM_DEFAULT_WORKERS=1 \
+                -e FINN_INST_NAME=$DOCKER_INST_NAME \
+                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e PYNQ_BOARD=${params.PYNQ_BOARD} \
+                -e PYNQ_IP=${params.PYNQ_IP} \
+                -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
+                -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
+                -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
+                $DOCKER_TAG ${params.DOCKER_CMD_RTLSIM}
+                """}
+            }
+        }
+        stage('test-end2end') {
+            steps {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
+                sh """
+                docker run --init \
+                --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
@@ -46,8 +94,8 @@ pipeline {
                 -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
                 -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
                 -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
-                $DOCKER_TAG ${params.DOCKER_CMD}
-                """
+                $DOCKER_TAG ${params.DOCKER_CMD_END2END}
+                """ }
             }
         }
     }
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index feddc0db38db7d6ef5f73775cd6adbd0fe540f1e..84b3b6ea2983732890021517d9dcd2e0cf22fb4b 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-export XILINX_VIVADO=$VIVADO_PATH
 export SHELL=/bin/bash
 export FINN_ROOT=/workspace/finn
 
@@ -13,11 +12,12 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a
+BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716
+HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
 PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=bf281fc3a44eca29efbcbefd63f1196d82c7c255
+OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
 
 gecho "Setting up known-good commit versions for FINN dependencies"
 # Brevitas
@@ -41,8 +41,34 @@ git -C /workspace/pyverilator checkout $PYVERILATOR_COMMIT --quiet
 gecho "PYNQ shell @ $PYNQSHELL_COMMIT"
 git -C /workspace/PYNQ-HelloWorld pull --quiet
 git -C /workspace/PYNQ-HelloWorld checkout $PYNQSHELL_COMMIT --quiet
+# oh-my-xilinx
+gecho "oh-my-xilinx @ $OMX_COMMIT"
+git -C /workspace/oh-my-xilinx pull --quiet
+git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT --quiet
 
-# source Vivado env.vars
-source $VIVADO_PATH/settings64.sh
+if [ ! -z "$VIVADO_PATH" ];then
+  # source Vivado env.vars
+  export XILINX_VIVADO=$VIVADO_PATH
+  source $VIVADO_PATH/settings64.sh
+fi
+if [ ! -z "$VITIS_PATH" ];then
+  # source Vitis env.vars
+  export XILINX_VITIS=$VITIS_PATH
+  source $VITIS_PATH/settings64.sh
+fi
+
+# download PYNQ board files if not already there
+if [ ! -d "/workspace/finn/board_files" ]; then
+    gecho "Downloading PYNQ board files for Vivado"
+    wget -q https://github.com/cathalmccabe/pynq-z1_board_files/raw/master/pynq-z1.zip
+    wget -q https://d2m32eurp10079.cloudfront.net/Download/pynq-z2.zip
+    unzip -q pynq-z1.zip
+    unzip -q pynq-z2.zip
+    mkdir /workspace/finn/board_files
+    mv pynq-z1/ /workspace/finn/board_files/
+    mv pynq-z2/ /workspace/finn/board_files/
+    rm pynq-z1.zip
+    rm pynq-z2.zip
+fi
 
 exec "$@"
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
index 4f6a2d3e230de9fcbb947d794722294880a7730d..02e014cd3cc7bb88eebd02f03ff599913079152b 100755
--- a/docker/quicktest.sh
+++ b/docker/quicktest.sh
@@ -1,4 +1,21 @@
 #!/bin/bash
 
+: ${PYTEST_PARALLEL=auto}
+
 cd $FINN_ROOT
-python setup.py test --addopts "-m 'not (vivado or slow)'"
+# check if command line argument is empty or not present
+if [ -z "$1" ]; then
+  echo "Running quicktest: not (vivado or slow) with pytest-xdist"
+  python setup.py test --addopts "-m 'not (vivado or slow or vitis)' --dist=loadfile -n $PYTEST_PARALLEL"
+elif [ "$1" = "main" ]; then
+  echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist"
+  python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL"
+elif [ "$1" = "rtlsim" ]; then
+  echo "Running rtlsim test suite with pytest-parallel"
+  python setup.py test --addopts "-k rtlsim --workers $PYTEST_PARALLEL"
+elif [ "$1" = "end2end" ]; then
+  echo "Running end2end test suite with no parallelism"
+  python setup.py test --addopts "-k end2end"
+else
+  echo "Unrecognized argument to quicktest.sh"
+  exit 1
+fi
diff --git a/docs/finn-sheduling-and-folding.pptx b/docs/finn-sheduling-and-folding.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..30bbe4d55b1cda9df25a791227983dc7cb750e58
Binary files /dev/null and b/docs/finn-sheduling-and-folding.pptx differ
diff --git a/docs/finn/example_networks.rst b/docs/finn/example_networks.rst
index 9f221871f09bf655db9d81988d6fa83e53473634..86bb2bd11fd805a23a3bdf6da8a8ed686259ecc1 100644
--- a/docs/finn/example_networks.rst
+++ b/docs/finn/example_networks.rst
@@ -20,17 +20,17 @@ version, this is indicated by an x mark in the table.
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
 | Export/Import         | x          | x        | x        | x        | x        |    x     |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Streamlining          | x          | x        | x        | x        | x        |          |          |
+| Streamlining          | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Convert to HLS layers | x          | x        | x        | x        | x        |          |          |
+| Convert to HLS layers | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Stitched IP           | x          | x        | x        | x        | x        |          |          |
+| Stitched IP           | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Hardware test         | x          | x        | x        |          | x        |          |          |
+| Hardware test         | x          | x        | x        |          | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| cppsim                | x          | x        | x        | x        | x        |          |          |
+| cppsim                | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| rtlsim node-by-node   | x          | x        | x        | x        | x        |          |          |
+| rtlsim node-by-node   | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| rtlsim stitched IP    | x          | x        | x        | x        | x        |          |          |
+| rtlsim stitched IP    | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 87495364808e843f8dda2981268d0340626758ee..8a20dad0e47b9458989039184cfa0e5d01d48aa2 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -13,11 +13,12 @@ The FINN compiler should not be thought of a single pushbutton tool that does ev
 Requirements
 ============
 
-* Ubuntu 18.04
+* Ubuntu 18.04 with `bash` installed
 * Docker
 * A working Vivado 2019.1 installation
 * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
 * (optional) A PYNQ board with a network connection
+
+   * the ``bitstring`` package must be installed on the PYNQ board: ``sudo pip3 install bitstring``
 
 Running FINN in Docker
 ======================
@@ -25,11 +26,14 @@ We use Docker extensively for developing and deploying FINN. If you are not fami
 
 Getting an interactive shell for development or experimentation
 ***************************************************************
+.. note:: **run-docker.sh requires bash to execute correctly.**
+
 ::
 
-  sh run_docker.sh
+  ./run-docker.sh
 
 Simply running ./run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container and give you a terminal you can use for development and experimentation.
+If you want a new terminal on an already-running container, you can do this with ``docker exec -it finn_dev_<username> bash``.
 
 .. warning:: The Docker container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the /workspace/finn folder (which is mounted from the host computer) or otherwise backed up.
 
@@ -39,7 +43,7 @@ Running the Jupyter notebooks
 *****************************
 ::
 
-  sh run-docker.sh notebook
+  ./run-docker.sh notebook
 
 This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones.
 .. note:: The link will look something like this (the token you get will be different):
@@ -55,14 +59,14 @@ by:
 
 ::
 
-  sh run-docker.sh test
+  ./run-docker.sh test
 
 There is a quicker variant of the test suite that skips the tests marked as
 requiring Vivado or as slow-running tests:
 
 ::
 
-  sh run-docker.sh quicktest
+  ./run-docker.sh quicktest
 
 If you want to run individual tests, you can do this *inside the Docker container
 from the FINN root directory* as follows:
@@ -71,8 +75,12 @@ from the FINN root directory* as follows:
 
   python setup.py test --addopts "-k test_end2end_tfc_w1a2"
 
-Please see the pytest documentation for more about picking tests by marks or
-by name.
+Finally, if you want to run tests in parallel (e.g. to take advantage of a multi-core CPU)
+you can use:
+
+ * pytest-parallel for any rtlsim tests, e.g. ``python setup.py test --addopts "-k rtlsim --workers auto"``
+ * pytest-xdist for everything else; make sure to add ``--dist=loadfile`` if tests in the same file depend on each other, e.g. ``python setup.py test --addopts "-k mytest -n auto --dist=loadfile"``
+
+Please see the pytest documentation for more about picking tests by marks or by name.
 
 Environment variables
 **********************
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index 7a4bc687eeb827320991f7d3f1ef8cc35e97f3da..dee62f09a9253380e05300dac8fa34915c20dab5 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -16,6 +16,10 @@ Custom Quantization Annotations
 
 ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`finn.core.datatype.DataType`) information. However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit.
 
+Note that FINN uses floating point tensors as a carrier data type to represent integers. Floating point arithmetic can introduce rounding errors, e.g. (int_num * float_scale) / float_scale is not always equal to int_num.
+When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors. See (:py:mod:`finn.util.basic.sanitize_quant_values`) for more information.
+This behavior can be disabled (not recommended!) by setting the environment variable SANITIZE_QUANT_TENSORS=0.
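+
+As a minimal sketch of the issue and the sanitization idea (illustrative only,
+not the FINN implementation; names are placeholders)::
+
+    import numpy as np
+
+    scale = np.float32(0.1)
+    int_val = np.float32(7.0)
+    # the roundtrip through a float scale may deviate slightly from 7.0
+    roundtrip = (int_val * scale) / scale
+    # sanitizing snaps the value back onto the annotated integer grid
+    sanitized = np.round(roundtrip)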
+
 Custom Operations/Nodes
 =======================
 
diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst
index 391c6f999312839daca0d4161336c7c0ae822f89..c52c0840aa40566d930164490b1fd249d7c07757 100644
--- a/docs/finn/verification.rst
+++ b/docs/finn/verification.rst
@@ -28,4 +28,15 @@ This simulation can be used for a model containing several HLS custom operations
 Emulation using PyVerilator
 ===========================
 
-The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole design. For that purpose PyVerilator gets the generated verilog files.
+Emulation using PyVerilator can be used once IP blocks have been generated, either node by node or for a whole (IP-stitched) design. For this purpose, PyVerilator is given the generated Verilog files.
+
+For debugging purposes, it's possible to generate .vcd trace files that show the values of external and internal signals as the emulation is running. To enable this:
+
+ - for node-by-node rtlsim, set the ``rtlsim_trace`` attribute of each node of interest to either a filename for the .vcd or ``default`` to use the node name as the filename.
+ - for IP-stitched rtlsim, set the ``rtlsim_trace`` metadata_prop for the graph as per above.
+
+To control the tracing depth in the module hierarchy, use the ``RTLSIM_TRACE_DEPTH`` environment variable (default is 1):
+
+ - level 1 shows top-level input/output streams
+ - level 2 shows per-layer input/output streams
+ - level 3 shows full per-layer I/O including FIFO count signals
+
+Note that deeper tracing will take longer to execute and may produce very large .vcd files.
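+
+For example, a minimal sketch (assuming ``model`` is a ModelWrapper that has
+already been prepared for rtlsim)::
+
+    # whole-design (IP-stitched) rtlsim: trace to a named .vcd file
+    model.set_metadata_prop("rtlsim_trace", "whole_design.vcd")
+
+    # node-by-node rtlsim: use the node name as the trace filename
+    from finn.custom_op.registry import getCustomOp
+    getCustomOp(model.graph.node[0]).set_nodeattr("rtlsim_trace", "default")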
diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml
index 6b728c0555a4889b8e76d5759233d1109a3002bd..7910a8284dad3674b8665136506a60c498e0547f 100644
--- a/finn-rtllib/memstream/component.xml
+++ b/finn-rtllib/memstream/component.xml
@@ -1051,6 +1051,7 @@
         <xilinx:family xilinx:lifeCycle="Beta">azynq</xilinx:family>
         <xilinx:family xilinx:lifeCycle="Beta">zynquplus</xilinx:family>
         <xilinx:family xilinx:lifeCycle="Production">virtexuplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family>
       </xilinx:supportedFamilies>
       <xilinx:taxonomies>
         <xilinx:taxonomy>/UserIP</xilinx:taxonomy>
diff --git a/notebooks/end2end_example/cnv_end2end_example.ipynb b/notebooks/end2end_example/cnv_end2end_example.ipynb
index ce8c9decf4aaa6b7be2e556b6053abf380d0d373..74efa67d16616f64b21d84a8ef328ceaf2f3ce09 100644
--- a/notebooks/end2end_example/cnv_end2end_example.ipynb
+++ b/notebooks/end2end_example/cnv_end2end_example.ipynb
@@ -574,7 +574,7 @@
     "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn\")\n",
     "\n",
     "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")\n",
-    "model = model.transform(MakePYNQDriver())\n",
+    "model = model.transform(MakePYNQDriver(platform=\"zynq\"))\n",
     "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n",
     "model.save(build_dir + \"/end2end_cnv_w1a1_pynq_deploy.onnx\")"
    ]
diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb
index d573061487de204084e0d3242da8ad1b791f44d8..c388feca2340792c3535dba3fb3cf5e7220adf3c 100644
--- a/notebooks/end2end_example/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/tfc_end2end_example.ipynb
@@ -132,7 +132,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f8890385828>"
+       "<IPython.lib.display.IFrame at 0x7f7cc4290940>"
       ]
      },
      "execution_count": 3,
@@ -293,7 +293,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fe1ad0639e8>"
+       "<IPython.lib.display.IFrame at 0x7f7c6c567f28>"
       ]
      },
      "execution_count": 6,
@@ -333,9 +333,10 @@
       "            ConvertDivToMul(),\n",
       "            BatchNormToAffine(),\n",
       "            ConvertSignToThres(),\n",
+      "            AbsorbSignBiasIntoMultiThreshold(),\n",
       "            MoveAddPastMul(),\n",
       "            MoveScalarAddPastMatMul(),\n",
-      "            MoveScalarAddPastConv(),\n",
+      "            MoveAddPastConv(),\n",
       "            MoveScalarMulPastMatMul(),\n",
       "            MoveScalarMulPastConv(),\n",
       "            MoveAddPastMul(),\n",
@@ -350,6 +351,7 @@
       "        ]\n",
       "        for trn in streamline_transformations:\n",
       "            model = model.transform(trn)\n",
+      "            model = model.transform(RemoveIdentityOps())\n",
       "            model = model.transform(GiveUniqueNodeNames())\n",
       "            model = model.transform(GiveReadableTensorNames())\n",
       "            model = model.transform(InferDataTypes())\n",
@@ -400,7 +402,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fe1346e4ef0>"
+       "<IPython.lib.display.IFrame at 0x7f7c6c0bf898>"
       ]
      },
      "execution_count": 8,
@@ -454,7 +456,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fe1346f7780>"
+       "<IPython.lib.display.IFrame at 0x7f7c6c0e5c18>"
       ]
      },
      "execution_count": 9,
@@ -728,7 +730,7 @@
        " 'ip_path': ('s', False, ''),\n",
        " 'ip_vlnv': ('s', False, ''),\n",
        " 'exec_mode': ('s', False, ''),\n",
-       " 'sim_cycles': ('i', False, 0),\n",
+       " 'cycles_rtlsim': ('i', False, 0),\n",
        " 'rtlsim_trace': ('s', False, ''),\n",
        " 'res_estimate': ('s', False, ''),\n",
        " 'res_hls': ('s', False, ''),\n",
@@ -1420,7 +1422,7 @@
    "source": [
     "from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver\n",
     "model = ModelWrapper(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")\n",
-    "model = model.transform(MakePYNQDriver())"
+    "model = model.transform(MakePYNQDriver(platform=\"zynq\"))"
    ]
   },
   {
diff --git a/run-docker.sh b/run-docker.sh
index e07556716db335421f57a390f1e6a17168ac058b..88956586c6a2ba9780d0597f8149038dad4aa6ab 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -50,6 +50,15 @@ if [ -z "$PYNQ_IP" ];then
         recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
 fi
 
+if [ -z "$VITIS_PATH" ];then
+        recho "Please set the VITIS_PATH env.var. to your Vitis installation directory."
+        recho "FINN functionality depending on Vitis will not be available."
+else
+        if [ -z "$PLATFORM_REPO_PATHS" ];then
+                recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)."
+        fi
+fi
+
 DOCKER_GID=$(id -g)
 DOCKER_GNAME=$(id -gn)
 DOCKER_UNAME=$(id -un)
@@ -65,6 +74,11 @@ DOCKER_INST_NAME="finn_dev_${DOCKER_UNAME}"
 # ensure Docker tag and inst. name are all lowercase
 DOCKER_TAG=$(echo "$DOCKER_TAG" | tr '[:upper:]' '[:lower:]')
 DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
+# Absolute path to this script, e.g. /home/user/bin/foo.sh
+SCRIPT=$(readlink -f "$0")
+# Absolute path this script is in, thus /home/user/bin
+SCRIPTPATH=$(dirname "$SCRIPT")
+
 # the settings below will be taken from environment variables if available,
 # otherwise the defaults below will be used
 : ${JUPYTER_PORT=8888}
@@ -74,11 +88,7 @@ DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
 : ${PYNQ_BOARD="Pynq-Z1"}
 : ${PYNQ_TARGET_DIR="/home/xilinx/$DOCKER_INST_NAME"}
 : ${NUM_DEFAULT_WORKERS=1}
-
-# Absolute path to this script, e.g. /home/user/bin/foo.sh
-SCRIPT=$(readlink -f "$0")
-# Absolute path this script is in, thus /home/user/bin
-SCRIPTPATH=$(dirname "$SCRIPT")
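+# host directory that will be mounted into the container as ~/.ssh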
+: ${FINN_SSH_KEY_DIR="$SCRIPTPATH/ssh_keys"}
 
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
@@ -87,10 +97,12 @@ VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
 # ensure build dir exists locally
 mkdir -p $BUILD_LOCAL
 mkdir -p $VIVADO_IP_CACHE
+mkdir -p $FINN_SSH_KEY_DIR
 
 gecho "Instance is named as $DOCKER_INST_NAME"
 gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
 gecho "Mounting $VIVADO_PATH into $VIVADO_PATH"
+gecho "Mounting $VITIS_PATH into $VITIS_PATH"
 gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
@@ -126,23 +138,34 @@ docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \
 # Launch container with current directory mounted
 # important to pass the --init flag here for correct Vivado operation, see:
 # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
-docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \
---hostname $DOCKER_INST_NAME \
--e "XILINX_VIVADO=$VIVADO_PATH" \
--e "SHELL=/bin/bash" \
--v $SCRIPTPATH:/workspace/finn \
--v $BUILD_LOCAL:$BUILD_LOCAL \
--v $VIVADO_PATH:$VIVADO_PATH \
--e VIVADO_PATH=$VIVADO_PATH \
--e FINN_INST_NAME=$DOCKER_INST_NAME \
--e FINN_ROOT="/workspace/finn" \
--e VIVADO_IP_CACHE="$VIVADO_IP_CACHE" \
--e PYNQ_BOARD=$PYNQ_BOARD \
--e PYNQ_IP=$PYNQ_IP \
--e PYNQ_USERNAME=$PYNQ_USERNAME \
--e PYNQ_PASSWORD=$PYNQ_PASSWORD \
--e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR \
--e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS \
--p $JUPYTER_PORT:$JUPYTER_PORT \
--p $NETRON_PORT:$NETRON_PORT \
-$DOCKER_TAG $DOCKER_CMD
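+# build the docker run command as a string so that the optional Vivado/Vitis
+# mounts and environment variables below can be appended conditionally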
+DOCKER_EXEC="docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init "
+DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
+DOCKER_EXEC+="-e SHELL=/bin/bash "
+DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn "
+DOCKER_EXEC+="-v $BUILD_LOCAL:$BUILD_LOCAL "
+DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh "
+DOCKER_EXEC+="-e FINN_INST_NAME=$DOCKER_INST_NAME "
+DOCKER_EXEC+="-e FINN_ROOT=/workspace/finn "
+DOCKER_EXEC+="-e VIVADO_IP_CACHE=$VIVADO_IP_CACHE "
+DOCKER_EXEC+="-e PYNQ_BOARD=$PYNQ_BOARD "
+DOCKER_EXEC+="-e PYNQ_IP=$PYNQ_IP "
+DOCKER_EXEC+="-e PYNQ_USERNAME=$PYNQ_USERNAME "
+DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD "
+DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR "
+DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS "
+DOCKER_EXEC+="-p $JUPYTER_PORT:$JUPYTER_PORT "
+DOCKER_EXEC+="-p $NETRON_PORT:$NETRON_PORT "
+if [ ! -z "$VIVADO_PATH" ];then
+  DOCKER_EXEC+="-e XILINX_VIVADO=$VIVADO_PATH "
+  DOCKER_EXEC+="-v $VIVADO_PATH:$VIVADO_PATH "
+  DOCKER_EXEC+="-e VIVADO_PATH=$VIVADO_PATH "
+fi
+if [ ! -z "$VITIS_PATH" ];then
+  DOCKER_EXEC+="-v $VITIS_PATH:$VITIS_PATH "
+  DOCKER_EXEC+="-v $PLATFORM_REPO_PATHS:/workspace/finn/vitis_platforms "
+  DOCKER_EXEC+="-e VITIS_PATH=$VITIS_PATH "
+  DOCKER_EXEC+="-e PLATFORM_REPO_PATHS=/workspace/finn/vitis_platforms "
+fi
+DOCKER_EXEC+="$DOCKER_TAG $DOCKER_CMD"
+
+$DOCKER_EXEC
diff --git a/setup.cfg b/setup.cfg
index 1d7dcf247636b486e35d6320669eae706c2b7a72..7729d0949ee133e06242905afab31708e79ebf04 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -104,6 +104,7 @@ addopts =
 markers =
     slow: marks tests as slow (deselect with '-m "not slow"')
     vivado: mark tests that require Vivado or Vivado HLS
+    vitis: mark tests that require Vitis
 norecursedirs =
     dist
     build
diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..201333aebdb3fc1d15464389e37326dcaf6848e0
--- /dev/null
+++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.custom_op.registry as registry
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+def exp_cycles_per_layer(model):
+    """Estimates the number of cycles per sample for dataflow layers in the given model.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
+
+    Returns {node name : cycle estimation}."""
+
+    cycle_dict = {}
+    for node in model.graph.node:
+        if is_fpgadataflow_node(node) is True:
+            op_type = node.op_type
+            inst = registry.custom_op[op_type](node)
+            cycle_dict[node.name] = inst.get_exp_cycles()
+
+    return cycle_dict
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index ad30282d93034f8d043a05a2172790349c31ec83..03b31b9c1ec51b45e17152d35d5824b6137ab4a2 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -35,6 +35,9 @@ from finn.util.fpgadataflow import is_fpgadataflow_node
 
 def hls_synth_res_estimation(model):
     """Extracts the FPGA resource results from the Vivado HLS synthesis estimates.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
 
     Returns {node name : resources_dict}."""
 
diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py
index 508c34aaed50f2935f4915cdcea29a3e92641b3c..9206f3f6fcd81de175babef54de990fe01c861e1 100644
--- a/src/finn/analysis/fpgadataflow/post_synth_res.py
+++ b/src/finn/analysis/fpgadataflow/post_synth_res.py
@@ -30,15 +30,23 @@ import os
 import xml.etree.ElementTree as ET
 
 from finn.transformation.move_reshape import _is_fpgadataflow_node
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.registry import getCustomOp
 
 
-def post_synth_res(model):
+def post_synth_res(model, override_synth_report_filename=None):
     """Extracts the FPGA resource results from the Vivado synthesis.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
 
     Returns {node name : resources_dict}."""
 
     res_dict = {}
-    synth_report_filename = model.get_metadata_prop("vivado_synth_rpt")
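+    # when called recursively on a StreamingDataflowPartition model, the parent's
+    # synthesis report filename is passed in as an override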
+    if override_synth_report_filename is not None:
+        synth_report_filename = override_synth_report_filename
+    else:
+        synth_report_filename = model.get_metadata_prop("vivado_synth_rpt")
     if os.path.isfile(synth_report_filename):
         tree = ET.parse(synth_report_filename)
         root = tree.getroot()
@@ -50,7 +58,11 @@ def post_synth_res(model):
         raise Exception("Please run synthesis first")
 
     for node in model.graph.node:
-        if _is_fpgadataflow_node(node):
+        if node.op_type == "StreamingDataflowPartition":
+            sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model"))
+            sdp_res_dict = post_synth_res(sdp_model, synth_report_filename)
+            res_dict.update(sdp_res_dict)
+        elif _is_fpgadataflow_node(node):
             row = root.findall(".//*[@contents='%s']/.." % node.name)
             if row != []:
                 node_dict = {}
diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index c190059eceb0cc111477c84f843f4a9f9bf2f393..e52557573dab072709da4452f4e2d477e99b98c9 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -32,6 +32,9 @@ from finn.util.fpgadataflow import is_fpgadataflow_node
 
 def res_estimation(model):
     """Estimates the resources needed for the given model.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
 
     Returns {node name : resource estimation}."""
 
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index c2f68a35076418e0cf2edb578bdb8d548772fc78..0c01a48a07608dcd760447e8f569128f58d86f28 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -39,6 +39,7 @@ from finn.core.remote_exec import remote_exec
 from finn.core.rtlsim_exec import rtlsim_exec
 from finn.custom_op.registry import getCustomOp
 import finn.analysis.topology as ta
+from finn.util.basic import sanitize_quant_values, get_sanitize_quant_tensors
 
 
 def execute_node(node, context, graph):
@@ -50,8 +51,20 @@ def execute_node(node, context, graph):
     if node.op_type == "StreamingDataflowPartition":
         sdp_node = getCustomOp(node)
         model = ModelWrapper(sdp_node.get_nodeattr("model"))
-        ret = execute_onnx(model, context, True)
-        context.update(ret)
+        inp_ctx = dict(filter(lambda x: x[0] in node.input, context.items()))
+        # input may have been renamed in partition
+        assert len(inp_ctx) == 1
+        old_iname = node.input[0]
+        new_iname = model.graph.input[0].name
+        if old_iname != new_iname:
+            inp_ctx[new_iname] = inp_ctx[old_iname]
+            del inp_ctx[old_iname]
+        ret = execute_onnx(model, inp_ctx, False)
+        # output may have been renamed in partition
+        assert len(ret) == 1
+        node_oname = node.output[0]
+        model_oname = model.graph.output[0].name
+        context[node_oname] = ret[model_oname]
     else:
         if node.domain == "finn":
 
@@ -102,15 +115,14 @@ def execute_node(node, context, graph):
                     raise Exception(
                         """Output shapes disagree after node execution:
                         found %s vs expected %s"""
-                        % (
-                            str(output_list[list_ind].shape),
-                            str(context[outp].shape),
-                        )
+                        % (str(output_list[list_ind].shape), str(context[outp].shape))
                     )
                 context[outp] = output_list[list_ind]
 
 
-def execute_onnx(model, input_dict, return_full_exec_context=False):
+def execute_onnx(
+    model, input_dict, return_full_exec_context=False, start_node=None, end_node=None
+):
     """Executes given ONNX ModelWrapper with given named inputs.
 
     If return_full_exec_context is False, a dict of named outputs is returned
@@ -118,7 +130,12 @@ def execute_onnx(model, input_dict, return_full_exec_context=False):
 
 If return_full_exec_context is True, the full set of tensors used by
     the execution (including inputs, weights, activations and final outputs)
-    will be returned as a dict."""
+    will be returned as a dict.
+
+    When start_node and end_node are set to None, the whole graph is executed.
+    If they are set to particular ONNX nodes, only the subgraph between (and
+    including) those nodes is executed.
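+
+    Illustrative sketch (node choices are placeholders)::
+
+        # execute only the first two nodes and get the full tensor context
+        ret = execute_onnx(
+            model, input_dict, return_full_exec_context=True,
+            start_node=model.graph.node[0], end_node=model.graph.node[1],
+        )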
+    """
 
     if not model.check_all_tensor_shapes_specified():
         raise Exception("Found unspecified tensor shapes, try infer_shapes")
@@ -161,8 +178,28 @@ def execute_onnx(model, input_dict, return_full_exec_context=False):
         # execute the model node by node
         # we can simply walk down the list since the ONNX spec guarantees that it is
         # topologically sorted
-        for node in graph.node:
+        subgraph = []
+        if start_node is None:
+            start_node = model.graph.node[0]
+        if end_node is None:
+            end_node = model.graph.node[-1]
+        # select the nodes between specified start/end nodes
+        start_ind = model.get_node_index(start_node)
+        end_ind = model.get_node_index(end_node) + 1
+        assert end_ind >= start_ind, "Start/end nodes must define valid subgraph"
+        subgraph = graph.node[start_ind:end_ind]
+        for node in subgraph:
+            if get_sanitize_quant_tensors() != 0:
+                # round input values to match quantization annotation
+                execution_context = sanitize_quant_values(
+                    model, node.input, execution_context
+                )
             execute_node(node, execution_context, graph)
+            if get_sanitize_quant_tensors() != 0:
+                # round output values to quantization annotation
+                execution_context = sanitize_quant_values(
+                    model, node.output, execution_context
+                )
     elif model_exec_mode == "remote_pynq":
         # use remote exec metadata built into model to execute on a remote PYNQ
         remote_exec(model, execution_context)
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index a533e4d36629f57f7c4a576570d75a1e051de5be..214358608c43a868f9ef414dcbf6eb33e3f45a5b 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -62,11 +62,15 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
+    # fetch the platform metadata prop; the remote driver needs it for correct execution
+    platform = model.get_metadata_prop("platform")
+    assert platform in ["alveo", "zynq", "zynq-iodma"]
     cmd = (
         "sshpass -p {} ssh {}@{} -p {} "
         '"cd {}/{}; echo "{}" | '
         'sudo -S python3.6 driver.py --exec_mode="execute" --batchsize=1" '
-        '--bitfile="resizer.bit" --inputfile="input.npy" --outputfile="output.npy"'
+        '--bitfile="resizer.bit" --inputfile="input.npy" --outputfile="output.npy" '
+        '--platform="{}" '
     ).format(
         pynq_password,
         pynq_username,
@@ -75,6 +79,7 @@ def remote_exec(model, execution_context):
         pynq_target_dir,
         deployment_folder,
         pynq_password,
+        platform,
     )
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index ad44dab578b396c80af35af2ede031baca798150..d83bcd3a75dd0d2fc02315c72784e57348901a04 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -66,6 +66,11 @@ def rtlsim_exec(model, execution_context):
     i_stream_w = first_node.get_instream_width()
     # convert input into time multiplexed shape
     i_folded_shape = first_node.get_folded_input_shape()
+    batchsize = i_tensor.shape[0]
+    # override batch size for input
+    i_folded_shape = list(i_folded_shape)
+    i_folded_shape[0] = batchsize
+    i_folded_shape = tuple(i_folded_shape)
     # TODO any other layout transformations need to happen here!
     i_tensor = i_tensor.reshape(i_folded_shape)
     # extract output shape
@@ -74,12 +79,20 @@ def rtlsim_exec(model, execution_context):
     o_dt = model.get_tensor_datatype(o_name)
     last_node = getCustomOp(model.find_producer(o_name))
     o_folded_shape = last_node.get_folded_output_shape()
+    # override batch size from actual input
+    o_shape = list(o_shape)
+    o_shape[0] = batchsize
+    o_shape = tuple(o_shape)
+    o_folded_shape = list(o_folded_shape)
+    o_folded_shape[0] = batchsize
+    o_folded_shape = tuple(o_folded_shape)
     o_stream_w = last_node.get_outstream_width()
     packedBits = o_stream_w
     targetBits = o_dt.bitwidth()
     # pack input
     packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
     num_out_values = last_node.get_number_output_values()
+    num_out_values *= batchsize
     # prepare pyverilator model
     rtlsim_so = model.get_metadata_prop("rtlsim_so")
     if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
@@ -89,7 +102,7 @@ def rtlsim_exec(model, execution_context):
         sim = PyVerilator(rtlsim_so, auto_eval=False)
     ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
     packed_output = ret[0]
-    model.set_metadata_prop("sim_cycles", str(ret[1]))
+    model.set_metadata_prop("cycles_rtlsim", str(ret[1]))
     # unpack output and put into context
     o_folded_tensor = rtlsim_output_to_npy(
         packed_output, None, o_dt, o_folded_shape, packedBits, targetBits
@@ -101,19 +114,19 @@ def rtlsim_exec(model, execution_context):
 def _reset_rtlsim(sim):
     """Sets reset input in pyverilator to zero, toggles the clock and set it
     back to one"""
-    sim.io.ap_rst_n_0 = 0
+    sim.io.ap_rst_n = 0
     _toggle_clk(sim)
     _toggle_clk(sim)
-    sim.io.ap_rst_n_0 = 1
+    sim.io.ap_rst_n = 1
     _toggle_clk(sim)
     _toggle_clk(sim)
 
 
 def _toggle_clk(sim):
     """Toggles the clock input in pyverilator once."""
-    sim.io.ap_clk_0 = 0
+    sim.io.ap_clk = 0
     sim.eval()
-    sim.io.ap_clk_0 = 1
+    sim.io.ap_clk = 1
     sim.eval()
 
 
@@ -127,7 +140,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
     from finn.util.fpgadataflow)"""
     inputs = inp
     outputs = []
-    sim.io.out_r_0_tready = 1
+    sim.io.m_axis_0_tready = 1
 
     # observe if output is completely calculated
     # observation_count will contain the number of cycles the calculation ran
@@ -146,19 +159,19 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
         _reset_rtlsim(sim)
 
     while not (output_observed):
-        sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0
-        sim.io.in0_V_V_0_tdata = inputs[0] if len(inputs) > 0 else 0
-        if sim.io.in0_V_V_0_tready == 1 and sim.io.in0_V_V_0_tvalid == 1:
+        sim.io.s_axis_0_tvalid = 1 if len(inputs) > 0 else 0
+        sim.io.s_axis_0_tdata = inputs[0] if len(inputs) > 0 else 0
+        if sim.io.s_axis_0_tready == 1 and sim.io.s_axis_0_tvalid == 1:
             inputs = inputs[1:]
-        if sim.io.out_r_0_tvalid == 1 and sim.io.out_r_0_tready == 1:
-            outputs = outputs + [sim.io.out_r_0_tdata]
+        if sim.io.m_axis_0_tvalid == 1 and sim.io.m_axis_0_tready == 1:
+            outputs = outputs + [sim.io.m_axis_0_tdata]
         _toggle_clk(sim)
 
         observation_count = observation_count + 1
         no_change_count = no_change_count + 1
 
         if len(outputs) == num_out_values:
-            sim_cycles = observation_count
+            cycles_rtlsim = observation_count
             output_observed = True
 
         if no_change_count == liveness_threshold:
@@ -178,4 +191,4 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
         sim.flush_vcd_trace()
         sim.stop_vcd_trace()
 
-    return (outputs, sim_cycles)
+    return (outputs, cycles_rtlsim)
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
index 8d3dabcf8af51327d5d951464c6d9b36e2f67497..fbfe775e581e063b08e34b3096fd34f412b47d11 100644
--- a/src/finn/core/throughput_test.py
+++ b/src/finn/core/throughput_test.py
@@ -28,6 +28,10 @@
 
 import os
 import subprocess
+import numpy as np
+
+from finn.util.basic import gen_finn_dt_tensor
+from finn.core.rtlsim_exec import rtlsim_exec
 
 
 def throughput_test(model, batchsize=1000):
@@ -88,3 +92,50 @@ def throughput_test(model, batchsize=1000):
         return res
     except FileNotFoundError:
         return None
+
+
+def throughput_test_rtlsim(model, batchsize=100):
+    """Runs a throughput test for the given IP-stitched model. When combined
+    with tracing, this is useful for determining bottlenecks and required FIFO sizes."""
+
+    assert (
+        model.get_metadata_prop("exec_mode") == "rtlsim"
+    ), """Top-level exec_mode
+    metadata_prop must be set to rtlsim"""
+
+    # create random input
+    iname = model.graph.input[0].name
+    ishape = model.get_tensor_shape(iname)
+    ishape_batch = list(ishape)  # copy before overriding the batch dimension
+    ishape_batch[0] = batchsize
+    idt = model.get_tensor_datatype(iname)
+    dummy_input = gen_finn_dt_tensor(idt, ishape_batch)
+    # compute input/output sizes
+    oname = model.graph.output[0].name
+    oshape = model.get_tensor_shape(oname)
+    oshape_batch = list(oshape)  # copy before overriding the batch dimension
+    oshape_batch[0] = batchsize
+    odt = model.get_tensor_datatype(oname)
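+    # payload sizes in bytes: (#elements * bits per element) / 8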
+    i_bytes = (np.prod(ishape_batch) * idt.bitwidth()) / 8
+    o_bytes = (np.prod(oshape_batch) * odt.bitwidth()) / 8
+    # make empty exec context and insert input
+    ctx = model.make_empty_exec_context()
+    ctx[iname] = dummy_input
+    # remove liveness threshold, launch rtlsim
+    os.environ["LIVENESS_THRESHOLD"] = "-1"
+    rtlsim_exec(model, ctx)
+    # extract metrics
+    cycles = int(model.get_metadata_prop("cycles_rtlsim"))
+    clk_ns = float(model.get_metadata_prop("clk_ns"))
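+    # clock period in ns -> clock frequency in MHz: f [MHz] = 1000 / clk_ns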
+    fclk_mhz = 1 / (clk_ns * 0.001)
+    runtime_s = (cycles * clk_ns) * (10 ** -9)
+    res = dict()
+    res["cycles"] = cycles
+    res["runtime[ms]"] = runtime_s * 1000
+    res["throughput[images/s]"] = batchsize / runtime_s
+    res["DRAM_in_bandwidth[Mb/s]"] = i_bytes * 0.000001 / runtime_s
+    res["DRAM_out_bandwidth[Mb/s]"] = o_bytes * 0.000001 / runtime_s
+    res["fclk[mhz]"] = fclk_mhz
+    res["N"] = batchsize
+
+    return res
diff --git a/src/finn/custom_op/__init__.py b/src/finn/custom_op/__init__.py
index ab6e03bee65b8bf5c4041dd8021b1a561e7673d2..4ae7b9ebffaab6ca6be04b8d73f647b2db22dc78 100644
--- a/src/finn/custom_op/__init__.py
+++ b/src/finn/custom_op/__init__.py
@@ -56,8 +56,15 @@ class CustomOp(ABC):
                     ret = ret.decode("utf-8")
                 return ret
             else:
-                # not set, return default value
-                return def_val
+                if req:
+                    raise Exception(
+                        "Required attribute %s unspecified in a %s node"
+                        % (name, self.onnx_node.op_type)
+                    )
+                else:
+                    # not set, return default value
+                    return def_val
         except KeyError:
             raise AttributeError("Op has no such attribute: " + name)
 
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index a688898f4a43b33fd3f07cda12144b84829e451f..65c898a8c453420ed96ca22715ef2595c5840288 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -82,12 +82,15 @@ class HLSCustomOp(CustomOp):
             "ip_path": ("s", False, ""),
             "ip_vlnv": ("s", False, ""),
             "exec_mode": ("s", False, ""),
-            "sim_cycles": ("i", False, 0),
+            "cycles_rtlsim": ("i", False, 0),
+            "cycles_estimate": ("i", False, 0),
             "rtlsim_trace": ("s", False, ""),
             "res_estimate": ("s", False, ""),
             "res_hls": ("s", False, ""),
             "res_synth": ("s", False, ""),
             "rtlsim_so": ("s", False, ""),
+            # partitioning info
+            "partition_id": ("i", False, 0),
             # input and output FIFO depths
             "inFIFODepth": ("i", False, 2),
             "outFIFODepth": ("i", False, 2),
@@ -100,6 +103,23 @@ class HLSCustomOp(CustomOp):
         prefixed_top_name = "%s_%s" % (node.name, node.name)
         return prefixed_top_name
 
+    def get_verilog_top_module_intf_names(self):
+        """Return a dict of names of input and output interfaces.
+        The keys reflect the protocols each interface implements:
+        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
+        Values are lists of names:
+        's_axis' names correspond to the list of node inputs in order,
+        'm_axis' names correspond to the list of node outputs in order.
+        Each block must have at most one aximm and one axilite."""
+        intf_names = {}
+        intf_names["clk"] = ["ap_clk"]
+        intf_names["rst"] = ["ap_rst_n"]
+        intf_names["s_axis"] = ["in0_V_V"]
+        intf_names["m_axis"] = ["out_V_V"]
+        intf_names["aximm"] = []
+        intf_names["axilite"] = []
+        return intf_names
+
     def get_verilog_top_filename(self):
         "Return the Verilog top module filename for this node."
 
@@ -171,9 +191,15 @@ class HLSCustomOp(CustomOp):
         of the node as a dictionary."""
         ret = dict()
         ret["BRAM_18K"] = self.bram_estimation()
+        ret["BRAM_efficiency"] = self.bram_efficiency_estimation()
         ret["LUT"] = self.lut_estimation()
         return ret
 
+    def bram_efficiency_estimation(self):
+        """Function for BRAM efficiency estimation: actual parameter storage
+        needed divided by the allocated BRAM storage (from estimation)"""
+        return 1
+
     def bram_estimation(self):
         """Function for BRAM resource estimation, is member function of
         HLSCustomOp class but has to be filled by every node"""
@@ -184,6 +210,12 @@ class HLSCustomOp(CustomOp):
         HLSCustomOp class but has to be filled by every node"""
         return 0
 
+    def get_exp_cycles(self):
+        """Function for estimation of expected cycles for set folding,
+        is member function of HLSCustomOp class but has to be filled
+        by every node"""
+        return 0
+
     def code_generation_ipgen(self, model, fpgapart, clk):
         """Generates c++ code and tcl script for ip generation."""
         node = self.onnx_node
@@ -219,7 +251,6 @@ class HLSCustomOp(CustomOp):
         self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
         self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
 
-
         template = self.ipgentcl_template
 
         for key in self.code_gen_dict:
@@ -235,7 +266,7 @@ class HLSCustomOp(CustomOp):
     def ipgen_extra_directives(self):
         "Return a list of extra tcl directives for HLS synthesis."
         return []
-        
+
     def ipgen_singlenode_code(self):
         """Builds the bash script for ip generation using the IPGenBuilder from
         finn.util.fpgadataflow."""
@@ -412,7 +443,7 @@ compilation transformations?
             no_change_count = no_change_count + 1
 
             if len(outputs) == num_out_values:
-                self.set_nodeattr("sim_cycles", observation_count)
+                self.set_nodeattr("cycles_rtlsim", observation_count)
                 output_observed = True
 
             if no_change_count == liveness_threshold:
@@ -441,7 +472,7 @@ compilation transformations?
             trace_file = self.onnx_node.name + ".vcd"
         num_out_values = self.get_number_output_values()
         total_cycle_count = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file)
-        self.set_nodeattr("sim_cycles", total_cycle_count)
+        self.set_nodeattr("cycles_rtlsim", total_cycle_count)
 
     def execute_node(self, context, graph):
         """Executes single node using cppsim or rtlsim."""
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index d5f5c1194d36e86b895610c084222db5ab9eb2bf..14fb65739dab4208edd0c61bb7ca8ae2d114baab 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -170,6 +170,10 @@ class AddStreams_Batch(HLSCustomOp):
     def get_number_output_values(self):
         return np.prod(self.get_folded_output_shape()[:-1])
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -356,3 +360,8 @@ class AddStreams_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8e74a4d13043a741cf787477c51b63925b7aad8
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -0,0 +1,580 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from math import ceil
+import os
+
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    rtlsim_output_to_npy,
+)
+from . import templates
+
+# ONNX i/o tensor shape assumptions for channelwise ops:
+# input 0 is the input tensor, shape (..., NumChannels)
+# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel)
+# output 0 is the output tensor, shape (..., NumChannels) - same as input
+# the ... here can be any shape (representing groups of vectors)
+
+
+class ChannelwiseOp_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hls Thresholding_Batch function.
+    It can implement a variety of channel-wise parametrized operations,
+    including Add, Mul and multi-thresholding.
+    """
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.decoupled_wrapper = templates.decoupled_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # channelwise "map" function to apply:
+            # one of cmp_le, cmp_ge, add, mul
+            "Func": ("s", False, "cmp_le"),
+            "PE": ("i", True, 0),
+            "NumChannels": ("i", True, 0),
+            # string defining memory resource type for parameters
+            "ram_style": ("s", False, "distributed"),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "paramDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 0),
+            "outFIFODepth": ("i", False, 0),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM, the depth of the memory used
+        to store the channelwise op parameters."""
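+        # e.g. NumChannels=64 and PE=16 give TMEM=4 parameters per PE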
+        chn = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        return chn // pe
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt_name = self.get_input_datatype().name
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        assert exp_idt_name == idt_name, "Bad input DataType for ChannelwiseOp layer"
+        # TODO: dynamically infer/update odt based on idt as done in ConvertToHLSLayers?
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            self.get_nodeattr("paramDataType")
+            self.get_nodeattr("outputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required Threshold_Batch attributes do not exist."""
+            )
+
+        return info_messages
+
+    def bram_estimation(self):
+        """Calculates BRAM cost if resource set to BRAM"""
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+
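+        # a BRAM-18K is modeled as 16 bits wide by 1024 entries deep, e.g.
+        # A=8, P=4, tmem=2048 -> ceil(32 / 16) * ceil(2048 / 1024) = 4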
+        if style == "block" and tmem > 1:
+            return int(ceil(A * P / 16)) * int(ceil(tmem / 1024))
+        else:
+            return 0
+
+    def lut_estimation(self):
+        """Calculates LUT cost, taking memory resource type into account """
+        # TODO add in/out FIFO contributions
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+        # cost of comparators
+        comparator_cost = A * P
+        # cost of LUTRAM
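+        # distributed memory is modeled as 64-entry-deep LUTRAM slices,
+        # i.e. one LUT per bit of width per 64 entries of depth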
+        if style == "distributed" and tmem > 1:
+            lutram_cost = P * A * int(ceil(tmem / 64))
+        else:
+            lutram_cost = 0
+        # total cost
+        return comparator_cost + lutram_cost
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
+        i_bits = self.get_input_datatype().bitwidth()
+        return i_bits * self.get_nodeattr("PE")
+
+    def get_outstream_width(self):
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
+
+    def get_folded_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        fold = ich // pe
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        folded_input_shape = tuple(vecs + [fold, pe])
+        return folded_input_shape
+
+    def get_folded_output_shape(self):
+        # same shape as input
+        return self.get_folded_input_shape()
+
+    def get_normal_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_input_shape = tuple(vecs + [ich])
+        return normal_input_shape
+
+    def get_normal_output_shape(self):
+        # same shape as input
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        # fill in TSrcI
+        ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def get_hls_compatible_parameter_tensor(self, orig_param_vector):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure chn % PE == 0
+        * interleave rows between PEs
+        * reshape into (PE, TMEM) and return
+        """
+        chn = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        tmem = chn // pe
+        assert chn % pe == 0, "Requirement NumChannels divisable by PE is violated."
+        assert (
+            orig_param_vector.ndim == 1
+        ), """Parameter vector dimension is {}.
+        Expected dimension: 1.""".format(
+            orig_param_vector.ndim
+        )
+
+        # if not self.get_input_datatype().signed():
+        #     # ensure all thresholds are nonnegative
+        #     assert (orig_param_vector >= 0).all()
+
+        # ensure all thresholds are integer
+        assert (orig_param_vector.astype(np.int32) == orig_param_vector).all()
+        ret = orig_param_vector
+
+        assert (
+            ret.shape[0] == chn
+        ), "Cardinality of parameter vector is not as expected (chn)"
+
+        # distribute rows between PEs
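+        # e.g. chn=4, pe=2: [a, b, c, d] becomes [[a, c], [b, d]], so that
+        # PE i serves channels i, i+pe, i+2*pe, ...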
+        ret = ret.reshape(tmem, pe).transpose()
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+
+        return ret.reshape(1, pe, tmem)
+
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        # save thresholds in params.h
+        parameters = model.get_initializer(self.onnx_node.input[1])
+        parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters)
+        pdt = DataType[self.get_nodeattr("paramDataType")]
+
+        parameters_hls_code = numpy_to_hls_code(
+            parameter_tensor, pdt, "parameters", False, True
+        )
+        # get input data type
+        export_idt = self.get_input_datatype()
+        if self.get_input_datatype() == DataType.BIPOLAR:
+            export_idt = DataType.BINARY
+        idt_hls = export_idt.get_hls_datatype_str()
+
+        # write parameters into params.h
+        f_params = open("{}/params.h".format(code_gen_dir), "w")
+        pdt_hls = pdt.get_hls_datatype_str()
+        # use binary to export bipolar activations
+        export_odt = self.get_output_datatype()
+        if self.get_output_datatype() == DataType.BIPOLAR:
+            export_odt = DataType.BINARY
+        odt_hls = export_odt.get_hls_datatype_str()
+        # get desired function
+        func = self.get_nodeattr("Func")
+        if func == "cmp_le":
+            func_str = "std::less_equal"
+        elif func == "cmp_ge":
+            func_str = "std::greater_equal"
+        elif func == "add":
+            func_str = "std::plus"
+        elif func == "mul":
+            func_str = "std::multiplies"
+        else:
+            raise Exception(
+                """Invalid value for attribute Func! Is currently set to: {}
+            has to be set to one of the following values
+            ("cmp_le", "cmp_ge", "add", "mul")""".format(
+                    func
+                )
+            )
+        f_params.write(
+            "static ChannelWiseOperation<{},{},{},{},{},{}> threshs \
+            = ".format(
+                self.calc_tmem(),
+                self.get_nodeattr("PE"),
+                idt_hls,
+                pdt_hls,
+                odt_hls,
+                "%s<%s>" % (func_str, odt_hls),
+            )
+        )
+        f_params.write(parameters_hls_code)
+        f_params.close()
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create an npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # and the second input is the channelwise parameter tensor
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for ChannelwiseOp_Batch")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType.BIPOLAR:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_folded_output_shape()
+            ), """Output shape is not as expected"""
+            # reshape output to have expected shape
+            oshape = self.get_normal_output_shape()
+            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+
+    # TODO check and add whatever is missing
+    def defines(self, var):
+        numInputVectors = list(self.get_nodeattr("numInputVectors"))
+        numReps = numInputVectors[0]
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define NumChannels1 {}\n#define PE1 {}\n#define numReps {}""".format(
+                self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps,
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        tmpl_args = self.get_template_param_values()
+        # TODO: why put some template parameters into defines and not others?
+        # should ImgDim be defined or just filled in here like we do now?
+        ishape = self.get_folded_input_shape()
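+        # a 3-dim folded shape (batch, fold, pe) is FC-like, so ImgDim=1;
+        # a 5-dim folded shape (batch, dim, dim, fold, pe) is conv-like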
+        if len(ishape) == 3:
+            imgdim = 1
+        elif len(ishape) == 5:
+            imgdim = ishape[1]
+        else:
+            raise Exception("""Unexpeted input shape""")
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}>
+            (in0, out, threshs, numReps);""".format(
+                imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out
+                )""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+        # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL]
+        # partition for parallel access along PE and N_PARAMS_PER_CHANNEL
+        # dimensions (dims 1 and 3)
+        self.code_gen_dict["$PRAGMAS$"].append(
+            (
+                "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
+                "complete dim=1"
+            )
+        )
+        # self.code_gen_dict["$PRAGMAS$"].append(
+        #     (
+        #         "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
+        #         "complete dim=3"
+        #     )
+        # )
+
+        # set resource type
+        ram_style = self.get_nodeattr("ram_style")
+        pe = self.get_nodeattr("PE")
+        ich = self.get_nodeattr("NumChannels")
+        # if PE is less than NumChannels, assign cores according to ram_style;
+        # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs
+        if pe < ich:
+            if ram_style == "distributed":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.parameters "
+                        "core=ROM_2P_LUTRAM"
+                    )
+                )
+            elif ram_style == "block":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.parameters "
+                        "core=ROM_2P_BRAM"
+                    )
+                )
+            else:
+                raise Exception(
+                    """Invalid value for attribute ram_style! Is currently set to: {}
+                has to be set to one of ("block", "distributed")""".format(
+                        ram_style
+                    )
+                )
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 3e40ad70208909551365c51324153859ccc79ceb..d33d6c963c0c55309f7f258c9ec1d7723e112282 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -177,6 +177,23 @@ class ConvolutionInputGenerator(HLSCustomOp):
         num_output_elems = np.prod(folded_oshape[:-1])
         return num_output_elems
 
+    def get_exp_cycles(self):
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
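+        # the generator overlaps reading input rows and writing output
+        # windows; after the initial buffer fill, the slower of the two
+        # dominates each of the ofm_dim output rows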
+        cycles_write_block = (ofm_dim * k * k * (ifm_ch / simd)) / mmv
+        cycles_read_block = stride * ifm_dim * (ifm_ch / simd)
+        max_cycles = max(cycles_write_block, cycles_read_block)
+        exp_cycles = ifm_dim * k * (ifm_ch / simd) + ofm_dim * max_cycles
+
+        return int(exp_cycles)
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..15d55653b4e431dead885d75650b1500150d8775
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -0,0 +1,305 @@
+import os
+import numpy as np
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class DownSampler(HLSCustomOp):
+    """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function.
+    Performs downsampling of the image by removing rows and columns."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("i", True, 0),
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # Number of input columns computed in parallel
+            "SIMD": ("i", False, 1),
+            "Stride": ("i", True, 2),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # Batch size
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_downsampled_odim(self):
+        "Return the down sampled spatial size of the output."
+        idim = self.get_nodeattr("ImgDim")
+        stride = self.get_nodeattr("Stride")
+        return int(np.floor((idim - 1) / stride) + 1)
+
+    def get_exp_cycles(self):
+        idim = self.get_nodeattr("ImgDim")
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = channels / simd * batch_size * idim * idim
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self):
+        idim = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        ishape = (batch, idim, idim, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self):
+        odim = self.get_downsampled_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        oshape = (batch, odim, odim, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpect input shape for DownSampler."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        exp_idtype = self.get_input_datatype()
+        assert dtype == exp_idtype, "Unexpected datatype for DownSampler"
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        return ret
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("NumChannels")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+        ibits = self.get_input_datatype().bitwidth()
+        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+        idim = self.get_nodeattr("ImgDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+        simd = self.get_nodeattr("SIMD")
+        self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)]
+
+        stride = self.get_nodeattr("Stride")
+        self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)]
+
+        batch_size = self.get_nodeattr("numInputVectors")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """ConvolutionInputGenerator_kernel1<IFMChannels, Input_precision,
+            IFMDim, SIMD, Stride> (in0, out, numReps);"""
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (1, OutputDim, OutputDim, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 54051af5e0387081a23e1f8fa77ec9e363098830..044cfddaab51a5f9bf7aa25e9123247b10de8529 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -32,7 +32,7 @@ import numpy as np
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from onnx import TensorProto, helper
+from onnx import helper, TensorProto
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -80,24 +80,33 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
-        oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
+
+        oshape = self.get_normal_output_shape()
+        values = np.zeros(oshape).astype(np.float32)
         split_input = np.concatenate((values, values), axis=0)
-        return helper.make_node(
+
+        split_in = helper.make_tensor_value_info(
+            model.make_new_valueinfo_name(), TensorProto.FLOAT, oshape
+        )
+
+        model.graph.value_info.append(split_in)  # requires clean up
+        model.set_initializer(split_in.name, split_input)
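+        # the dummy tensor above exists only so shape inference can run
+        # the Split node; a later cleanup pass is expected to remove it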
+
+        shape_comp_node = helper.make_node(
             "Split",
-            inputs=[split_input],
-            outputs=[self.onnx_node.output[0], self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor", data_type=TensorProto.FLOAT, axis=0
-            ),
+            inputs=[split_in.name],
+            outputs=[self.onnx_node.output[0], self.onnx_node.output[1]],
+            axis=0,
         )
 
+        return shape_comp_node
+
     def infer_node_datatype(self, model):
         odt = self.get_output_datatype()
         model.set_tensor_datatype(self.onnx_node.output[0], odt)
+        model.set_tensor_datatype(self.onnx_node.output[1], odt)
 
     def verify_node(self):
         info_messages = []
@@ -155,6 +164,10 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def get_number_output_values(self):
         return 2 * np.prod(self.get_folded_output_shape()[1:-1])
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -359,3 +372,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        intf_names["m_axis"] = ["out0_V_V", "out1_V_V"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
similarity index 86%
rename from src/finn/custom_op/fpgadataflow/fmpadding.py
rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..f9a9dc4340b18578550a9c453d90de86234d1cad 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp):
             "Padding": ("i", True, 2),
             # number of channels in input image
             "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
             # controls distribution of padded pixels
@@ -40,6 +42,14 @@ class FMPadding_Batch(HLSCustomOp):
         pad = self.get_nodeattr("Padding")
         return idim + pad
 
+    def get_exp_cycles(self):
+        odim = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim * odim
+        return int(exp_cycles)
+
     def get_normal_input_shape(self):
         idim = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
@@ -55,20 +65,22 @@ class FMPadding_Batch(HLSCustomOp):
         return oshape
 
     def get_folded_input_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_input_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
 
     def get_folded_output_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_output_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
@@ -114,15 +126,13 @@ class FMPadding_Batch(HLSCustomOp):
 
     def get_instream_width(self):
         ibits = self.get_input_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return ibits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
 
     def get_outstream_width(self):
         obits = self.get_output_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return obits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
@@ -135,13 +145,15 @@ class FMPadding_Batch(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] = [
             """#define ImgDim1 {}\n#define OutputDim1 {}\n
             #define Padding1 {}\n#define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n#define numReps {}\n""".format(
+            #define PaddingStyle1 {}\n#define numReps {}
+            #define SIMD1 {}\n""".format(
                 self.get_nodeattr("ImgDim"),
                 self.get_padded_odim(),
                 self.get_nodeattr("Padding"),
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PaddingStyle"),
                 self.get_nodeattr("numInputVectors"),
+                self.get_nodeattr("SIMD"),
             )
         ]
 
@@ -176,7 +188,7 @@ class FMPadding_Batch(HLSCustomOp):
         in_t = self.get_input_datatype().get_hls_datatype_str()
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,
+            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
             {}, PaddingStyle1> (in0, out, numReps);""".format(
                 node.op_type, in_t
             )
@@ -232,6 +244,7 @@ class FMPadding_Batch(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         if mode == "cppsim":
@@ -254,10 +267,8 @@ class FMPadding_Batch(HLSCustomOp):
         match expected shape (1, ImgDim, ImgDim, NumChannels)."""
         export_idt = self.get_input_datatype()
 
-        # no reshaping for input since assuming no folding on input
-        # make copy before saving array
-        inp = inp.copy()
-        np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
 
         if mode == "cppsim":
             # execute the precompiled model
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 9e6c63dc510aab5f6baff9cb6326a2d0476f67a9..1a75858880a072345ef942ca91feabf0bec9ab36 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -75,16 +75,19 @@ class GlobalAccPool_Batch(HLSCustomOp):
     def get_normal_output_shape(self):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
-        oshape = tuple([vecs[0]] + [ch])
+        if len(vecs) == 1:
+            oshape = tuple(vecs + [ch])
+        elif len(vecs) == 3:
+            oshape = tuple([vecs[0]] + [1, 1, ch])
+        else:
+            raise Exception("Unexpected number of dims in numInputVectors")
         return oshape
 
     def get_folded_output_shape(self):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
-        vecs = list(self.get_nodeattr("numInputVectors"))
+        unfolded_shape = list(self.get_normal_output_shape())
         assert ch % pe == 0, "PE must divide NumChannels"
         folds = int(ch / pe)
-        oshape = tuple([vecs[0]] + [folds, pe])
+        oshape = tuple(unfolded_shape[:-1] + [folds, pe])
         return oshape
 
     def make_shape_compatible_op(self, model):
@@ -179,6 +182,13 @@ class GlobalAccPool_Batch(HLSCustomOp):
     def get_number_output_values(self):
         return np.prod(self.get_folded_output_shape()[1:-1])
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * idim * idim + Channels/PE
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        folds = int(ch / pe)
+        return np.prod(self.get_folded_input_shape()[:-1]) + folds
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d0374445d816f1e8d49ed92cf7aa67b024f9ac1
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -0,0 +1,360 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import math
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+
+
+# the IODMA interfaces between a memory-mapped AXI interface and an AXI stream
+# direction "in": pulls data from AXI-MM to AXI stream
+# direction "out": pushes data from AXI stream to AXI-MM
+
+# DMA Addressing
+# - burst mode can be "wrap" or "increment"
+# - "increment" bursts will increment the address when moving to the next image
+# - "wrap" bursts will reinitialize the address to the start address,
+#   and are useful for e.g. streaming weights, where the same buffer is
+#   repeatedly read into the FPGA
+# - no additional alignment restrictions beyond anything specified in the AXI spec
+
+# Interfaces
+# - AXI-MM name specified by intfName unless this is set to "" (empty, the default)
+#   in which case output AXI-MM are named "out" and input AXI-MM are named "in0"
+# - AXI-MM interface width (in bits) is specified by intfWidth
+# - AXI-Stream interface width (in bits) is specified by streamWidth
+# - If intfWidth and streamWidth are not equal, the DMA core performs
+#   width conversion by going up to the least common multiple of bitwidths,
+#   e.g. intfWidth=32b -> 96b -> streamWidth=24b
+# - transfers occur in multiples of the AXI-MM interface width, therefore
+#   the total number of bits in the tensor must be a multiple of intfWidth
+# - transfers occur in multiples of the AXI-Stream interface width, therefore
+#   the total number of bits in the tensor must be a multiple of streamWidth
+# - both interface widths must be a multiple of 8b (AXI protocol requirement)
+# - in most systems, intfWidth is also restricted to a power of 2 (e.g. Vitis)
+#   but this is not universal so we don't check here explicitly
+
+# Input/output tensor shapes
+# - The data being moved is a tensor of shape numInputVectors+[NumChannels]
+# - The data type of the tensor elements is specified by dataType
+# - on the stream side
+#       -the normal shape is the same as the ONNX tensor attached to it
+#       -the folded shape is computed from the stream width and normal shape
+# - on the AXI-MM side
+#       -the normal shape is the same as the one on the stream side
+#       -the folded shape is not defined
+
+
+class IODMA(HLSCustomOp):
+    """Class that corresponds to finn-hlslib DMA function(s)."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "NumChannels": ("i", True, 0),
+            # FINN input datatype
+            "dataType": ("s", True, ""),
+            # Stream parameters
+            "streamWidth": ("i", False, 32),
+            # DMA-specific parameters
+            "intfWidth": ("i", False, 32),
+            "burstMode": ("s", False, "increment"),
+            "direction": ("s", False, "in"),
+            # shape describing input vecs per execution
+            "numInputVectors": ("ints", False, [1]),
+            # name of axi-mm interface
+            "intfName": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = tuple(vecs + [num_ch])
+        return ishape
+
+    def get_normal_output_shape(self):
+        return self.get_normal_input_shape()
+
+    def get_folded_input_shape(self):
+        if self.get_nodeattr("direction") == "in":
+            raise ValueError("Folded input shape not defined for input IODMA")
+        else:
+            shape = list(self.get_normal_input_shape())
+            itype_bits = self.get_input_datatype().bitwidth()
+            strmw = self.get_nodeattr("streamWidth")
+            assert (
+                strmw % itype_bits == 0
+            ), "Input stream width must be a multiple of datatype bits"
+            elems_per_word = strmw // itype_bits
+            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
+            fold_depth = shape[-1] // elems_per_word
+            shape[-1] = fold_depth
+            shape.append(elems_per_word)
+            return tuple(shape)
+
+    def get_folded_output_shape(self):
+        if self.get_nodeattr("direction") == "out":
+            raise ValueError("Folded output shape not defined for output IODMA")
+        else:
+            shape = list(self.get_normal_output_shape())
+            itype_bits = self.get_output_datatype().bitwidth()
+            strmw = self.get_nodeattr("streamWidth")
+            assert (
+                strmw % itype_bits == 0
+            ), "Output stream width must be a multiple of datatype bits"
+            elems_per_word = strmw // itype_bits
+            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
+            fold_depth = shape[-1] // elems_per_word
+            shape[-1] = fold_depth
+            shape.append(elems_per_word)
+            return tuple(shape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        exp_idtype = self.get_input_datatype()
+        assert dtype == exp_idtype, "Unexpected datatype."
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("dataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self):
+        if self.get_nodeattr("direction") == "in":
+            return self.get_nodeattr("intfWidth")
+        elif self.get_nodeattr("direction") == "out":
+            return self.get_nodeattr("streamWidth")
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+
+    def get_outstream_width(self):
+        if self.get_nodeattr("direction") == "out":
+            return self.get_nodeattr("intfWidth")
+        elif self.get_nodeattr("direction") == "in":
+            return self.get_nodeattr("streamWidth")
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+
+    def get_number_output_values(self):
+        oshape = self.get_normal_output_shape()
+        itype_bits = self.get_input_datatype().bitwidth()
+        stream_width = self.get_nodeattr("streamWidth")
+        nelems = np.prod(oshape)
+        nbits = nelems * itype_bits
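+        # e.g. oshape (1, 28, 28, 64) with 8-bit elements gives nbits = 401408,
+        # which at streamWidth=32 amounts to 12544 stream words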
+        assert (
+            nbits % stream_width == 0
+        ), "DMA: total transfer size must be word multiple"
+        ovalues = nbits // stream_width
+        return ovalues
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "dma.h"']
+        self.code_gen_dict["$GLOBALS$"].append('#include "streamtools.h"')
+
+    def defines(self, var):
+        itype_bits = self.get_input_datatype().bitwidth()
+        total_bits = itype_bits * np.prod(self.get_normal_input_shape())
+        assert total_bits % 8 == 0, "DMA transfer size must be a whole number of bytes"
+        total_bytes = total_bits // 8
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define NumBytes1 {}\n#define DataWidth1 {}\n""".format(
+                total_bytes, self.get_nodeattr("intfWidth")
+            )
+        ]
+
+    def get_ap_int_max_w(self):
+        "Return the maximum width of any ap_int used in this module."
+        instream = self.get_instream_width()
+        outstream = self.get_outstream_width()
+        width_lcm = (instream * outstream) // math.gcd(instream, outstream)
+        return width_lcm
+
+    def docompute(self):
+        direction = self.get_nodeattr("direction")
+        mode = self.get_nodeattr("burstMode")
+        if direction == "in":
+            if mode == "wrap":
+                func = "Mem2Stream_Batch_external_wmem"
+            else:
+                func = "Mem2Stream_Batch"
+            dwc_func = "WidthAdjustedOutputStream"
+        elif direction == "out":
+            func = "Stream2Mem_Batch"
+            dwc_func = "WidthAdjustedInputStream"
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+        # define templates for instantiation
+        dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
+        dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);"
+        # do stream infrastructure and instantiations
+        intfw = self.get_nodeattr("intfWidth")
+        strmw = self.get_nodeattr("streamWidth")
+        width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)
+        # we always need two intermediate streams: one of width_lcm and one of
+        # intfw width, because the WidthAdjusted*Stream adapters convert
+        # between widths via their least common multiple
+        dtype_bits = self.get_input_datatype().bitwidth()
+        total_bits = dtype_bits * np.prod(self.get_normal_input_shape())
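+        # e.g. direction "in" with intfWidth=32 and streamWidth=24: the DMA
+        # writes 32b words into dwc_intfw, which widens them to the 96b LCM
+        # stream dwc_lcm, which in turn narrows them to the 24b stream "out"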
+        if direction == "in":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                dwc_inst_template
+                % (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"),
+                dwc_inst_template
+                % (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"),
+                dma_inst_template % ("in0", "dwc_intfw"),
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                dwc_inst_template
+                % (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"),
+                dwc_inst_template
+                % (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"),
+                dma_inst_template % ("dwc_intfw", "out"),
+            ]
+
+    def blackboxfunction(self):
+        packed_ibits = self.get_instream_width()
+        packed_hls_type_in = "ap_uint<%d>" % packed_ibits
+        packed_obits = self.get_outstream_width()
+        packed_hls_type_out = "ap_uint<%d>" % packed_obits
+        direction = self.get_nodeattr("direction")
+        if direction == "in":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)"
+                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
+            ]
+        elif direction == "out":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)"
+                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
+            ]
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE s_axilite port=numReps bundle=control"
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE s_axilite port=return bundle=control"
+        )
+        direction = self.get_nodeattr("direction")
+        intfname = self.get_nodeattr("intfName")
+        if direction == "in":
+            if intfname == "":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=in0"
+                )
+            else:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
+                )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE s_axilite port=in0 bundle=control"
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=out"
+            )
+        elif direction == "out":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=in0"
+            )
+            if intfname == "":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=out"
+                )
+            else:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
+                )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE s_axilite port=out bundle=control"
+            )
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW")
+
+    def execute_node(self, context, graph):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("direction") == "out":
+            intf_names["s_axis"] = ["in0_V_V"]
+            intf_names["m_axis"] = []
+        else:
+            intf_names["s_axis"] = []
+            intf_names["m_axis"] = ["out_V_V"]
+        intf_names["axilite"] = ["s_axi_control"]
+        intf_names["aximm"] = ["m_axi_gmem"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 7591f09d8d0cd1847672fe5aa09616ff1571033d..f61fbf12da889700297006ef2566088d4150c0e4 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -41,6 +41,13 @@ class LabelSelect_Batch(HLSCustomOp):
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
+        odt_name = self.get_nodeattr("outputDataType")
+        if odt_name == "":
+            # if not provided, pick the smallest datatype that can hold Labels-1
+            labels = self.get_nodeattr("Labels")
+            odt = DataType.get_smallest_possible(labels - 1)
+            odt_name = odt.name
+            self.set_nodeattr("outputDataType", odt_name)
 
     def get_nodeattr_types(self):
         my_attrs = {
@@ -49,6 +56,7 @@ class LabelSelect_Batch(HLSCustomOp):
             "K": ("i", True, 0),
             # FINN DataTypes for input
             "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", False, ""),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -69,7 +77,6 @@ class LabelSelect_Batch(HLSCustomOp):
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
         assert nlabels % pe == 0, "PE must divide Labels"
-        assert pe == 1, "LabelSelect currently fails with folding"
         folds = int(nlabels / pe)
         folded_ishape = tuple(vecs + [folds, pe])
         return folded_ishape
@@ -90,7 +97,7 @@ class LabelSelect_Batch(HLSCustomOp):
         exp_ishape = self.get_normal_input_shape()
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
-        assert ishape == exp_ishape, "Unexpect input shape."
+        assert ishape == exp_ishape, "Unexpected input shape."
         # implement tensor with correct shape
         values = np.random.randn(*oshape).astype(np.int64)
         return helper.make_node(
@@ -106,9 +113,8 @@ class LabelSelect_Batch(HLSCustomOp):
         )
 
     def infer_node_datatype(self, model):
-        # currently set to uint32 to be compatible with hlslib
-        # enhancement: consider finding smallest power-of-two int for reduced output bandwidth
-        model.set_tensor_datatype(self.onnx_node.output[0], DataType.UINT32)
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
 
     def verify_node(self):
         info_messages = []
@@ -134,6 +140,7 @@ class LabelSelect_Batch(HLSCustomOp):
             self.get_nodeattr("PE")
             self.get_nodeattr("K")
             self.get_nodeattr("inputDataType")
+            self.get_nodeattr("outputDataType")
             info_messages.append("All necessary attributes exist")
         except Exception:
             info_messages.append(
@@ -150,12 +157,12 @@ class LabelSelect_Batch(HLSCustomOp):
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
-        assert ret.signed() is False, "LabelSelect is currently broken for signed inputs"
         return ret
 
     def get_output_datatype(self):
         """Returns FINN DataType of output."""
-        return DataType.UINT32
+        ret = DataType[self.get_nodeattr("outputDataType")]
+        return ret
 
     def get_instream_width(self):
         """Returns input stream width."""
@@ -260,8 +267,13 @@ class LabelSelect_Batch(HLSCustomOp):
         npy_type = "float"
         npy_in = "%s/input_0.npy" % code_gen_dir
         self.code_gen_dict["$READNPYDATA$"] = []
+
+        # Calling npy2apintstream with reverse_inner = false to have LE packing
+        # as required by the HLS function LabelSelect_Batch.
+        # Note that StreamingDataWidthConverter_Batch also performs LE packing.
+
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);'
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
 
@@ -277,12 +289,13 @@ class LabelSelect_Batch(HLSCustomOp):
     def docompute(self):
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<{}, {}, {}, {}, ap_uint<32>> (in0, out, 1);""".format(
+            """{}<{}, {}, {}, {}, {} > (in0, out, 1);""".format(
                 node.op_type,
                 self.get_nodeattr("Labels"),
                 self.get_nodeattr("PE"),
                 self.get_nodeattr("K"),
                 self.get_input_datatype().get_hls_datatype_str(),
+                self.get_output_datatype().get_hls_datatype_str(),
             )
         ]
 
@@ -316,10 +329,11 @@ class LabelSelect_Batch(HLSCustomOp):
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """void {}(hls::stream<ap_uint<{}*{}>> &in0,
-                hls::stream<ap_uint<32>> &out)""".format(
+                hls::stream<ap_uint<{}> > &out)""".format(
                 self.onnx_node.name,
                 self.get_nodeattr("PE"),
                 self.get_input_datatype().bitwidth(),
+                self.get_output_datatype().bitwidth(),
             )
         ]
 
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a2fa6889ae0ebb94976d50b0fc8362d01a63bea
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -0,0 +1,427 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import numpy as np
+
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.core.datatype import DataType
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class Pool_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib Pool_batch function.
+    Requires ConvolutionInputGenerator(depthwise == 1) to format its input
+
+    Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels)
+    Output shape (BatchSize,OutImgDim,OutImgDim,Channels)
+
+    Notes:
+    # The input shape was chosen to be compatible with im2col (only true when
+    there is no folding).
+
+    # The actual data layout produced by the hlslib kernels is different
+    for depthwise ops.
+     * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
+
+    Channels can be folded using PE (SIMD from the input perspective)
+    """
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "Channels": ("i", True, 0),
+            "PE": ("i", True, 1),
+            "KernelSize": ("i", True, 0),
+            # Function:
+            #  - MaxPool
+            #  - AvgPool (not yet supported here, but available in hlslib)
+            #  - AccPool (not yet supported here, but available in hlslib)
+            "Function": ("s", True, ""),
+            "OutImgDim": ("i", True, 0),
+            # FINN DataTypes for inputs/outputs
+            "InputDataType": ("s", True, ""),
+            "OutputDataType": ("s", True, ""),
+            "AccumBits": ("i", False, 0),
+            "Size": ("i", False, 1),
+            "BatchSize": ("i", False, 1),
+        }
+
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("InputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        fxn = self.get_nodeattr("Function")
+        odt = DataType[self.get_nodeattr("OutputDataType")]
+
+        if fxn == "MaxPool":
+            # Same as input
+            idt = DataType[self.get_nodeattr("InputDataType")]
+            assert odt == idt, "Input datatype must equal output datatype for MaxPool"
+        elif fxn == "QuantAvgPool":
+            idt = DataType[self.get_nodeattr("InputDataType")]
+            assert (
+                idt.signed() == odt.signed()
+            ), """QuantAvgPool: Can't mix signed
+            and unsigned datatypes"""
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+        return odt
+
+    def get_normal_input_shape(self):
+        ifm_ch = self.get_nodeattr("Channels")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        k = self.get_nodeattr("KernelSize")
+        ishape = (batch_size, odim, odim, k * k * ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        assert ifm_ch % pe == 0, "PE must divide input channels"
+        fold = int(normal_ishape[-1] / pe)
+        folded_ishape = normal_ishape[:-1] + [fold, pe]
+        return tuple(folded_ishape)
+
+    def get_normal_output_shape(self):
+        ofm_ch = self.get_nodeattr("Channels")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        oshape = (batch_size, odim, odim, ofm_ch)
+        return oshape
+
+    def get_folded_output_shape(self):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        assert ifm_ch % pe == 0, "PE must divide input channels"
+        fold = int(ifm_ch / pe)
+        folded_oshape = normal_oshape[:-1] + [fold, pe]
+        return tuple(folded_oshape)
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[1:-1])
+
+    def get_exp_cycles(self):
+        # (Channels * kernel * kernel) / PE * odim * odim * batch_size
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        k = self.get_nodeattr("KernelSize")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        exp_cycles = ((ifm_ch * k * k) / pe) * odim * odim * batch_size
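+        # e.g. Channels=64, PE=4, KernelSize=2, OutImgDim=14, BatchSize=1:
+        # ((64 * 2 * 2) / 4) * 14 * 14 * 1 = 12544 cycles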
+        return int(exp_cycles)
+
+    def get_instream_width(self):
+        dt_bits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = int(dt_bits * pe)
+        return in_width
+
+    def get_outstream_width(self):
+        dt_bits = self.get_output_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        out_width = int(dt_bits * pe)
+        return out_width
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        info_messages = []
+
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify the number of inputs
+        if len(self.onnx_node.input) == 1:
+            info_messages.append("The number of inputs is correct")
+        else:
+            info_messages.append("""Pool_Batch needs 1 data input""")
+
+        # check supported function
+        fxn = self.get_nodeattr("Function")
+        if fxn in ["MaxPool", "QuantAvgPool"]:
+            info_messages.append(
+                "Attribute Function contains a supported pool function"
+            )
+        else:
+            info_messages.append(
+                "Attribute Function contains an unsupported pool function"
+            )
+        return info_messages
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "pool.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("Channels")
+        self.code_gen_dict["$DEFINES$"] += ["#define Channels {}".format(ifm_ch)]
+
+        pe = self.get_nodeattr("PE")
+        self.code_gen_dict["$DEFINES$"] += ["#define PE {}".format(pe)]
+
+        k = self.get_nodeattr("KernelSize")
+        self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k)]
+
+        odim = self.get_nodeattr("OutImgDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+
+        numReps = self.get_nodeattr("BatchSize")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(numReps)]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        idt = self.get_input_datatype()
+        i_hls_dt = idt.get_hls_datatype_str()
+        odt = self.get_output_datatype()
+        o_hls_dt = odt.get_hls_datatype_str()
+        size = self.get_nodeattr("Size")
+        accum_bits = self.get_nodeattr("AccumBits")
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            self.code_gen_dict["$DOCOMPUTE$"] += [
+                "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt)
+            ]
+        elif fxn == "QuantAvgPool":
+            if idt.signed():
+                act_hls_dt = "ap_int<{}>".format(accum_bits)
+            else:
+                act_hls_dt = "ap_uint<{}>".format(accum_bits)
+            self.code_gen_dict["$DOCOMPUTE$"] += [
+                "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format(
+                    act_hls_dt, o_hls_dt, size
+                )
+            ]
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+        self.code_gen_dict["$DOCOMPUTE$"] += [
+            """Pool_batch<Channels, PE, KernelSize,Slice<{} >, Slice< {} > >
+        (in0,out, pool_fxn, OFMDim*OFMDim*numReps);""".format(
+                i_hls_dt, o_hls_dt
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_ibits = self.get_instream_width()
+        packed_in_hls_type = "ap_uint<%d>" % packed_ibits
+
+        packed_obits = self.get_outstream_width()
+        packed_out_hls_type = "ap_uint<%d>" % packed_obits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        folded_ishape = self.get_folded_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (batch_size,odim,odim,k*k*ifm_ch)."""
+
+        export_idt = self.get_input_datatype()
+        reshaped_input = inp.reshape(folded_ishape)
+
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output
+        shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch)."""
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 9b73ba1e100aa83fd19aa8799195c99891fca3fd..72aa322e0e44a6f4a5c11025d94bdfeb820338a3 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -87,7 +87,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "numInputVectors": ("ints", False, [1]),
             # memory mode for the FC weights
             # const -- embedded weights, default, long compile/synth times
-            # decoupled -- streaming weights
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
             "mem_mode": ("s", False, "const"),
             # FPGA resource type for memories in decoupled mode
             # auto -- let Vivado decide
@@ -105,14 +106,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         node = self.onnx_node
         # set top name depending on mem_mode
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
+        if mem_mode == "const" or mem_mode == "external":
             prefixed_top_name = "%s_%s" % (node.name, node.name)
         elif mem_mode == "decoupled":
             prefixed_top_name = "%s_memstream" % (node.name)
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         return prefixed_top_name
 
@@ -240,11 +241,21 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
-        D_in = self.get_instream_width()
-        D_out = self.get_outstream_width()
+        D_in = self.get_nodeattr("MW")
+        D_out = self.get_nodeattr("MH")
         omega = (D_in * D_out) / (Q * P)
         return P * (math.ceil(omega / 512)) * (math.ceil((Q * W) / 36))
 
+    def bram_efficiency_estimation(self):
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        D_in = self.get_nodeattr("MW")
+        D_out = self.get_nodeattr("MH")
+        bram16_est = self.bram_estimation()
+        wbits = W * D_in * D_out
+        bram16_est_capacity = bram16_est * 36 * 512
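+        # e.g. a 1-bit 128x128 weight matrix stores wbits = 16384; if the
+        # estimator returns a single BRAM (capacity 36 * 512 = 18432 bits),
+        # the efficiency is 16384 / 18432 ~= 0.89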
+        return wbits / bram16_est_capacity
+
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -267,6 +278,17 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         return c0 + c1 * (P * Q) * (W * A)
 
+    def get_exp_cycles(self):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        num_inp_vec = self.get_nodeattr("numInputVectors")
+        mh = self.get_nodeattr("MH")
+        mw = self.get_nodeattr("MW")
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
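+        # e.g. MH=MW=128 with PE=SIMD=8 and numInputVectors=[1]:
+        # (128 / 8) * (128 / 8) * 1 = 256 cycles per inference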
+        return int(exp_cycles)
+
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
@@ -290,12 +312,18 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         return out_width
 
     def get_weightstream_width(self):
-        """Returns weight stream width. Used in decoupled mode."""
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        wp = self.get_weight_datatype().bitwidth()
-        w_width = pe * simd * wp
-        return w_width
+        """Returns weight stream width. Used only in decoupled mode."""
+        if (
+            self.get_nodeattr("mem_mode") == "decoupled"
+            or self.get_nodeattr("mem_mode") == "external"
+        ):
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            wp = self.get_weight_datatype().bitwidth()
+            w_width = pe * simd * wp
+            return w_width
+        else:
+            return 0
 
     def get_weightstream_width_padded(self):
         """Returns weight stream width padded to a multiple of 8. This is required
@@ -471,7 +499,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def generate_params(self, model, path):
         mem_mode = self.get_nodeattr("mem_mode")
-        # weights
+        code_gen_dir = path
+        # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
@@ -480,7 +509,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # so use it as such for weight generation
         if self.get_weight_datatype() == DataType.BIPOLAR:
             export_wdt = DataType.BINARY
-        code_gen_dir = path
 
         if mem_mode == "const":
             """Saves weights into params.h"""
@@ -510,7 +538,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             f_weights.write(weight_hls_code)
             f_weights.close()
 
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             """Saves weights in corresponding file format for cppsim or rtlsim"""
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
@@ -539,37 +567,37 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped
             )
 
-            """Saves weights into .dat file"""
-            # convert weight values into hexstring
-            weight_width = self.get_weightstream_width()
-            # pad to nearest 4 bits to get hex strings
-            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
-            )
-            weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
-            factor = math.ceil(weight_stream_len / 1024)
-            # add zeroes to pad out file to 1024 entries
-            weight_stream = weight_tensor_pe_flipped.flatten()
-            pad_amt = (factor * 1024) - weight_stream_len
-            weight_stream = np.pad(
-                weight_stream, (0, pad_amt), mode="constant", constant_values="0"
-            )
-            weight_stream = weight_stream.copy()
-            i = 0
-            j = 0
-            for val in weight_stream:
-                if i == 1024:
-                    i = 0
-                    j += 1
-                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
-                    f.write(val + "\n")
-                i += 1
-
+            if mem_mode == "decoupled":
+                """Saves weights into .dat file"""
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
+                factor = math.ceil(weight_stream_len / 1024)
+                # add zeroes to pad out file to 1024 entries
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                pad_amt = (factor * 1024) - weight_stream_len
+                weight_stream = np.pad(
+                    weight_stream, (0, pad_amt), mode="constant", constant_values="0"
+                )
+                weight_stream = weight_stream.copy()
+                i = 0
+                j = 0
+                for val in weight_stream:
+                    if i == 1024:
+                        i = 0
+                        j += 1
+                    with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
+                        f.write(val + "\n")
+                    i += 1
         else:
             raise Exception(
-                """Please set mem_mode to "const"i or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
         # save thresholds in thresh.h
@@ -589,6 +617,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
                 if inp_is_bipolar and wt_is_bipolar:
                     tdt = DataType.UINT32
+                assert np.vectorize(tdt.allowed)(
+                    threshold_tensor
+                ).all(), "Thresholds are not int"
                 thresholds_hls_code = numpy_to_hls_code(
                     threshold_tensor, tdt, "thresholds", False, True
                 )
@@ -617,6 +648,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
         # TODO ensure codegen dir exists
@@ -685,7 +717,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+            if mem_mode == "external":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType.BIPOLAR:
+                    export_wdt = DataType.BINARY
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
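+                # rtlsim_multi_io drives all streams named in io_dict["inputs"]
+                # in parallel (here the weight stream alongside in0) and
+                # collects tokens appearing on io_dict["outputs"] streams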
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
@@ -716,12 +765,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         if mem_mode == "const":
             # self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
             pass
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
@@ -744,7 +793,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 numReps,
             )
         ]
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             self.code_gen_dict["$DEFINES$"].append(
                 "#define WP1 {}\n".format(wdt.bitwidth())
@@ -770,7 +819,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         )
 
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             elem_bits = wdt.bitwidth()
             packed_bits = self.get_weightstream_width()
@@ -794,7 +843,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$STREAMDECLARATIONS$"].append(
                 'hls::stream<ap_uint<{}>> weights ("weights");'.format(
                     self.get_weightstream_width()
@@ -822,7 +871,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_nodeattr("resType"),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             if wdt == DataType.BIPOLAR:
                 export_wdt = DataType.BINARY
@@ -843,8 +892,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
     def dataoutstrm(self):
@@ -890,7 +939,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_outstream_width(),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 """void {}(
                     hls::stream<ap_uint<{}>> &in0,
@@ -939,7 +988,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     "complete dim=1"
                 )
             )
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$PRAGMAS$"].append(
                 "#pragma HLS INTERFACE axis port=weights"
             )
@@ -949,8 +998,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or external,
+                currently no other parameter value is supported!"""
             )
 
         # the threshold tensor is acc_type [PE][TMEM][N_THRES]
@@ -1079,3 +1128,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             self.set_nodeattr("ip_vlnv", vlnv)
             self.code_gen_dict.clear()
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "external":
+            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 2344e12f7e87634c189563f9cde7b1c861a3606e..4c772358648f402467cee628afe410d7bce83ede 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -95,6 +95,12 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
+    def get_exp_cycles(self):
+        # derived from StreamingMaxPool_Batch loop nest
+        k = self.get_nodeattr("PoolDim")
+        ifm_dim = self.get_nodeattr("ImgDim")
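+        # e.g. ImgDim=28, PoolDim=2: 28 * (28 + 28 / 2) = 1176 cycles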
+        return int(ifm_dim * (ifm_dim + (ifm_dim / k)))
+
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 1a8216f64bf71b7fb9f1f8becf4732970b5bf451..319731df70d5bd1cb80d42932f08acdcec80c074 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -99,6 +99,7 @@ set_top $config_toplevelfxn
 open_solution sol1
 set_part $config_proj_part
 
+config_compile -ignore_long_run_time -disable_unroll_code_size_check
 config_interface -m_axi_addr64
 config_rtl -auto_prefix
 $EXTRA_DIRECTIVES$
@@ -343,6 +344,7 @@ set_property supported_families { \
   virtex7 Production \
   virtexu Production \
   virtexuplus Production \
+  virtexuplusHBM Production \
   zynq Production \
   zynquplus Production \
   aartix7 Production \
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index fa33c70218fab16f106da45e296f0d59ae4ea606..379ebd92d86d54c6bc621c7f89b01eacba2b5d3f 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -215,6 +215,10 @@ class Thresholding_Batch(HLSCustomOp):
         nf = np.prod(self.get_folded_output_shape()[:-1])
         return nf
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
     def get_template_param_values(self):
         """Returns the template parameter values according to input, output and weight
         data types."""
@@ -280,6 +284,9 @@ class Thresholding_Batch(HLSCustomOp):
 
         threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
         tdt = DataType.INT32
+        assert np.vectorize(tdt.allowed)(
+            threshold_tensor
+        ).all(), "Thresholds are not int"
         thresholds_hls_code = numpy_to_hls_code(
             threshold_tensor, tdt, "thresholds", False, True
         )
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 25ea05e3607a52731ae1b64de421837bf137ee2b..38a139c279701ae7892f41b63c3c717a3e736691 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -30,20 +30,31 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 
 
 class TLastMarker(HLSCustomOp):
-    """Class that corresponds to the TLastMarker node that needs to be
-    inserted at the end of the model for rtlsim with stitched IP.
-    It marks the end of the current image/input sample."""
+    """Node that adds/removes AXI stream TLAST signals where needed. Its behavior
+    is transparent in node-by-node execution, only visible in IP-stitched rtlsim or
+    actual hardware.
+    This node  may be needed at the end of the network to signal a DMA write
+    (needed by the FINN PYNQ shell) or at the beginning to remove the end-of-burst
+    from DMA read."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
         my_attrs = {
+            # number of (static) iterations until TLAST=1 is generated for Direction=out
             "NumIters": ("i", True, 0),
+            # whether a static or dynamic (read from AXI lite) iteration count is used
+            "DynIters": ("i", False, 1),
+            # direction: whether to insert or remove TLAST
+            "Direction": ("s", False, "out"),
             # width of input-output data streams, in bits
             "StreamWidth": ("i", True, 0),
             # width of individual element in stream, in bits
             "ElemWidth": ("i", True, 0),
+            # Protocol: external or internal
+            # Vitis docs recommend using qdma_axis for external, ap_axiu for internal
+            "Protocol": ("s", False, "external"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -76,12 +87,33 @@ class TLastMarker(HLSCustomOp):
 
     def defines(self, var):
         stream_width = self.get_nodeattr("StreamWidth")
+        direction = self.get_nodeattr("Direction")
+        protocol = self.get_nodeattr("Protocol")
         # output stream must have TLAST, so we use this stream data type:
         # qdma_axis<stream_data_width,0,0,0 >
-        out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+        if direction == "out":
+            if protocol == "external":
+                out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+            elif protocol == "internal":
+                out_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width
+            else:
+                raise Exception("Unrecognized Protocol in TLastMarker")
+            in_stream_dtype = "ap_uint<%d>" % stream_width
+        elif direction == "in":
+            out_stream_dtype = "ap_uint<%d>" % stream_width
+            if protocol == "external":
+                in_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+            elif protocol == "internal":
+                in_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width
+            else:
+                raise Exception("Unrecognized Protocol in TLastMarker")
+        else:
+            raise Exception("Unrecognized Direction in TLastMarker")
+
         self.code_gen_dict["$DEFINES$"] = [
             "#define StreamWidth %d" % stream_width,
             "#define OutDType %s" % out_stream_dtype,
+            "#define InDType %s" % in_stream_dtype,
             "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"),
         ]
 
@@ -89,27 +121,60 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$READNPYDATA$"] = []
 
     def docompute(self):
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            "unsigned int n = 1;",
-            "OutDType t;",
-            "t.set_keep(-1);",
-            "io_section: { // start of cycle accurate region",
-            "#pragma HLS protocol fixed",
-            "// do a first read from stream before we decide on numIters",
-            "// giving software a chance to set up the numIters prior to startup",
-            "t.set_data(in0.read());",
-            "n = (numIters == 0 ? NumItersPerImg : numIters);",
-            "t.set_last(n==1);",
-            "out.write(t);",
-            "} // end of cycle accurate region",
-            "// do one less iteration than spec since we already did one",
-            "for(unsigned int i=1; i<n; i++) {",
-            "#pragma HLS PIPELINE II=1",
-            "t.set_data(in0.read());",
-            "t.set_last(i==(n-1));",
-            "out.write(t);",
-            "}",
-        ]
+        dyn_iters = self.get_nodeattr("DynIters")
+        direction = self.get_nodeattr("Direction")
+        use_qdma_axis = self.get_nodeattr("Protocol") == "external"
+        if direction == "in":
+            # read from input and just pass data along; ignore tlast
+            # no dyn iters on input; it doesn't make sense there
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "for(unsigned int i=0; i<NumItersPerImg; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "out.write(in0.read().get_data());"
+                if use_qdma_axis
+                else "out.write(in0.read().data);",
+                "}",
+            ]
+
+        elif dyn_iters == 1:
+            # output, with dynamic iteration counts
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "unsigned int n = 1;",
+                "OutDType t;",
+                "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;",
+                "io_section: { // start of cycle accurate region",
+                "#pragma HLS protocol fixed",
+                "// do a first read from stream before we decide on numIters",
+                "// giving software a chance to set up the numIters prior to startup",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "n = (numIters == 0 ? NumItersPerImg : numIters);",
+                "t.set_last(n==1);" if use_qdma_axis else "t.last = (n==1);",
+                "out.write(t);",
+                "} // end of cycle accurate region",
+                "// do one less iteration than spec since we already did one",
+                "for(unsigned int i=1; i<n; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "t.set_last(i==(n-1));" if use_qdma_axis else "t.last = (i==(n-1));",
+                "out.write(t);",
+                "}",
+            ]
+
+        else:
+            # output, with static iteration counts
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "unsigned int n = 1;",
+                "OutDType t;",
+                "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;",
+                "for(unsigned int i=0; i<NumItersPerImg; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "t.set_last(i==(NumItersPerImg-1));"
+                if use_qdma_axis
+                else "t.last = (i==(NumItersPerImg-1));",
+                "out.write(t);",
+                "}",
+            ]
 
     def dataoutstrm(self):
         self.code_gen_dict["$DATAOUTSTREAM$"] = []
@@ -118,18 +183,30 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void %s(hls::stream<ap_uint<StreamWidth> > &in0,
-                hls::stream<OutDType> &out, unsigned int numIters)"""
-            % self.onnx_node.name
-        ]
+        dyn_iters = self.get_nodeattr("DynIters")
+
+        if dyn_iters == 1:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void %s(hls::stream<InDType> &in0,
+                    hls::stream<OutDType> &out, unsigned int numIters)"""
+                % self.onnx_node.name
+            ]
+        else:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void %s(hls::stream<InDType> &in0, hls::stream<OutDType> &out)"""
+                % self.onnx_node.name
+            ]
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
-        )
+
+        dyn_iters = self.get_nodeattr("DynIters")
+        if dyn_iters == 1:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
+            )
+
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
@@ -158,8 +235,20 @@ class TLastMarker(HLSCustomOp):
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+            'hls::stream<InDType> in0 ("in0");'
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<OutDType> out ("out");'
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("Direction") == "in":
+            intf_names["s_axis"] = ["in0"]
+            intf_names["m_axis"] = ["out_V_V"]
+        else:
+            intf_names["s_axis"] = ["in0_V_V"]
+            intf_names["m_axis"] = ["out_r"]
+        if self.get_nodeattr("DynIters") == 1:
+            intf_names["axilite"] = ["s_axi_control"]
+        return intf_names
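+
+    # e.g. with Direction="out" and DynIters=1 the dict above would contain,
+    # in addition to the keys inherited from the base class:
+    #   {"s_axis": ["in0_V_V"], "m_axis": ["out_r"], "axilite": ["s_axi_control"]}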
diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..942e4b25700d0c52c1bc5bcd81614a058342f178
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -0,0 +1,506 @@
+import os
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    rtlsim_output_to_npy,
+)
+
+
+class Vector_Vector_Activate_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function"""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "PE": ("i", True, 0),
+            "Dim": ("i", True, 0),
+            "Channels": ("i", True, 0),
+            "Kernel": ("i", True, 0),
+            "resType": ("s", True, ""),
+            "ActVal": ("i", False, 0),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # no-activation mode (produce accumulators)
+            "noActivation": ("i", False, 0),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
+        ch = self.get_nodeattr("Channels")
+        k = self.get_nodeattr("Kernel")
+        pe = self.get_nodeattr("PE")
+        wmem = k * k * ch // pe
+        return wmem
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM."""
+        if self.get_nodeattr("noActivation") == 1:
+            return 0
+        else:
+            ch = self.get_nodeattr("Channels")
+            pe = self.get_nodeattr("PE")
+            return ch // pe
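+
+    # worked example (values assumed for illustration): Channels=64, Kernel=3,
+    # PE=16 gives WMEM = 3*3*64/16 = 36; with noActivation=0, TMEM = 64/16 = 4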
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt_name = self.get_input_datatype().name
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        assert exp_idt_name == idt_name, "Bad input DataType for VVAU node"
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_weight_datatype(self):
+        """Returns FINN DataType of weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
+        i_bits = self.get_input_datatype().bitwidth()
+        in_width = i_bits * self.get_nodeattr("Channels")
+        return in_width
+
+    def get_outstream_width(self):
+        o_bits = self.get_output_datatype().bitwidth()
+        out_width = o_bits * self.get_nodeattr("PE")
+        return out_width
+
+    def get_folded_input_shape(self):
+        k = self.get_nodeattr("Kernel")
+        sf = k * k
+        dim = self.get_nodeattr("Dim")
+        ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        nf = ch // pe
+        folded_input_shape = tuple([1, dim, dim, sf * nf, pe])
+        return folded_input_shape
+
+    def get_folded_output_shape(self):
+        ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        nf = ch // pe
+        dim = self.get_nodeattr("Dim")
+        folded_output_shape = tuple([1, dim, dim, nf, pe])
+        return folded_output_shape
+
+    def get_normal_input_shape(self):
+        dim = self.get_nodeattr("Dim")
+        ch = self.get_nodeattr("Channels")
+        k = self.get_nodeattr("Kernel")
+        normal_input_shape = tuple([1, dim, dim, k * k * ch])
+        return normal_input_shape
+
+    def get_normal_output_shape(self):
+        ch = self.get_nodeattr("Channels")
+        dim = self.get_nodeattr("Dim")
+        normal_output_shape = tuple([1, dim, dim, ch])
+        return normal_output_shape
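+
+    # shape example (illustrative values): Dim=8, Channels=32, Kernel=3, PE=4
+    # -> normal input (1, 8, 8, 288), folded input (1, 8, 8, 72, 4),
+    #    normal output (1, 8, 8, 32), folded output (1, 8, 8, 8, 4)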
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_exp_cycles(self):
+        pe = self.get_nodeattr("PE")
+        ch = self.get_nodeattr("Channels")
+        dim = self.get_nodeattr("Dim")
+        k = self.get_nodeattr("Kernel")
+        # FINN currently supports a batch size of 1 for the VVAU
+        batch_size = 1
+        # since mmv != 1 is not supported yet, set mmv to 1 for now
+        mmv = 1
+        exp_cycles = ((ch * k * k) / pe) * batch_size * (dim * dim) / mmv
+        return int(exp_cycles)
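+
+    # worked example (illustrative values): Channels=32, Kernel=3, PE=4, Dim=8
+    # gives exp_cycles = (32*3*3/4) * 1 * (8*8) / 1 = 4608 cycles per sample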
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
+        wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+        # fill in TSrcI and TWeightI
+        # TODO handle bipolar inputs
+        if inp_is_bipolar or wt_is_bipolar:
+            raise Exception("VVAU node doesn't support bipolar values yet.")
+        else:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Identity"
+
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
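+
+    # e.g. for inputDataType=UINT4 and outputDataType=INT4 (illustrative
+    # values) this yields TSrcI="Slice<ap_uint<4>>", TWeightI="Identity",
+    # TDstI="Slice<ap_int<4>>"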
+
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        pe = self.get_nodeattr("PE")
+        ch = self.get_nodeattr("Channels")
+        k = self.get_nodeattr("Kernel")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ch,
+            1,
+            k,
+            k,
+        ), """Weights matrix doesn't
+        have expected shape (channels, 1, kernel_size, kernel_size)"""
+        ret = orig_weight_matrix
+        ret = ret.reshape(ch, k * k)
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        ret = ret.reshape(1, pe, wmem, 1)
+        return ret
+
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        tmem = self.calc_tmem()
+        assert ch % pe == 0, "Requirement Channels divisible by PE is violated."
+        assert (
+            orig_thres_matrix.ndim == 2
+        ), """Threshold matrix dimension is
+        not as expected (2)."""
+        n_thres_steps = orig_thres_matrix.shape[1]
+        ret = orig_thres_matrix
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+        assert (
+            ret.shape[2] == n_thres_steps
+        ), """Third dimension after distribution of the
+        rows between PEs is not as expected (n_thres_steps)"""
+        return ret.reshape(1, pe, tmem, n_thres_steps)
+
+    def generate_params(self, model, path):
+        # weights
+        weights = model.get_initializer(self.onnx_node.input[1])
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        wdt = self.get_weight_datatype()
+        code_gen_dir = path
+
+        """Saves weights into params.h"""
+        weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True)
+        # write weights into params.h
+        f_weights = open("{}/params.h".format(code_gen_dir), "w")
+
+        if wdt.bitwidth() != 1:
+            f_weights.write(
+                "const FixedPointWeights<1,{},{},{}> weights = ".format(
+                    wdt.get_hls_datatype_str(),
+                    self.get_nodeattr("PE"),
+                    self.calc_wmem(),
+                )
+            )
+        else:
+            f_weights.write(
+                "const BinaryWeights<1,{},{}> weights = ".format(
+                    self.get_nodeattr("PE"), self.calc_wmem()
+                )
+            )
+        f_weights.write(weight_hls_code)
+        f_weights.close()
+
+        # save thresholds in thresh.h
+        if len(self.onnx_node.input) > 2:
+            thresholds = model.get_initializer(self.onnx_node.input[2])
+            if thresholds is not None:
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                tdt = DataType.INT32
+                assert np.vectorize(tdt.allowed)(
+                    threshold_tensor
+                ).all(), "Thresholds are not int"
+                thresholds_hls_code = numpy_to_hls_code(
+                    threshold_tensor, tdt, "thresholds", False, True
+                )
+                # write thresholds into thresh.h
+                f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
+                tdt_hls = tdt.get_hls_datatype_str()
+                odt = self.get_output_datatype()
+                odt_hls = odt.get_hls_datatype_str()
+                f_thresh.write(
+                    "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
+                    = ".format(
+                        self.calc_tmem(),
+                        self.get_nodeattr("PE"),
+                        threshold_tensor.shape[-1],
+                        tdt_hls,
+                        odt_hls,
+                        self.get_nodeattr("ActVal"),
+                        "std::less_equal<%s>" % tdt_hls,
+                    )
+                )
+                f_thresh.write(thresholds_hls_code)
+                f_thresh.close()
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create an npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input,
+            # the second input is the weights,
+            # and the third input is the thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception(
+                    "Unexpected input found for Vector_Vector_Activate_Unit"
+                )
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == self.get_folded_output_shape()
+            ), """Output shape is not as expected"""
+            # reshape output to have expected shape
+            oshape = self.get_normal_output_shape()
+            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            idt = self.get_input_datatype()
+            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits)
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+        if self.calc_tmem() != 0:
+            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+    def defines(self, var):
+        dim = self.get_nodeattr("Dim")
+        numReps = 1 * dim * dim
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define Channels1 {}\n #define Kernel1 {}\n
+            #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
+                self.get_nodeattr("Channels"),
+                self.get_nodeattr("Kernel"),
+                self.get_nodeattr("PE"),
+                numReps,
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        tmpl_args = self.get_template_param_values()
+        if self.calc_tmem() == 0:
+            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
+            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
+        else:
+            threshs = "threshs"
+        node = self.onnx_node
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<Channels1, Kernel1, SIMD1, PE1, 1, {}, {}, {}>
+            (in0, out, weights, {}, numReps, {});""".format(
+                node.op_type,
+                tmpl_args["TSrcI"],
+                tmpl_args["TDstI"],
+                tmpl_args["TWeightI"],
+                threshs,
+                self.get_nodeattr("resType"),
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+            hls::stream<ap_uint<{}>> &out
+            )""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        in_fifo_depth = self.get_nodeattr("inFIFODepth")
+        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # insert depth pragmas only if specified
+        if in_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+            )
+        if out_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
+            )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+        self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+        # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
+        # partition for parallel access along the PE dimension (dim 1)
+        self.code_gen_dict["$PRAGMAS$"].append(
+            ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+        )
+        if self.calc_tmem() != 0:
+            # TODO find a better way of checking for no pregenerated thresholds
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=1"
+                )
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=3"
+                )
+            )
diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py
index 82a6b140f7af1be4e5c0f429d077b99c7865383e..8ed0041704d421dab587f08bcbcd9e739e8434e9 100644
--- a/src/finn/custom_op/im2col.py
+++ b/src/finn/custom_op/im2col.py
@@ -80,6 +80,8 @@ class Im2Col(CustomOp):
             "input_shape": ("s", True, ""),
             "pad_amount": ("i", False, 0),
             "pad_value": ("i", False, 0),
+            # depthwise: if != 0, infer ConvolutionInputGenerator with depthwise == 1
+            "depthwise": ("i", False, 0),
         }
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..28d01069264d883f3afc400808470f5f303be799
--- /dev/null
+++ b/src/finn/custom_op/quantavgpool2d.py
@@ -0,0 +1,136 @@
+import numpy as np
+from onnx import TensorProto, helper
+import onnxruntime as rt
+
+from finn.custom_op import CustomOp
+from finn.core.datatype import DataType
+from finn.custom_op.maxpoolnhwc import compute_pool_output_dim
+
+
+class QuantAvgPool2d(CustomOp):
+    """Class that corresponds to the quantized average pooling
+    layer from brevitas"""
+
+    def get_nodeattr_types(self):
+        return {
+            "stride": ("i", True, 1),
+            "kernel": ("i", True, 1),
+            "ibits": ("i", True, 1),
+            "obits": ("i", True, 1),
+            # determines if values are signed (set to "1") or unsigned ("0")
+            "signed": ("i", True, 0),
+            # data layout attribute can be set to "NCHW" or "NHWC"
+            "data_layout": ("s", False, "NCHW"),
+        }
+
+    def make_shape_compatible_op(self, model):
+        node = self.onnx_node
+        k = self.get_nodeattr("kernel")
+        s = self.get_nodeattr("stride")
+        data_layout = self.get_nodeattr("data_layout")
+        if data_layout == "NCHW":
+            return helper.make_node(
+                "AveragePool",
+                inputs=[node.input[0]],
+                outputs=[node.output[0]],
+                kernel_shape=[k, k],
+                strides=[s, s],
+            )
+        elif data_layout == "NHWC":
+            iname = node.input[0]
+            ishape = model.get_tensor_shape(iname)
+            (n, hi, wi, c) = ishape
+            ho = compute_pool_output_dim(hi, k, s)
+            wo = compute_pool_output_dim(wi, k, s)
+            oshape = (n, ho, wo, c)
+            # implement tensor with correct shape
+            values = np.random.randn(*oshape).astype(np.float32)
+            return helper.make_node(
+                "Constant",
+                inputs=[],
+                outputs=[node.output[0]],
+                value=helper.make_tensor(
+                    name="const_tensor",
+                    data_type=TensorProto.FLOAT,
+                    dims=values.shape,
+                    vals=values.flatten().astype(float),
+                ),
+            )
+
+        else:
+            raise Exception(
+                """Datalayout for QuantAvgPool2d is set to an invalid value.
+                    Has to be set to "NCHW" or "NHWC"."""
+            )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        bw = self.get_nodeattr("obits")
+        if bw in [2, 4, 8, 16, 32]:
+            if self.get_nodeattr("signed") == 0:
+                dtype = DataType["UINT%d" % bw]
+            else:
+                dtype = DataType["INT%d" % bw]
+        else:
+            raise Exception("Unsupported output datatype for QuantAvgPool2d")
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def get_accum_size(self):
+        ibits = self.get_nodeattr("ibits")
+        k = self.get_nodeattr("kernel")
+        max_value = 2 ** ibits - 1
+        max_value = max_value * k * k
+        max_bit_width = int(max_value).bit_length()
+        return max_bit_width
+
+    def get_shifts(self):
+        shift_bits = self.get_accum_size() - self.get_nodeattr("obits")
+        shift_bits = shift_bits if shift_bits >= 0 else 0
+        return shift_bits
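+
+    # worked example (illustrative values): ibits=4, kernel=2 -> max
+    # accumulator value 15*2*2 = 60, so get_accum_size() = 6 bits; with
+    # obits=4 the result is right-shifted by 6 - 4 = 2 bits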
+
+    def execute_node(self, context, graph):
+        # create a standard average pooling node to help calculate the result
+        node = self.onnx_node
+        k = self.get_nodeattr("kernel")
+        s = self.get_nodeattr("stride")
+        inp_values = context[node.input[0]]
+        oshape = context[node.output[0]].shape
+        if self.get_nodeattr("data_layout") == "NHWC":
+            inp_values = inp_values.transpose(0, 3, 1, 2)
+            oshape = (context[node.output[0]]).transpose(0, 3, 1, 2).shape
+        ishape = inp_values.shape
+        inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
+        outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape)
+        node_avgpool = helper.make_node(
+            "AveragePool",
+            inputs=[node.input[0]],
+            outputs=[node.output[0]],
+            kernel_shape=[k, k],
+            strides=[s, s],
+        )
+        graph_avgpool = helper.make_graph(
+            nodes=[node_avgpool],
+            name="single-avgpool-exec",
+            inputs=[inp],
+            outputs=[outp],
+        )
+        model_avgpool = helper.make_model(graph_avgpool)
+        idict = {node.input[0]: inp_values}
+        sess = rt.InferenceSession(model_avgpool.SerializeToString())
+        result_temp = sess.run(None, idict)
+        # remove scaling introduced by average
+        result_temp = result_temp[0] * (k * k)
+        result = np.right_shift(result_temp.astype(int), self.get_shifts())
+        if self.get_nodeattr("data_layout") == "NHWC":
+            result = result.transpose(0, 2, 3, 1)
+        context[node.output[0]] = result.astype(np.float32)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+        return info_messages
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 614a3d7ffd70d0b102bad2b76177a2d3b32765c7..0cc0e53eaebd94d5e2cd0e030bc107da098e4931 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -31,6 +31,7 @@
 from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
     ConvolutionInputGenerator,
 )
+from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
 from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
@@ -44,16 +45,24 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch
+from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
+from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
+from finn.custom_op.quantavgpool2d import QuantAvgPool2d
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.vector_vector_activate_batch import (
+    Vector_Vector_Activate_Batch,
+)
+from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
+from finn.custom_op.fpgadataflow.iodma import IODMA
 
 # create a mapping of all known CustomOp names and classes
 custom_op = {}
 
 custom_op["MultiThreshold"] = MultiThreshold
+custom_op["DownSampler"] = DownSampler
 custom_op["XnorPopcountMatMul"] = XnorPopcountMatMul
 custom_op["Im2Col"] = Im2Col
 custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
@@ -65,11 +74,16 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
+custom_op["Pool_Batch"] = Pool_Batch
 custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
+custom_op["QuantAvgPool2d"] = QuantAvgPool2d
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
+custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch
+custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
+custom_op["IODMA"] = IODMA
 
 
 def getCustomOp(node):
diff --git a/src/finn/custom_op/streamingdataflowpartition.py b/src/finn/custom_op/streamingdataflowpartition.py
index b63326d676f4ded5ec1dd62f5cc7f02d7acb82ad..31cd38fea3c5a9e88084c3332d46aebdb065f800 100644
--- a/src/finn/custom_op/streamingdataflowpartition.py
+++ b/src/finn/custom_op/streamingdataflowpartition.py
@@ -36,7 +36,12 @@ class StreamingDataflowPartition(CustomOp):
     bitfile by itself."""
 
     def get_nodeattr_types(self):
-        return {"model": ("s", True, "")}
+        return {
+            "model": ("s", True, ""),
+            "res_estimate": ("s", False, ""),
+            "res_hls": ("s", False, ""),
+            "res_synth": ("s", False, ""),
+        }
 
     def make_shape_compatible_op(self, model):
         pass
@@ -83,7 +88,7 @@ class StreamingDataflowPartition(CustomOp):
             )
 
         # verify the number of inputs
-        if len(self.onnx_node.input) == 1:
+        if len(self.onnx_node.input) >= 1:
             info_messages.append("The number of inputs is correct")
         else:
             info_messages.append("StreamingDataflowPartition needs 1 data input")
diff --git a/src/finn/transformation/bipolar_to_xnor.py b/src/finn/transformation/bipolar_to_xnor.py
index 8b65cfee17edd5d89fcca0bd86da12415d38fe78..80f2a73351f8548c99efd8dedd8a04d44c8558a3 100644
--- a/src/finn/transformation/bipolar_to_xnor.py
+++ b/src/finn/transformation/bipolar_to_xnor.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
+import warnings
 from onnx import TensorProto
 from onnx import helper as oh
 
@@ -66,26 +67,40 @@ class ConvertBipolarMatMulToXnorPopcount(Transformation):
 
                     mt_chain = model.find_upstream(mm_input, find_prod_mt)
                     if len(mt_chain) == 0:
-                        raise Exception(
-                            """Could not find upstream bipolar
-                                            MultiThreshold"""
-                        )
-                    graph_modified = True
-                    mt = mt_chain[-1]
-                    mt_inst = getCustomOp(mt)
-                    # ensure old scale/bias were correct for BIPOLAR
-                    scale_ok = mt_inst.get_nodeattr("out_scale") == 2.0
-                    bias_ok = mt_inst.get_nodeattr("out_bias") == -1.0
-                    assert (
-                        scale_ok and bias_ok
-                    ), """Unexpected scale/bias
-                    attributes for BIPOLAR MultiThreshold node."""
-                    # start conversion, set MT output to binary
-                    # (this is what XnorPopcountMatMul expects)
-                    mt_inst.set_nodeattr("out_dtype", "BINARY")
-                    mt_inst.set_nodeattr("out_scale", 1.0)
-                    mt_inst.set_nodeattr("out_bias", 0.0)
-                    model.set_tensor_datatype(mm_input, DataType.BINARY)
+                        if mm_input == graph.input[0].name:
+                            # change input datatype to BINARY
+                            model.set_tensor_datatype(mm_input, DataType.BINARY)
+                            graph_modified = True
+                            warnings.warn(
+                                """IMPORTANT: Changing graph input DataType
+                            to BINARY instead of BIPOLAR. Ensure this is respected
+                            when checking for correctness.
+                            """
+                            )
+                        else:
+                            raise Exception(
+                                """Could not find upstream bipolar
+                                   MultiThreshold, and the MatMul is not the
+                                   first node on graph input. Unable to convert
+                                   input tensor from BIPOLAR to BINARY."""
+                            )
+                    else:
+                        graph_modified = True
+                        mt = mt_chain[-1]
+                        mt_inst = getCustomOp(mt)
+                        # ensure old scale/bias were correct for BIPOLAR
+                        scale_ok = mt_inst.get_nodeattr("out_scale") == 2.0
+                        bias_ok = mt_inst.get_nodeattr("out_bias") == -1.0
+                        assert (
+                            scale_ok and bias_ok
+                        ), """Unexpected scale/bias
+                        attributes for BIPOLAR MultiThreshold node."""
+                        # start conversion, set MT output to binary
+                        # (this is what XnorPopcountMatMul expects)
+                        mt_inst.set_nodeattr("out_dtype", "BINARY")
+                        mt_inst.set_nodeattr("out_scale", 1.0)
+                        mt_inst.set_nodeattr("out_bias", 0.0)
+                        model.set_tensor_datatype(mm_input, DataType.BINARY)
                     # change node type and domain
                     n.op_type = "XnorPopcountMatMul"
                     n.domain = "finn"
diff --git a/src/finn/transformation/change_datalayout.py b/src/finn/transformation/change_datalayout.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5b393a25e57122b059a44f70904a6dbe5bbaa3f
--- /dev/null
+++ b/src/finn/transformation/change_datalayout.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from onnx import helper, TensorProto
+
+from finn.transformation import Transformation
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import get_by_name
+
+
+class ChangeDataLayoutQuantAvgPool2d(Transformation):
+    """Replace QuantAvgPool2d with datalayout (N,C,H,W) with Transpose nodes
+    and QuantAvgPool2dNHWC with datalayout (N,H,W,C)"""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "QuantAvgPool2d" and (
+                get_by_name(n.attribute, "data_layout") is None
+                or get_by_name(n.attribute, "data_layout").s.decode("UTF-8") == "NCHW"
+            ):
+                graph_modified = True
+                node_input = n.input[0]
+                node_output = n.output[0]
+                s = get_by_name(n.attribute, "stride").i
+                k = get_by_name(n.attribute, "kernel").i
+                ibits = get_by_name(n.attribute, "ibits").i
+                obits = get_by_name(n.attribute, "obits").i
+                signed = get_by_name(n.attribute, "signed").i
+                batchsize = model.get_tensor_shape(n.input[0])[0]  # assume NCHW
+                channels = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
+                idim = model.get_tensor_shape(n.input[0])[-1]  # assume NCHW
+                odim = model.get_tensor_shape(n.output[0])[-1]  # assume NCHW
+
+                # create new nodes
+                # NCHW -> NHWC
+                # create new intermediate values
+                inp_trans_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (batchsize, idim, idim, channels),  # NHWC
+                )
+                graph.value_info.append(inp_trans_out)
+                inp_trans_out = inp_trans_out.name
+                quantavg_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (batchsize, odim, odim, channels),
+                )
+                graph.value_info.append(quantavg_out)
+                quantavg_out = quantavg_out.name
+                inp_trans_node = helper.make_node(
+                    "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
+                )
+                quantavg_node = helper.make_node(
+                    "QuantAvgPool2d",
+                    [inp_trans_out],
+                    [quantavg_out],
+                    domain="finn",
+                    stride=s,
+                    kernel=k,
+                    ibits=ibits,
+                    obits=obits,
+                    signed=signed,
+                    data_layout="NHWC",
+                )
+                # NHWC -> NCHW
+                out_trans_node = helper.make_node(
+                    "Transpose", [quantavg_out], [node_output], perm=[0, 3, 1, 2]
+                )
+                # insert nodes
+                graph.node.insert(node_ind, inp_trans_node)
+                graph.node.insert(node_ind + 1, quantavg_node)
+                graph.node.insert(node_ind + 2, out_trans_node)
+                # remove old nodes
+                graph.node.remove(n)
+
+                # set shapes
+                model.set_tensor_shape(inp_trans_out, (batchsize, idim, idim, channels))
+                model.set_tensor_shape(quantavg_out, (batchsize, odim, odim, channels))
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
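+
+    # illustrative usage (sketch):
+    #   model = model.transform(ChangeDataLayoutQuantAvgPool2d())
+    # each NCHW QuantAvgPool2d then becomes the chain
+    #   Transpose(perm=[0,2,3,1]) -> QuantAvgPool2d(data_layout="NHWC")
+    #   -> Transpose(perm=[0,3,1,2])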
diff --git a/src/finn/transformation/fpgadataflow/annotate_cycles.py b/src/finn/transformation/fpgadataflow/annotate_cycles.py
new file mode 100644
index 0000000000000000000000000000000000000000..521c84952daf25982e574421dfba3ff0f7df91ae
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/annotate_cycles.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.custom_op.registry as registry
+from finn.transformation import Transformation
+from finn.transformation.move_reshape import _is_fpgadataflow_node
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.registry import getCustomOp
+
+
+class AnnotateCycles(Transformation):
+    """Annotate the estimate of clock cycles per sample taken by each fpgadataflow
+    node as an attribute on the node.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        graph = model.graph
+        # annotate node cycles
+        for node in graph.node:
+            if _is_fpgadataflow_node(node):
+                op_inst = registry.getCustomOp(node)
+                cycles = op_inst.get_exp_cycles()
+                op_inst.set_nodeattr("cycles_estimate", cycles)
+            elif node.op_type == "StreamingDataflowPartition":
+                # recurse into model to manually annotate per-layer cycles
+                sdp_model_filename = getCustomOp(node).get_nodeattr("model")
+                sdp_model = ModelWrapper(sdp_model_filename)
+                sdp_model = sdp_model.transform(AnnotateCycles())
+                # save transformed model
+                sdp_model.save(sdp_model_filename)
+        return (model, False)
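+
+    # illustrative usage (sketch): read back the annotation after running
+    #   model = model.transform(AnnotateCycles())
+    #   cycles = getCustomOp(model.graph.node[0]).get_nodeattr("cycles_estimate")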
diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py
index 207075b00de1871da19ea78472125d435449ed6e..da6fa1ff738690308a9b7686a5c92d7395ab50c8 100644
--- a/src/finn/transformation/fpgadataflow/annotate_resources.py
+++ b/src/finn/transformation/fpgadataflow/annotate_resources.py
@@ -32,6 +32,8 @@ from finn.transformation.move_reshape import _is_fpgadataflow_node
 from finn.analysis.fpgadataflow.res_estimation import res_estimation
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.analysis.fpgadataflow.post_synth_res import post_synth_res
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.registry import getCustomOp
 
 
 class AnnotateResources(Transformation):
@@ -44,9 +46,10 @@ class AnnotateResources(Transformation):
     chosen mode (e.g. HLSSynthIP for hls) was previously run.
     """
 
-    def __init__(self, mode):
+    def __init__(self, mode, override_res_dict=None):
         super().__init__()
         self.mode = mode
+        self.res_dict = override_res_dict
 
     def apply(self, model):
         graph = model.graph
@@ -58,10 +61,33 @@ class AnnotateResources(Transformation):
             res_fxn = post_synth_res
         else:
             raise Exception("Unrecognized mode for AnnotateResources")
-        res_dict = model.analysis(res_fxn)
+        if self.res_dict is None:
+            self.res_dict = model.analysis(res_fxn)
+        children_dict = {}
+        # annotate node resources
+        for node in graph.node:
+            if _is_fpgadataflow_node(node) and node.name in self.res_dict.keys():
+                op_inst = registry.getCustomOp(node)
+                op_inst.set_nodeattr("res_" + self.mode, str(self.res_dict[node.name]))
+                children_dict[node.name] = self.res_dict[node.name]
+            elif node.op_type == "StreamingDataflowPartition":
+                # recurse into model to manually annotate per-layer resources
+                sdp_model_filename = getCustomOp(node).get_nodeattr("model")
+                sdp_model = ModelWrapper(sdp_model_filename)
+                sdp_model = sdp_model.transform(
+                    AnnotateResources(self.mode, self.res_dict)
+                )
+                sdp_dict = sdp_model.get_metadata_prop("res_total_" + self.mode)
+                sdp_dict = eval(sdp_dict)
+                # save transformed model
+                sdp_model.save(sdp_model_filename)
+                # set res attribute for sdp node
+                getCustomOp(node).set_nodeattr("res_" + self.mode, str(sdp_dict))
+                children_dict[node.name] = sdp_dict
+        self.res_dict.update(children_dict)
         total_dict = {}
-        for lname in res_dict.keys():
-            layer_res_dict = res_dict[lname]
+        for lname in children_dict.keys():
+            layer_res_dict = self.res_dict[lname]
             for r_type in layer_res_dict.keys():
                 r_amount = layer_res_dict[r_type]
                 r_amount = float(r_amount)
@@ -69,10 +95,8 @@ class AnnotateResources(Transformation):
                     total_dict[r_type] += r_amount
                 else:
                     total_dict[r_type] = r_amount
+        for k in total_dict.keys():
+            if "efficiency" in k:
+                total_dict[k] = total_dict[k] / len(graph.node)
         model.set_metadata_prop("res_total_" + self.mode, str(total_dict))
-        for node in graph.node:
-            if _is_fpgadataflow_node(node) and node.name in res_dict.keys():
-                op_inst = registry.getCustomOp(node)
-                op_inst.set_nodeattr("res_" + self.mode, str(res_dict[node.name]))
-
         return (model, False)
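+
+    # illustrative usage (sketch), assuming HLSSynthIP was run beforehand:
+    #   model = model.transform(AnnotateResources("hls"))
+    #   print(model.get_metadata_prop("res_total_hls"))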
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index d421a5f3ef8ca980b399087de1482b2ae913da1b..e6dca0e4b05f943c971bc0f97af03f5038fd0dab 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -26,14 +26,20 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 from onnx import helper, TensorProto
+import numpy as np
+import warnings
 
 from finn.core.datatype import DataType
 from finn.transformation import Transformation
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import SortGraph
 import finn.core.data_layout as DataLayout
+from finn.util.onnx import nchw_to_nhwc
+from finn.util.basic import get_by_name
 
 
 class InferConvInpGen(Transformation):
@@ -51,11 +57,15 @@ class InferConvInpGen(Transformation):
                 i2c_in_shape = model.get_tensor_shape(i2c_input)
                 i2c_out_shape = model.get_tensor_shape(i2c_output)
                 dt = model.get_tensor_datatype(i2c_input)
+                if not dt.is_integer():
+                    warnings.warn("Input is not int. Can't infer ConvInpGen")
+                    continue
                 i2c_inst = getCustomOp(n)
                 stride = i2c_inst.get_nodeattr("stride")
                 k = i2c_inst.get_nodeattr("kernel_size")
                 pad = i2c_inst.get_nodeattr("pad_amount")
                 pad_val = i2c_inst.get_nodeattr("pad_value")
+                depthwise = i2c_inst.get_nodeattr("depthwise")
                 ifm_ch = i2c_in_shape[-1]
                 ifm_dim = i2c_in_shape[1]
                 ofm_dim = i2c_out_shape[1]
@@ -67,7 +77,11 @@ class InferConvInpGen(Transformation):
 
                 if pad > 0:
                     # if padding enabled, ensure pad_val supported by DataType
-                    assert dt.allowed(pad_val), "Im2Col DataType must support pad_val"
+                    # note: pad_val support is currently limited to zero-padding
+                    assert (
+                        pad_val == 0
+                    ), "FMPadding_Batch doesn't currently support pad_val != 0"
 
                     odim_padding = ifm_dim + 2 * pad
 
@@ -94,26 +108,44 @@ class InferConvInpGen(Transformation):
                         Padding=2 * pad,
                         NumChannels=ifm_ch,
                         inputDataType=dt.name,
+                        SIMD=ifm_ch,
                     )
                     graph.node.insert(node_ind, padding_node)
 
-                # create equivalent ConvolutionInputGenerator node
-                ConvInpGen_node = helper.make_node(
-                    "ConvolutionInputGenerator",
-                    [ConvInpGen_input],
-                    [i2c_output],
-                    domain="finn",
-                    backend="fpgadataflow",
-                    ConvKernelDim=k,
-                    IFMChannels=ifm_ch,
-                    IFMDim=ConvInpGen_idim,
-                    OFMDim=ofm_dim,
-                    SIMD=ifm_ch,
-                    Stride=stride,
-                    inputDataType=dt.name,
-                    outputDataType=dt.name,
-                )
-                graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                if stride > 1 and k == 1:
+                    # create DownSampler node
+                    ConvInpGen_node = helper.make_node(
+                        "DownSampler",
+                        [ConvInpGen_input],
+                        [i2c_output],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        ImgDim=ConvInpGen_idim,
+                        NumChannels=ifm_ch,
+                        SIMD=ifm_ch,
+                        Stride=stride,
+                        inputDataType=dt.name,
+                    )
+                    graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                else:
+                    # create equivalent ConvolutionInputGenerator node
+                    ConvInpGen_node = helper.make_node(
+                        "ConvolutionInputGenerator",
+                        [ConvInpGen_input],
+                        [i2c_output],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        ConvKernelDim=k,
+                        IFMChannels=ifm_ch,
+                        IFMDim=ConvInpGen_idim,
+                        OFMDim=ofm_dim,
+                        SIMD=ifm_ch,
+                        Stride=stride,
+                        inputDataType=dt.name,
+                        outputDataType=dt.name,
+                        depthwise=depthwise,
+                    )
+                    graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
@@ -169,6 +201,167 @@ class InferStreamingMaxPool(Transformation):
         return (model, graph_modified)
 
 
+class InferPool_Batch(Transformation):
+    """If kernel_shape > strides, replace Pool layer with  with of Im2col
+    + pool(with kernel_shape == strides), plus Transpose layers to keep the original
+    data layout."""
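+    # Usage sketch (illustrative; assumes `model` is a FINN ModelWrapper):
+    #   model = model.transform(InferPool_Batch())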
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type in ["MaxPool", "QuantAvgPool2d"]:
+                # extract pool parameters
+
+                if n.op_type == "MaxPool":
+                    k = get_by_name(n.attribute, "kernel_shape").ints[-1]
+                    stride = get_by_name(n.attribute, "strides").ints[-1]
+                elif n.op_type == "QuantAvgPool2d":
+                    inst = getCustomOp(n)
+                    k = inst.get_nodeattr("kernel")
+                    stride = inst.get_nodeattr("stride")
+
+                try:
+                    pad = get_by_name(n.attribute, "pads").ints[-1]
+                except AttributeError:
+                    pad = 0
+
+                node_input = n.input[0]
+                node_output = n.output[0]
+                idt = model.get_tensor_datatype(node_input)
+
+                if not idt.is_integer():
+                    continue
+
+                if k < stride:
+                    continue
+                elif k == stride:
+                    warnings.warn(
+                        """Inferring Pool_Batch node for k == stride.
+                        This case can be optimized.
+                        For example, for MaxPool run InferStreamingMaxPool before
+                        InferPool_Batch """
+                    )
+
+                odt = model.get_tensor_datatype(node_output)
+
+                ifm_ch = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
+                ofm_ch = ifm_ch
+                ifm_dim = model.get_tensor_shape(n.input[0])[-1]  # assume NCHW
+                ofm_dim = model.get_tensor_shape(n.output[0])[-1]  # assume NCHW
+                # create new intermediate values
+                inp_trans_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ifm_dim, ifm_dim, ifm_ch),  # NHWC
+                )
+                graph.value_info.append(inp_trans_out)
+                inp_trans_out = inp_trans_out.name
+                model.set_tensor_datatype(inp_trans_out, idt)
+
+                im2col_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ofm_dim, ofm_dim, ifm_ch * k * k),
+                )
+                graph.value_info.append(im2col_out)
+                im2col_out = im2col_out.name
+                model.set_tensor_datatype(im2col_out, idt)
+
+                pool_output = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ofm_dim, ofm_dim, ofm_ch),
+                )
+                graph.value_info.append(pool_output)
+                pool_output = pool_output.name
+                # model.set_tensor_datatype(pool_output, odt)
+
+                # create new nodes
+                # NCHW -> NHWC
+                inp_trans_node = helper.make_node(
+                    "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
+                )
+
+                accum_bits = 0
+                pool_size_param = k
+                pad_value = 0
+                if n.op_type == "MaxPool":
+                    pool_fxn = "MaxPool"
+                    odt = idt
+                    pad_value = idt.min()
+                elif n.op_type == "QuantAvgPool2d":
+                    assert odt.is_integer(), """Output data type for QuantAvgPool2d
+                    needs to be integer"""
+                    assert pad == 0, "Padding is not supported for QuantAvgPool2d"
+                    inst = getCustomOp(n)
+                    pool_fxn = "QuantAvgPool"
+                    pool_size_param = inst.get_shifts()
+                    accum_bits = inst.get_accum_size()
+
+                else:
+                    raise Exception(
+                        "pad_value and pool_fxn not configured for {}".format(n.op_type)
+                    )
+
+                # format input tensor
+                im2col_node = helper.make_node(
+                    "Im2Col",
+                    [inp_trans_out],
+                    [im2col_out],
+                    domain="finn",
+                    stride=stride,
+                    kernel_size=k,
+                    pad_amount=pad,
+                    pad_value=pad_value,
+                    depthwise=1,
+                    input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                )
+
+                # Warning: PE has to be equal to ifm_ch until Im2Col is replaced
+                # by a ConvolutionInputGenerator with depthwise=1.
+                # For other settings the output will be incorrect due to an
+                # incorrect input data layout.
+                pool_node = helper.make_node(
+                    "Pool_Batch",
+                    [im2col_out],
+                    [pool_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    InputDataType=idt.name,
+                    OutputDataType=odt.name,
+                    Channels=ifm_ch,
+                    PE=ifm_ch,
+                    KernelSize=k,
+                    Function=pool_fxn,
+                    OutImgDim=ofm_dim,
+                    AccumBits=accum_bits,
+                    Size=pool_size_param,
+                    BatchSize=1,
+                )
+
+                # NHWC -> NCHW
+                out_trans_node = helper.make_node(
+                    "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2]
+                )
+
+                # insert nodes where the pool was to preserve topological ordering
+                graph.node.insert(node_ind, inp_trans_node)
+                graph.node.insert(node_ind + 1, im2col_node)
+                graph.node.insert(node_ind + 2, pool_node)
+                graph.node.insert(node_ind + 3, out_trans_node)
+                # remove old node
+                graph.node.remove(n)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferBinaryStreamingFCLayer(Transformation):
     """Convert XnorPopcountMatMul layers to
     StreamingFCLayer_Batch layers. Any immediately following MultiThreshold
@@ -316,7 +509,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "MatMul":
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None:
                 mm_input = n.input[0]
                 mm_weight = n.input[1]
                 mm_output = n.output[0]
@@ -435,6 +628,150 @@ class InferQuantizedStreamingFCLayer(Transformation):
         return (model, graph_modified)
 
 
+class InferVVAU(Transformation):
+    """Convert MatMul layers with quantized inputs and weights to
+    Vector_Vector_Activate_Batch layers, if the sparsity annotation
+    of the weight matrix indicates that the MatMul layer belongs to
+    a depthwise convolution. Any immediately following MultiThreshold
+    layers will also be absorbed into the VVAU."""
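+    # Usage sketch (illustrative): assumes the MatMul weight tensor carries a
+    # sparsity annotation of the form {"dw": {"kernel_shape": k}}, as set when
+    # lowering depthwise convolutions:
+    #   model = model.transform(InferVVAU())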
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "MatMul"
+                and model.get_tensor_sparsity(n.input[1]) is not None
+            ):
+                sparsity = model.get_tensor_sparsity(n.input[1])
+                try:
+                    k = sparsity["dw"]["kernel_shape"]
+                except KeyError:
+                    raise Exception(
+                        """Sparsity doesn't indicate that MatMul
+                        belongs to a depthwise convolution."""
+                    )
+
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    mm_output = n.output[0]
+                    W = model.get_initializer(mm_weight)
+                    # infer dense weight tensor from sparse weight matrix, using
+                    # the kernel size k extracted above and the channel count.
+                    # the sparse weight matrix has shape (k * k * Channels, Channels);
+                    # we reverse its construction to obtain a dense weight
+                    # tensor of shape (Channels, 1, k, k)
+                    channels = int(W.shape[1])
+                    # transpose to achieve a shape of (Channels, k * k * Channels)
+                    W = W.T
+                    # reshape to (Channels, k, k, Channels) to transpose afterwards
+                    # to (Channels, Channels, k, k)
+                    W = W.reshape(channels, k, k, channels)
+                    W = W.transpose(0, 3, 1, 2)
+                    # now we can extract the values using a loop over the channels,
+                    # filling a zero-initialized numpy array of the correct shape
+                    w_tensor = np.zeros((channels, 1, k, k))
+                    for ch in range(channels):
+                        w_tensor[ch][0] = W[ch][ch]
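+                    # worked sketch (hypothetical k=2, channels=2): W starts as
+                    # (8, 2); W.T is (2, 8); the reshape gives (2, 2, 2, 2) and
+                    # the transpose keeps (2, 2, 2, 2); W[ch][ch] then selects
+                    # the (k, k) diagonal block for each channel ch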
+                    model.set_initializer(mm_weight, w_tensor)
+                    model.set_tensor_shape(mm_weight, (channels, 1, k, k))
+                    # create node with pe=channels as default
+                    pe = channels
+                    assert (
+                        channels % pe == 0
+                    ), "Requirement Channels divisable by PE is violated."
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # create VVAU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert (
+                            T.shape[0] == 1 or T.shape[0] == channels
+                        ), """First dimension of
+                        thresholds is neither 1 nor Channels."""
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        assert (
+                            scale == 1.0
+                        ), "out_scale must be equal to 1.0 for HLS conversion."
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert (
+                            int(actval) == actval
+                        ), "out_bias must be integer for HLS conversion."
+                        actval = int(actval)
+                        assert (not odt.signed()) or (
+                            actval < 0
+                        ), "Signed output requres actval < 0"
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        # create and insert new Vector_Vector_Activate_Batch node
+                        new_node = helper.make_node(
+                            "Vector_Vector_Activate_Batch",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn",
+                            backend="fpgadataflow",
+                            resType="ap_resource_lut()",
+                            PE=pe,
+                            Dim=mm_in_shape[1],
+                            Channels=channels,
+                            Kernel=k,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            noActivation=0,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new VVAU node
+                        new_node = helper.make_node(
+                            "Vector_Vector_Activate_Batch",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn",
+                            backend="fpgadataflow",
+                            resType="ap_resource_lut()",
+                            PE=pe,
+                            Dim=mm_in_shape[1],
+                            Channels=channels,
+                            Kernel=k,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            noActivation=1,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferThresholdingLayer(Transformation):
     """Convert any MultiThreshold into a standalone thresholding HLS layer."""
 
@@ -455,10 +792,21 @@ class InferThresholdingLayer(Transformation):
                 if not idt.is_integer():
                     continue
 
-                # skip conversion if input is not NHWC or NC
+                # check layout of input and convert from NCHW to NHWC if needed
                 thl_in_layout = model.get_tensor_layout(thl_input)
-                if thl_in_layout != DataLayout.NHWC and thl_in_layout != DataLayout.NC:
-                    continue
+                if thl_in_layout == DataLayout.NCHW:
+                    thl_input = nchw_to_nhwc(thl_input, model, node_ind)
+                    node_ind += 1
+                    thl_in_shape = model.get_tensor_shape(thl_input)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+                thl_output_layout = model.get_tensor_layout(thl_output)
+                if thl_output_layout == DataLayout.NCHW:
+                    thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True)
+                    node_ind += 1
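+                # note (assumed helper semantics): nchw_to_nhwc inserts a
+                # Transpose node; with reverse=True the new HLS node produces
+                # NHWC data and the Transpose maps it back to the original
+                # NCHW output layout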
 
                 # now safe to assume number of channels is in last dimension
                 ifc = int(thl_in_shape[-1])
@@ -480,6 +828,379 @@ class InferThresholdingLayer(Transformation):
                     outputDataType=odt.name,
                     numInputVectors=list(thl_in_shape[:-1]),
                 )
+                graph.node.insert(insert_point, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferAddStreamsLayer(Transformation):
+    """Convert any Add into a AddStreams HLS layer."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Add":
+                in0 = node.input[0]
+                in1 = node.input[1]
+                result = node.output[0]
+                in0_shape = model.get_tensor_shape(in0)
+                in1_shape = model.get_tensor_shape(in1)
+
+                # skip if different shapes on inputs
+                if in0_shape != in1_shape:
+                    continue
+
+                idt0 = model.get_tensor_datatype(in0)
+                idt1 = model.get_tensor_datatype(in1)
+
+                # skip if different data types on inputs
+                if idt0 != idt1:
+                    continue
+
+                idt = idt0
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # check layout and convert if necessary
+                in0_layout = model.get_tensor_layout(in0)
+                in1_layout = model.get_tensor_layout(in1)
+                result_layout = model.get_tensor_layout(result)
+
+                if in0_layout == DataLayout.NCHW:
+                    in0 = nchw_to_nhwc(in0, model, node_ind)
+                    node_ind += 1
+                    in0_shape = model.get_tensor_shape(in0)
+
+                if in1_layout == DataLayout.NCHW:
+                    in1 = nchw_to_nhwc(in1, model, node_ind)
+                    node_ind += 1
+                    in1_shape = model.get_tensor_shape(in1)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                if result_layout == DataLayout.NCHW:
+                    result = nchw_to_nhwc(result, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # now safe to assume num_channels is size of last dimension
+                num_channels = int(in0_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_channels % pe == 0
+                ), "Requirement Channels divisable by PE is violated."
+
+                # create and insert new AddStreams_Batch node
+                new_node = helper.make_node(
+                    "AddStreams_Batch",
+                    [in0, in1],
+                    [result],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=num_channels,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    numInputVectors=in0_shape[:-1],
+                )
+                graph.node.insert(insert_point, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferDuplicateStreamsLayer(Transformation):
+    """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2 """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            successors = model.find_consumers(node.output[0])
+            if successors is not None and len(successors) == 2:
+                output_tensor = node.output[0]
+
+                dt = model.get_tensor_datatype(output_tensor)
+
+                # skip conversion for layers with float input
+                if not dt.is_integer():
+                    continue
+
+                # create clone tensors
+                out_shape = model.get_tensor_shape(output_tensor)
+                out_tensor_clones = []
+                for i in range(2):
+                    clone = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                    )
+                    model.graph.value_info.append(clone)
+                    out_tensor_clones += [clone.name]
+
+                num_ch = int(out_shape[-1])
+                vecs = out_shape[:-1]
+
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_ch % pe == 0
+                ), "Requirement channels divisable by PE is violated."
+
+                dup_node = helper.make_node(
+                    "DuplicateStreams_Batch",
+                    [output_tensor],
+                    out_tensor_clones,
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=num_ch,
+                    PE=pe,
+                    inputDataType=dt.name,
+                    numInputVectors=vecs,
+                )
+
+                graph.node.insert(node_ind, dup_node)
+
+                # connect successors to out tensor clone
+                clone_idx = 0
+                for successor in successors:
+                    for i, succ_input in enumerate(successor.input):
+                        if succ_input == output_tensor:
+                            successor.input[i] = out_tensor_clones[clone_idx]
+                            clone_idx += 1
+                            # if one node has multiple connections to the same
+                            # output, find_direct_successors will return one node
+                            # per input, so breaking the inner loop here gives
+                            # the correct behaviour
+                            break
+
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(SortGraph())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferChannelwiseLinearLayer(Transformation):
+    """Convert any channel-wise Add/Mul into a HLS layer."""
+
+    def get_smallest_possible(self, vals):
+        """Returns smallest (fewest bits) possible DataType that can represent
+        value. Prefers unsigned integers where possible."""
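+        # illustrative examples (exact names depend on the DataType enum
+        # definition order): vals=[0, 3] would yield a 2-bit unsigned type,
+        # while vals=[-2, 3] needs a signed type wide enough for both bounds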
+        vals = np.array(vals)
+        for v in vals:
+            assert int(v) == v, "Error: value is not an integer"
+
+        for k in DataType.__members__:
+            dt = DataType[k]
+
+            if dt in [DataType.BIPOLAR, DataType.TERNARY, DataType.FLOAT32]:
+                # not currently supported
+                continue
+
+            if (dt.min() <= vals).all() and (vals <= dt.max()).all():
+                return dt
+
+        warnings.warn(
+            """InferChannelwiseLinearLayer: Output values may not be
+        representable with the supported data types.
+        Falling back to the widest available data type.
+        This will lead to errors if there are no constraints on the input
+        """
+        )
+
+        if (0 <= vals).all():
+            return DataType.UINT32
+        else:
+            return DataType.INT32
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Add" or node.op_type == "Mul":
+                # assuming input[0] is dynamic
+                ll_input = node.input[0]
+                ll_output = node.output[0]
+                ll_in_shape = model.get_tensor_shape(ll_input)
+
+                # check if input 1 has an initializer
+                ll_const = node.input[1]
+                if ll_const is not None:
+                    ll_cinit = model.get_initializer(ll_const)
+                    if ll_cinit is None:
+                        # input 1 is also dynamic
+                        continue
+                else:
+                    continue
+
+                # get number of channels and channel index from input
+                ll_in_layout = model.get_tensor_layout(ll_input)
+                if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC:
+                    ch_index = -1
+                    ch = ll_in_shape[-1]
+                elif ll_in_layout == DataLayout.NCHW:
+                    ch_index = 1
+                    ch = ll_in_shape[1]
+                else:
+                    continue
+
+                # check if the shape of initializer is compatible
+                ll_cinit_shape = list(ll_cinit.shape)
+                if np.prod(ll_cinit_shape) == 1:
+                    warnings.warn(
+                        "Broadcasting " + str(node.op_type) + "(" + node.name + ")"
+                    )
+                    ll_cinit = np.full((ch), ll_cinit.flatten()[0])
+                elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch:
+                    # parameter shape not compatible with Channelwise_batch
+                    continue
+
+                # check initializer contains integers as floats
+                if not (ll_cinit.astype(np.int32) == ll_cinit).all():
+                    continue
+                # all initializer conditions are met
+
+                # check inputs
+                idt = model.get_tensor_datatype(ll_input)
+                if not idt.is_integer():
+                    # skip conversion for layers with float input
+                    continue
+
+                # check layout of input and convert from NCHW to NHWC if needed
+                if ll_in_layout == DataLayout.NCHW:
+                    ll_input = nchw_to_nhwc(ll_input, model, node_ind)
+                    node_ind += 1
+                    ll_in_shape = model.get_tensor_shape(ll_input)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+                ll_output_layout = model.get_tensor_layout(ll_output)
+                if ll_output_layout == DataLayout.NCHW:
+                    ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # get parameter data type
+                param_min = min(ll_cinit.flatten())
+                param_max = max(ll_cinit.flatten())
+                pdt = self.get_smallest_possible([param_min, param_max])
+
+                # set function and determine output data type
+                if node.op_type == "Add":
+                    func = "add"
+                    out_min = idt.min() + param_min
+                    out_max = idt.max() + param_max
+                    odt = self.get_smallest_possible([out_min, out_max])
+                elif node.op_type == "Mul":
+                    func = "mul"
+                    possible_limits = []
+                    possible_limits += [idt.min() * param_min]
+                    possible_limits += [idt.min() * param_max]
+                    possible_limits += [idt.max() * param_min]
+                    possible_limits += [idt.max() * param_max]
+                    odt = self.get_smallest_possible(possible_limits)
+
+                model.set_initializer(ll_const, ll_cinit.reshape(ch))
+                model.set_tensor_datatype(ll_output, odt)
+
+                # create node with no parallelization first
+                pe = 1
+                assert ch % pe == 0, "Requirement IFC divisable by PE is violated."
+                # create and insert node
+                new_node = helper.make_node(
+                    "ChannelwiseOp_Batch",
+                    [ll_input, ll_const],
+                    [ll_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    Func=func,
+                    NumChannels=ch,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    paramDataType=pdt.name,
+                    outputDataType=odt.name,
+                    numInputVectors=list(ll_in_shape[:-1]),
+                )
+                graph.node.insert(insert_point, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferLabelSelectLayer(Transformation):
+    """Convert any TopK into a LabelSelect HLS layer."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "TopK":
+                fc_input = node.input[0]
+                k_input = node.input[1]
+                val_output = node.output[0]
+                idx_output = node.output[1]
+                fc_in_shape = model.get_tensor_shape(fc_input)
+
+                idt = model.get_tensor_datatype(fc_input)
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # skip conversion if the value output is connected (not supported)
+                if model.find_consumer(val_output) is not None:
+                    continue
+
+                num_labels = int(fc_in_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_labels % pe == 0
+                ), "Requirement Labels divisable by PE is violated."
+
+                k = model.get_initializer(k_input)[0]
+
+                # create and insert new LabelSelect_Batch node
+                new_node = helper.make_node(
+                    "LabelSelect_Batch",
+                    [fc_input],
+                    [idx_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    Labels=num_labels,
+                    PE=pe,
+                    K=k,
+                    inputDataType=idt.name,
+                )
                 graph.node.insert(node_ind, new_node)
                 # remove old node
                 graph.node.remove(node)
@@ -489,3 +1210,88 @@ class InferThresholdingLayer(Transformation):
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferGlobalAccPoolLayer(Transformation):
+    """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "GlobalAveragePool":
+                in0 = node.input[0]
+                result = node.output[0]
+                in0_shape = model.get_tensor_shape(in0)
+
+                idt = model.get_tensor_datatype(in0)
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # check layout and convert if necessary
+                in0_layout = model.get_tensor_layout(in0)
+                result_layout = model.get_tensor_layout(result)
+
+                if in0_layout == DataLayout.NCHW:
+                    in0 = nchw_to_nhwc(in0, model, node_ind)
+                    node_ind += 1
+                    in0_shape = model.get_tensor_shape(in0)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                if result_layout == DataLayout.NCHW:
+                    result = nchw_to_nhwc(result, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                num_ch = int(in0_shape[-1])
+                vecs = in0_shape[:-1]
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_ch % pe == 0
+                ), "Requirement Labels divisable by PE is violated."
+
+                # create an additional tensor of the same shape and layout as result
+                out_shape = model.get_tensor_shape(result)
+                pool_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                )
+                model.graph.value_info.append(pool_out)
+                pool_out = pool_out.name
+                model.set_tensor_layout(pool_out, model.get_tensor_layout(result))
+
+                new_pool = helper.make_node(
+                    "GlobalAccPool_Batch",
+                    [in0],
+                    [pool_out],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=num_ch,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    numInputVectors=vecs,
+                )
+
+                mul_value = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, [1]
+                )
+                model.graph.value_info.append(mul_value)
+                model.set_initializer(mul_value.name, np.array(1 / (vecs[1] * vecs[2])))
+                new_mul = helper.make_node("Mul", [pool_out, mul_value.name], [result],)
+                graph.node.insert(insert_point, new_pool)
+                graph.node.insert(insert_point + 1, new_mul)
+                node_ind += 1
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index e0f990600d9ca4be748b662b47ce8296d3d462ce..5ec4ab14d65d63523856a6bb107bf75c1ca5a261 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -45,58 +45,90 @@ class CreateDataflowPartition(Transformation):
         super().__init__()
 
     def apply(self, model):
-        # TODO we currently assume that all dataflow nodes are connected to
-        # each other, forming a single partition. check the assumption and/or
-        # improve this.
-        all_nodes = list(model.graph.node)
-        df_nodes = filter(
-            lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
-        )
-        df_nodes = filter(
-            lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
-            == "fpgadataflow",
-            df_nodes,
-        )
-        df_nodes = list(df_nodes)
-        non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
-        non_df_nodes = list(non_df_nodes)
-
-        if len(df_nodes) == 0:
-            # no changes if no dataflow nodes are present
-            return (model, False)
-        else:
-            # partition the model into two models
-            df_model = copy.deepcopy(model)
-            non_df_model = model
-            # remove all non-dataflow nodes from the dataflow model
-            for node_to_remove in non_df_nodes:
-                df_model.graph.node.remove(node_to_remove)
-            # identify the entry and exit points for the dataflow part
-            df_in = df_model.graph.node[0].input[0]
-            df_out = df_model.graph.node[-1].output[0]
-            df_in_vi = df_model.get_tensor_valueinfo(df_in)
-            df_out_vi = df_model.get_tensor_valueinfo(df_out)
-            # set df graph in/out to be df_in/df_out
-            df_model.graph.input.remove(df_model.graph.input[0])
-            df_model.graph.input.insert(0, df_in_vi)
-            df_model.graph.output.remove(df_model.graph.output[0])
-            df_model.graph.output.insert(0, df_out_vi)
-            df_model_dir = make_build_dir("dataflow_partition_")
-            df_model_filename = df_model_dir + "/df_model.onnx"
-            df_model.save(df_model_filename)
-            # remove all dataflow nodes from the non-dataflow model
-            # keep track of where the dataflow part starts
-            df_start_ind = all_nodes.index(df_nodes[0])
-            for node_to_remove in df_nodes:
-                non_df_model.graph.node.remove(node_to_remove)
-            # create StreamingDataflow node with df_in/df_out io
-            df_node = helper.make_node(
-                "StreamingDataflowPartition",
-                [df_in],
-                [df_out],
-                # use the model attribute to mark the df model
-                model=df_model_filename,
+        target_partition_id = 0
+        # we currently assume that all dataflow nodes belonging to the same partition
+        # are connected to each other and there is a single input/output to/from each.
+        # NOTE: all dataflow nodes with no partition_id set are moved to partition 0
+        # TODO: check the assumption and/or improve this.
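+        # illustrative sketch (assuming partition_id is exposed as a node
+        # attribute on HLS custom ops): partitions can be assigned up front via
+        #   getCustomOp(node).set_nodeattr("partition_id", 1)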
+        while True:
+            all_nodes = list(model.graph.node)
+            df_nodes = filter(
+                lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
+            )
+            df_nodes = filter(
+                lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
+                == "fpgadataflow"
+                and (
+                    get_by_name(x.attribute, "partition_id") is None
+                    or get_by_name(x.attribute, "partition_id").i == target_partition_id
+                )
+                and x.op_type != "StreamingDataflowPartition",
+                df_nodes,
             )
-            non_df_model.graph.node.insert(df_start_ind, df_node)
+            df_nodes = list(df_nodes)
+            non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
+            non_df_nodes = list(non_df_nodes)
+
+            if len(df_nodes) == 0:
+                # no changes if no dataflow nodes are present
+                break
+            else:
+                # partition the model into two models
+                df_model = copy.deepcopy(model)
+                non_df_model = model
+                # remove all non-dataflow nodes from the dataflow model
+                for node_to_remove in non_df_nodes:
+                    df_model.graph.node.remove(node_to_remove)
+                # identify the entry and exit points for the dataflow part
+                df_in = df_model.graph.node[0].input[0]
+                df_out = df_model.graph.node[-1].output[0]
+                df_in_vi = df_model.get_tensor_valueinfo(df_in)
+                df_out_vi = df_model.get_tensor_valueinfo(df_out)
+                # set df graph in/out to be df_in/df_out
+                df_model.graph.input.remove(df_model.graph.input[0])
+                df_model.graph.input.insert(0, df_in_vi)
+                df_model.graph.output.remove(df_model.graph.output[0])
+                df_model.graph.output.insert(0, df_out_vi)
+                # parse StreamingFCLayers looking for external weight memories
+                fc_extw_nodes = filter(
+                    lambda x: x.op_type == "StreamingFCLayer_Batch"
+                    and get_by_name(x.attribute, "mem_mode") is not None
+                    and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8")
+                    == "external",
+                    df_nodes,
+                )
+                fc_extw_nodes = list(fc_extw_nodes)
+                extra_df_inputs = []
+
+                for i in range(len(fc_extw_nodes)):
+                    fc_weight_vi = df_model.get_tensor_valueinfo(
+                        fc_extw_nodes[i].input[1]
+                    )
+                    df_model.graph.input.insert(i + 1, fc_weight_vi)
+                    extra_df_inputs.append(fc_extw_nodes[i].input[1])
+
+                # save model
+                df_model_dir = make_build_dir(
+                    "dataflow_partition" + str(target_partition_id) + "_"
+                )
+                df_model_filename = df_model_dir + "/df_model.onnx"
+                df_model.save(df_model_filename)
+                # remove all dataflow nodes from the non-dataflow model
+                # keep track of where the dataflow part starts
+                df_start_ind = all_nodes.index(df_nodes[0])
+                for node_to_remove in df_nodes:
+                    non_df_model.graph.node.remove(node_to_remove)
+                # create StreamingDataflow node with df_in/df_out io
+                df_node = helper.make_node(
+                    "StreamingDataflowPartition",
+                    [df_in] + extra_df_inputs,
+                    [df_out],
+                    # use the model attribute to mark the df model
+                    model=df_model_filename,
+                    domain="finn",
+                )
+                non_df_model.graph.node.insert(df_start_ind, df_node)
+                model = non_df_model
+                target_partition_id += 1
 
-        return (non_df_model, False)
+        return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 0e898f63db785f80cfce2683df0c9b6268e3ec7e..90b4b6c47e6e353c1b606d6918eb271e9c0619c5 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -33,6 +33,8 @@ import subprocess
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name, make_build_dir
 from finn.custom_op.registry import getCustomOp
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
 
 
 class CreateStitchedIP(Transformation):
@@ -49,20 +51,137 @@ class CreateStitchedIP(Transformation):
     The packaged block design IP can be found under the ip subdirectory.
     """
 
-    def __init__(self, fpgapart, clk_ns = 10.0):
+    def __init__(self, fpgapart, clk_ns=10.0, ip_name="finn_design", vitis=False):
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
+        self.ip_name = ip_name
+        self.vitis = vitis
         if float(clk_ns) not in [5.0, 10.0, 20.0]:
             warnings.warn(
                 """The chosen frequency may lead to failure due to clock divider
                 constraints."""
             )
+        self.has_axilite = False
+        self.has_aximm = False
+        self.has_m_axis = False
+        self.m_axis_idx = 0
+        self.has_s_axis = False
+        self.s_axis_idx = 0
+        self.clock_reset_are_external = False
+        self.create_cmds = []
+        self.connect_cmds = []
+        # keep track of top-level interface names
+        self.intf_names = {
+            "clk": [],
+            "rst": [],
+            "s_axis": [],
+            "m_axis": [],
+            "aximm": [],
+            "axilite": [],
+        }
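+        # example of the collected metadata after stitching (illustrative):
+        #   {"clk": ["ap_clk"], "rst": ["ap_rst_n"], "s_axis": ["s_axis_0"],
+        #    "m_axis": ["m_axis_0"], "aximm": [], "axilite": []}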
+
+    def connect_clk_rst(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
+        reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
+        # make clock and reset external, if they aren't already
+        if not self.clock_reset_are_external:
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
+            self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]")
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]"
+            )
+            self.clock_reset_are_external = True
+            self.intf_names["clk"] = ["ap_clk"]
+            self.intf_names["rst"] = ["ap_rst_n"]
+        # otherwise connect clock and reset
+        else:
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
+            )
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
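+        # example of the generated Tcl (hypothetical instance name):
+        #   connect_bd_net [get_bd_ports ap_clk] \
+        #       [get_bd_pins StreamingFCLayer_Batch_0/ap_clk]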
+
+    def connect_axi(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"]
+        aximm_intf_name = node_inst.get_verilog_top_module_intf_names()["aximm"]
+        if len(axilite_intf_name) != 0:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external "
+                "[get_bd_intf_pins %s/%s]" % (inst_name, axilite_intf_name[0])
+            )
+            self.connect_cmds.append(
+                "set_property name s_axi_control " "[get_bd_intf_ports s_axi_control_0]"
+            )
+            assert (
+                self.has_axilite is False
+            ), "Currently limited to one slave AXI-Stream"
+            self.intf_names["axilite"] = ["s_axi_control"]
+            self.has_axilite = True
+        if len(aximm_intf_name) != 0:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, aximm_intf_name[0])
+            )
+            self.connect_cmds.append(
+                "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
+            )
+            self.intf_names["aximm"] = ["m_axi_gmem0"]
+            assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
+            self.has_aximm = True
+
+    def connect_m_axis_external(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"]
+        # make output axis external
+        for output_intf_name in output_intf_names:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, output_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name m_axis_%d [get_bd_intf_ports %s_0]"
+                % (self.m_axis_idx, output_intf_name)
+            )
+            self.has_m_axis = True
+            self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx)
+            self.m_axis_idx += 1
+
+    def connect_s_axis_external(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"]
+        # make input axis external
+        for input_intf_name in input_intf_names:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, input_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name s_axis_%d [get_bd_intf_ports %s_0]"
+                % (self.s_axis_idx, input_intf_name)
+            )
+            self.has_s_axis = True
+            self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx)
+            self.s_axis_idx += 1
 
     def apply(self, model):
         ip_dirs = ["list"]
-        create_cmds = []
-        connect_cmds = []
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
             assert node.domain == "finn", 'Node domain is not set to "finn"'
@@ -80,59 +199,57 @@ class CreateStitchedIP(Transformation):
             vlnv = node_inst.get_nodeattr("ip_vlnv")
             inst_name = node.name
             create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name)
-            create_cmds += [create_cmd]
-            # TODO nonlinear topologies: check this for all inputs
+            self.create_cmds += [create_cmd]
             my_producer = model.find_producer(node.input[0])
+            self.connect_clk_rst(node)
+            self.connect_axi(node)
             if my_producer is None:
                 # first node in graph
-                # make clock and reset external
-                connect_cmds.append(
-                    "make_bd_pins_external [get_bd_pins %s/ap_clk]" % inst_name
-                )
-                connect_cmds.append(
-                    "make_bd_pins_external [get_bd_pins %s/ap_rst_n]" % inst_name
-                )
-                # make input external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/in0_V_V]"
-                    % inst_name
-                )
+                self.connect_s_axis_external(node)
+                if node.op_type == "TLastMarker":
+                    assert (
+                        node_inst.get_nodeattr("Direction") == "in"
+                    ), """Output TLastMarker incorrect direction"""
+                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
+                    # don't apply this check for a 1-node partition
+                    assert (
+                        node_inst.get_nodeattr("direction") == "in"
+                    ), """Input DMA incorrect direction"""
             else:
                 # intermediate node
-                # wire up global clock and reset
-                connect_cmds.append(
-                    "connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins %s/ap_rst_n]"
-                    % inst_name
-                )
-                connect_cmds.append(
-                    "connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins %s/ap_clk]"
-                    % inst_name
-                )
-                # wire up input to previous output
-                # TODO nonlinear topologies: loop over all inputs
-                my_in_name = "%s/in0_V_V" % (inst_name)
-                prev_out_name = "%s/out_V_V" % (my_producer.name)
-                connect_cmds.append(
-                    "connect_bd_intf_net [get_bd_intf_pins %s] [get_bd_intf_pins %s]"
-                    % (prev_out_name, my_in_name)
-                )
-            if model.find_consumer(node.output[0]) is None:
+                # wire up input(s) to previous node output(s)
+                # foreach input
+                #     find producer
+                #     find index of producer output connected to our target input
+                #     get names of hdl interfaces for input and producer output
+                #     issue a TCL directive to connect input to output
+                for i in range(len(node.input)):
+                    producer = model.find_producer(node.input[i])
+                    if producer is None:
+                        continue
+                    j = list(producer.output).index(node.input[i])
+                    src_intf_name = getCustomOp(
+                        producer
+                    ).get_verilog_top_module_intf_names()["m_axis"][j]
+                    dst_intf_name = node_inst.get_verilog_top_module_intf_names()[
+                        "s_axis"
+                    ][i]
+                    self.connect_cmds.append(
+                        "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                        "[get_bd_intf_pins %s/%s]"
+                        % (producer.name, src_intf_name, node.name, dst_intf_name)
+                    )
+            if model.find_consumers(node.output[0]) is None:
                 # last node in graph
-                # ensure it is a TLastMarker to have a valid TLast signal
-                assert (
-                    node.op_type == "TLastMarker"
-                ), """Last node is not TLastMarker.
-                Please run transformation InsertTLastMarker to ensure a valid
-                TLast signal"""
-                # make output external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/out_r]" % inst_name
-                )
-                # make AXI lite IF external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi_control]"
-                    % inst_name
-                )
+                self.connect_m_axis_external(node)
+                if node.op_type == "TLastMarker":
+                    assert (
+                        node_inst.get_nodeattr("Direction") == "out"
+                    ), """Output TLastMarker incorrect direction"""
+                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
+                    assert (
+                        node_inst.get_nodeattr("direction") == "out"
+                    ), """Output DMA incorrect direction"""
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -150,22 +267,54 @@ class CreateStitchedIP(Transformation):
         tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str)
         tcl.append("update_ip_catalog")
         # create block design and instantiate all layers
-        block_name = "finn_design"
+        block_name = self.ip_name
         tcl.append('create_bd_design "%s"' % block_name)
-        tcl.extend(create_cmds)
-        tcl.extend(connect_cmds)
+        tcl.extend(self.create_cmds)
+        tcl.extend(self.connect_cmds)
         fclk_mhz = 1 / (self.clk_ns * 0.001)
         fclk_hz = fclk_mhz * 1000000
         model.set_metadata_prop("clk_ns", str(self.clk_ns))
-        tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk_0]" % fclk_hz)
+        tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk]" % fclk_hz)
         tcl.append("regenerate_bd_layout")
         tcl.append("validate_bd_design")
         tcl.append("save_bd_design")
+        # create wrapper hdl (for rtlsim later on)
+        bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
+            vivado_stitch_proj_dir,
+            prjname,
+            block_name,
+        )
+        bd_filename = "%s/%s.bd" % (bd_base, block_name)
+        tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
+        wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
+        tcl.append("add_files -norecurse %s" % wrapper_filename)
+        model.set_metadata_prop("wrapper_filename", wrapper_filename)
+        # synthesize to a DCP, then export stub, checkpoint and constraints
+        if self.vitis:
+            tcl.append(
+                "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]"
+                % bd_filename
+            )
+            tcl.append(
+                "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} "
+                "-value {-mode out_of_context} -objects [get_runs synth_1]"
+            )
+            num_workers = get_num_default_workers()
+            assert num_workers >= 0, "Number of workers must be nonnegative."
+            if num_workers == 0:
+                num_workers = mp.cpu_count()
+            tcl.append("launch_runs synth_1 -jobs %s" % str(num_workers))
+            tcl.append("wait_on_run [get_runs synth_1]")
+            tcl.append("open_run synth_1 -name synth_1")
+            tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name)
+            tcl.append("write_checkpoint %s.dcp" % block_name)
+            tcl.append("write_xdc %s.xdc" % block_name)
         # export block design itself as an IP core
         block_vendor = "xilinx_finn"
         block_library = "finn"
         block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
+        model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names))
         tcl.append(
             (
                 "ipx::package_project -root_dir %s/ip -vendor %s "
@@ -175,19 +324,89 @@ class CreateStitchedIP(Transformation):
         )
         tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv)
+        # if targeting Vitis, add some properties to the IP
+        if self.vitis:
+            tcl.append(
+                "ipx::remove_bus_parameter FREQ_HZ "
+                "[ipx::get_bus_interfaces CLK.AP_CLK -of_objects [ipx::current_core]]"
+            )
+            # replace source code with dcp
+            tcl.append(
+                "set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property supported_families { } [ipx::find_open_core %s]"
+                % block_vlnv
+            )
+            tcl.append(
+                "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} "
+                "[ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property auto_family_support_level level_2 "
+                "[ipx::find_open_core %s]" % block_vlnv
+            )
+            # remove all files from synthesis and sim groups
+            # we'll replace with DCP, stub, and xdc
+            tcl.append(
+                "ipx::remove_all_file "
+                "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]"
+            )
+            tcl.append(
+                "ipx::remove_all_file "
+                "[ipx::get_file_groups xilinx_anylanguagesynthesis]"
+            )
+            tcl.append(
+                "ipx::remove_file_group "
+                "xilinx_anylanguagebehavioralsimulation [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::remove_file_group "
+                "xilinx_anylanguagesynthesis [ipx::current_core]"
+            )
+            # remove sim and src folders
+            tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir)
+            tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir)
+            # copy and add DCP, stub, and xdc
+            tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir)
+            tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir)
+            tcl.append(
+                "file copy -force %s.dcp %s/ip/dcp"
+                % (block_name, vivado_stitch_proj_dir)
+            )
+            tcl.append(
+                "file copy -force %s.xdc %s/ip/impl"
+                % (block_name, vivado_stitch_proj_dir)
+            )
+            tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]")
+            tcl.append(
+                "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]"
+                % block_name
+            )
+            tcl.append(
+                "set_property used_in [list implementation] "
+                "[ipx::get_files impl/%s.xdc "
+                "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name
+            )
+            tcl.append(
+                "ipx::add_file_group xilinx_synthesischeckpoint [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::add_file dcp/%s.dcp "
+                "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name
+            )
+            tcl.append(
+                "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::add_file dcp/%s.dcp "
+                "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name
+            )
         tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv)
-        # create wrapper hdl (for rtlsim later on)
-        bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
-            vivado_stitch_proj_dir,
-            prjname,
-            block_name,
-        )
-        bd_filename = "%s/%s.bd" % (bd_base, block_name)
-        tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
-        wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
-        tcl.append("add_files -norecurse %s" % wrapper_filename)
-        model.set_metadata_prop("wrapper_filename", wrapper_filename)
         # export list of used Verilog files (for rtlsim later on)
         tcl.append("set all_v_files [get_files -filter {FILE_TYPE == Verilog}]")
         v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d9a51875499d77f384c03f54009a9dd1001dea0
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.util.basic import get_by_name
+
+
+class Floorplan(Transformation):
+    """Perform floorplanning of the dataflow design. Separate DMAs into their
+    own partition IDs; TODO: split the design into sections of defined size."""
+
+    def __init__(self, limits=None):
+        super().__init__()
+        self.resource_limits = limits
+
+    def apply(self, model):
+        target_partition_id = 0
+        # we currently assume that all dataflow nodes belonging to the same partition
+        # are connected to each other and there is a single input/output to/from each.
+        all_nodes = list(model.graph.node)
+        df_nodes = list(
+            filter(lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes)
+        )
+        dma_nodes = list(filter(lambda x: x.op_type == "IODMA", df_nodes))
+
+        non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes))
+        dyn_tlastmarker_nodes = list(
+            filter(
+                # DynIters is an integer node attribute (1 = dynamic, 0 = static)
+                lambda x: x.op_type == "TLastMarker"
+                and getCustomOp(x).get_nodeattr("DynIters") == 1,
+                non_dma_nodes,
+            )
+        )
+
+        non_dma_nodes = list(
+            filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)
+        )
+
+        for node in dma_nodes:
+            node_inst = getCustomOp(node)
+            node_inst.set_nodeattr("partition_id", target_partition_id)
+            target_partition_id += 1
+
+        for node in dyn_tlastmarker_nodes:
+            node_inst = getCustomOp(node)
+            node_inst.set_nodeattr("partition_id", target_partition_id)
+            target_partition_id += 1
+
+        for node in non_dma_nodes:
+            # TODO: implement proper floorplanning; for now just a single partition
+            node_inst = getCustomOp(node)
+            node_inst.set_nodeattr("partition_id", target_partition_id)
+
+        return (model, False)
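+
+
+# Sketch of the resulting assignment for a hypothetical graph with two IODMAs
+# and one dynamic TLastMarker: IODMA_0 -> partition 0, IODMA_1 -> partition 1,
+# TLastMarker_0 -> partition 2, all remaining dataflow nodes -> partition 3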
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index b01f8cbe5c48db6c5288b2db1a8b009ea09ce6c0..6f7fde0c4faba09e584eb578819f44c18639bc9d 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -118,8 +118,11 @@ class InsertFIFO(Transformation):
                         graph_modified = True
 
         if graph_modified is False:
-            # insert FIFO as first node
-            if graph.node[0].op_type != "StreamingFIFO":
+            # insert FIFO as first node, except when first node is DMA
+            if (
+                graph.node[0].op_type != "StreamingFIFO"
+                and graph.node[0].op_type != "IODMA"
+            ):
                 n = graph.node[0]
                 n_input = n.input[0]
                 n0 = getCustomOp(n)
@@ -153,8 +156,11 @@ class InsertFIFO(Transformation):
                 # set fifo output tensor as new input tensor of second node
                 n.input[0] = fifo_output_tensor.name
 
-            # insert FIFO as last node
-            if graph.node[-1].op_type != "StreamingFIFO":
+            # insert FIFO as last node, except when last node is DMA
+            if (
+                graph.node[-1].op_type != "StreamingFIFO"
+                and graph.node[-1].op_type != "IODMA"
+            ):
                 n = graph.node[-1]
                 assert (
                     n.op_type != "TLastMarker"
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
new file mode 100644
index 0000000000000000000000000000000000000000..72e5ec4fdd721ecf549adaf7ddd38db4636bce27
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from onnx import TensorProto
+from onnx import helper as oh
+
+from finn.util.basic import get_by_name
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.transformation.general import SortGraph
+import finn.core.data_layout as DataLayout
+import math
+import numpy as np
+
+
+class InsertIODMA(Transformation):
+    """Insert DMA nodes on all inputs and outputs."""
+
+    def __init__(self, max_intfwidth=32):
+        super().__init__()
+        assert (
+            2 ** math.log2(max_intfwidth) == max_intfwidth
+        ), "max_intfwidth must be a power of 2"
+        self.max_intfwidth = max_intfwidth
+
+    def apply(self, model):
+        # only makes sense for a pure fpgadataflow graph -- so we check!
+        all_nodes = list(model.graph.node)
+        assert all(
+            get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
+            for x in all_nodes
+        ), "Not a pure fpgadataflow graph; all nodes need the backend attribute"
+        # parse streamingfclayers looking for external weights with no attached IODMA
+        fc_extw_nodes = list(
+            filter(
+                lambda x: x.op_type == "StreamingFCLayer_Batch"
+                and get_by_name(x.attribute, "mem_mode") is not None
+                and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external"
+                and model.find_producer(x.input[1]) is None,
+                all_nodes,
+            )
+        )
+        graph_in_name = model.graph.input[0].name
+        first_node = model.find_consumer(graph_in_name)
+        graph_out_name = model.graph.output[0].name
+        final_node = model.find_producer(graph_out_name)
+        if (
+            final_node.op_type == "IODMA"
+            and first_node.op_type == "IODMA"
+            and len(fc_extw_nodes) == 0
+        ):
+            # TODO maybe check the correctness of properties
+            return (model, False)
+        else:
+            if final_node.op_type != "IODMA":
+                # check if tensor is NHWC
+                assert (
+                    model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
+                    or model.get_tensor_layout(graph_out_name) == DataLayout.NC
+                ), "Data layout of output tensor must be NHWC or NC"
+                out_shape = model.get_tensor_shape(graph_out_name)
+                out_dtype = model.get_tensor_datatype(graph_out_name)
+                # determine the feasible interface width
+                transfer_bits = np.prod(out_shape) * out_dtype.bitwidth()
+                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
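+                # e.g. transfer_bits = 6144, max_intfwidth = 32 -> intfwidth = 32;
+                # an odd transfer size yields intfwidth < 8, failing the check below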
+                assert (
+                    intfwidth % 8 == 0
+                ), "No feasible interface width for transfer size"
+                # get width of stream input to DMA
+                streamWidth = getCustomOp(final_node).get_outstream_width()
+                # make new buffer
+                final_node_out = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                )
+                model.graph.value_info.append(final_node_out)
+                model.set_tensor_datatype(final_node_out.name, out_dtype)
+                # reroute final node output to final_node_out_name
+                final_node.output[0] = final_node_out.name
+                dma_node = oh.make_node(
+                    "IODMA",
+                    [final_node_out.name],
+                    [graph_out_name],
+                    numInputVectors=out_shape[:-1],
+                    NumChannels=out_shape[-1],
+                    dataType=str(out_dtype.name),
+                    intfWidth=intfwidth,
+                    streamWidth=streamWidth,
+                    direction="out",
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                model.graph.node.append(dma_node)
+            if first_node.op_type != "IODMA":
+                # check if tensor is NHWC
+                assert (
+                    model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
+                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
+                ), "Data layout of input tensor must be NHWC or NC"
+                in_shape = model.get_tensor_shape(graph_in_name)
+                in_dtype = model.get_tensor_datatype(graph_in_name)
+                # determine the feasible interface width
+                transfer_bits = np.prod(in_shape) * in_dtype.bitwidth()
+                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+                assert (
+                    intfwidth % 8 == 0
+                ), "No feasible interface width for transfer size"
+                # get width of stream output from DMA
+                streamWidth = getCustomOp(first_node).get_instream_width()
+                # make new buffer
+                first_node_in = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
+                )
+                model.graph.value_info.append(first_node_in)
+                model.set_tensor_datatype(first_node_in.name, in_dtype)
+                # reroute first node input to first_node_in
+                first_node.input[0] = first_node_in.name
+                dma_node = oh.make_node(
+                    "IODMA",
+                    [graph_in_name],
+                    [first_node_in.name],
+                    numInputVectors=in_shape[:-1],
+                    NumChannels=in_shape[-1],
+                    dataType=str(in_dtype.name),
+                    intfWidth=intfwidth,
+                    streamWidth=streamWidth,
+                    direction="in",
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                model.graph.node.insert(0, dma_node)
+            for fc_node in fc_extw_nodes:
+                # check if tensor is NHWC
+                assert (
+                    model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC
+                    or model.get_tensor_layout(fc_node.input[1]) == DataLayout.NC
+                ), "Data layout of tensors must be NHWC or NC"
+                fc_w_name = fc_node.input[1]
+                w_shape = model.get_tensor_shape(fc_w_name)
+                w_dtype = model.get_tensor_datatype(fc_w_name)
+                # determine the feasible interface width
+                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+                assert (
+                    intfwidth % 8 == 0
+                ), "No feasible interface width for transfer size"
+                # calculate width of stream output from DMA
+                pe = get_by_name(fc_node.attribute, "PE").i
+                simd = get_by_name(fc_node.attribute, "SIMD").i
+                assert pe * simd == w_shape[0], "Malformed weight matrix"
+                streamWidth = simd * pe * w_dtype.bitwidth()
+                # make new buffer
+                fc_node_in = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
+                )
+                model.graph.value_info.append(fc_node_in)
+                model.set_tensor_datatype(fc_node_in.name, w_dtype)
+                model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
+                dma_node = oh.make_node(
+                    "IODMA",
+                    [fc_w_name],
+                    [fc_node_in.name],
+                    numInputVectors=[w_shape[1]],
+                    NumChannels=w_shape[0],
+                    dataType=str(w_dtype.name),
+                    intfWidth=intfwidth,
+                    streamWidth=streamWidth,
+                    direction="in",
+                    burstMode="wrap",
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                fc_node.input[1] = fc_node_in.name
+                model.graph.node.insert(0, dma_node)
+            model = model.transform(SortGraph())
+            return (model, True)
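+
+
+# Minimal usage sketch (assumes a pure fpgadataflow graph with inferred layouts):
+#     model = model.transform(InsertIODMA(max_intfwidth=64))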
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 32f32ece585a93465ba32fede45d5eb606a2b0a3..bbb0e43fda464e919a7d8c9dcd25e08a49b33cec 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -31,23 +31,35 @@ from onnx import helper as oh
 
 from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
+from finn.util.basic import get_by_name
+
+import numpy as np
 
 
 class InsertTLastMarker(Transformation):
-    """Ensure that the graph is terminated with a TLastMarker node, inserting
-    one if necessary."""
+    """Ensure that the graph is started/terminated with a TLastMarker node, inserting
+    one if necessary.
+    Use constructor args to determine the type of TLastMarker to be inserted.
+    More information is available in the TLastMarker documentation.
+    """
 
-    def __init__(self):
+    def __init__(self, both=False, external=True, dynamic=True):
         super().__init__()
+        self.dyniters = dynamic
+        self.external = external
+        self.both = both
 
     def apply(self, model):
         # TODO only makes sense for a pure fpgadataflow graph -- check!
         graph_out_name = model.graph.output[0].name
         final_node = model.find_producer(graph_out_name)
-        if final_node.op_type == "TLastMarker":
-            # TODO maybe check the correctness of properties
-            return (model, False)
-        else:
+        graph_modified = False
+        if final_node.op_type != "TLastMarker" and not (
+            final_node.op_type == "IODMA"
+            and get_by_name(final_node.attribute, "direction").s.decode("UTF-8")
+            == "out"
+        ):
+
             custom_op = getCustomOp(final_node)
             num_iters = int(custom_op.get_number_output_values())
             stream_width = int(custom_op.get_outstream_width())
@@ -69,8 +81,88 @@ class InsertTLastMarker(Transformation):
                 NumIters=num_iters,
                 StreamWidth=stream_width,
                 ElemWidth=elem_width,
+                DynIters=(1 if self.dyniters else 0),
+                Direction="out",
+                Protocol=("external" if self.external else "internal"),
                 domain="finn",
                 backend="fpgadataflow",
             )
             model.graph.node.append(tlast_node)
-            return (model, True)
+            graph_modified = True
+        # if both is True, also insert marker on input
+        if self.both:
+            # detect and parse graph inputs
+            insert_idx = 0
+            graph_in_names = [x.name for x in model.graph.input]
+            for graph_in_name in graph_in_names:
+                first_node = model.find_consumers(graph_in_name)
+                # skip if no consumers (this may be the case for unused initializers)
+                # TODO: fix this with a cleanup transform
+                if first_node is None:
+                    continue
+                assert len(first_node) == 1, "Input fans out to multiple nodes"
+                first_node = first_node[0]
+                # several scenarios exclude the node:
+                # 1. node is a FC layer with internal weights, in which case
+                #    the input is in the list of graph inputs because it has an
+                #    initializer (TODO: fix this with a clean-up transform)
+                if (
+                    first_node.op_type == "StreamingFCLayer_Batch"
+                    and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8")
+                    != "external"
+                ):
+                    continue
+                # 2. node is already a TLastMarker or an input IODMA, in which
+                #    case no new marker is needed
+                if first_node.op_type != "TLastMarker" and not (
+                    first_node.op_type == "IODMA"
+                    and get_by_name(first_node.attribute, "direction").s.decode("UTF-8")
+                    == "in"
+                ):
+
+                    custom_op = getCustomOp(first_node)
+                    num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
+                    inp_idx = list(first_node.input).index(graph_in_name)
+                    if inp_idx > 0:
+                        if (
+                            first_node.op_type == "StreamingFCLayer_Batch"
+                            and inp_idx == 1
+                        ):
+                            stream_width = int(custom_op.get_weightstream_width())
+                        elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1:
+                            stream_width = int(custom_op.get_instream_width())
+                        else:
+                            raise Exception("No method to determine stream width")
+                    else:
+                        stream_width = int(custom_op.get_instream_width())
+                    in_shape = model.get_tensor_shape(graph_in_name)
+                    in_dtype = model.get_tensor_datatype(graph_in_name)
+                    elem_width = in_dtype.bitwidth()
+                    # make new buffer
+                    first_node_in = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
+                    )
+                    model.graph.value_info.append(first_node_in)
+                    model.set_tensor_datatype(first_node_in.name, in_dtype)
+                    ini = model.get_initializer(graph_in_name)
+                    # copy initializer if it exists
+                    if ini is not None:
+                        model.set_initializer(first_node_in.name, ini)
+                    # reroute first node input to first_node_in
+                    first_node.input[inp_idx] = first_node_in.name
+                    tlast_node = oh.make_node(
+                        "TLastMarker",
+                        [graph_in_name],
+                        [first_node_in.name],
+                        NumIters=num_iters,
+                        StreamWidth=stream_width,
+                        ElemWidth=elem_width,
+                        DynIters=(1 if self.dyniters else 0),
+                        Direction="in",
+                        Protocol=("external" if self.external else "internal"),
+                        domain="finn",
+                        backend="fpgadataflow",
+                    )
+                    model.graph.node.insert(insert_idx, tlast_node)
+                    graph_modified = True
+                    insert_idx += 1
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 18d3db18da089a5dda4dbb6d97180dd4a20613b5..fc326b4a25a9784f3919b4246ec2b8f54fb881f4 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -26,9 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import shutil
 
+import shutil
 from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
 from finn.util.basic import gen_finn_dt_tensor, get_finn_root, make_build_dir
@@ -42,19 +41,18 @@ class MakePYNQDriver(Transformation):
     accelerator, including data packing/unpacking. The MakePYNQProject
     transformation must have been already applied.
 
+    platform: one of ["zynq", "zynq-iodma", "alveo"]
+
     Outcome if successful: sets the pynq_driver_dir attribute in the ONNX
     ModelProto's metadata_props field, with the created driver dir as the
     value.
     """
 
-    def __init__(self):
+    def __init__(self, platform):
         super().__init__()
+        self.platform = platform
 
     def apply(self, model):
-        vivado_pynq_proj = model.get_metadata_prop("vivado_pynq_proj")
-        if vivado_pynq_proj is None or (not os.path.isdir(vivado_pynq_proj)):
-            raise Exception("No PYNQ project found, apply MakePYNQProject first.")
-
         # create a temporary folder for the generated driver
         pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
         model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
@@ -67,11 +65,21 @@ class MakePYNQDriver(Transformation):
         o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
         i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
         o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
-        # extract HLSCustomOp instances to get folded i/o shapes
-        first_node = getCustomOp(model.find_consumer(i_tensor_name))
-        last_node = getCustomOp(model.find_producer(o_tensor_name))
-        i_tensor_shape_folded = tuple(first_node.get_folded_input_shape())
-        o_tensor_shape_folded = tuple(last_node.get_folded_output_shape())
+        # handle folded i/o shapes due to differences in DMA engines
+        if self.platform == "zynq":
+            # extract HLSCustomOp instances to get folded i/o shapes
+            first_node = getCustomOp(model.find_consumer(i_tensor_name))
+            last_node = getCustomOp(model.find_producer(o_tensor_name))
+            i_tensor_shape_folded = tuple(first_node.get_folded_input_shape())
+            o_tensor_shape_folded = tuple(last_node.get_folded_output_shape())
+        else:
+            i_tensor_shape_folded = list(i_tensor_shape_normal)
+            i_tensor_shape_folded.insert(-1, 1)
+            i_tensor_shape_folded = tuple(i_tensor_shape_folded)
+            o_tensor_shape_folded = list(o_tensor_shape_normal)
+            o_tensor_shape_folded.insert(-1, 1)
+            o_tensor_shape_folded = tuple(o_tensor_shape_folded)
+
         # generate dummy folded i/o tensors and their packed versions
         i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded)
         o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded)
@@ -98,6 +106,7 @@ class MakePYNQDriver(Transformation):
             ret = ret.replace("[1,", "[%s," % batch_var_name)
             return ret
 
+        driver = driver.replace("$PLATFORM$", self.platform)
         driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
         driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
         driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
@@ -108,7 +117,12 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
 
         # clock settings for driver
-        clk_ns = float(model.get_metadata_prop("clk_ns"))
+        clk_ns = model.get_metadata_prop("clk_ns")
+        # default to 10ns / 100 MHz if property not set
+        if clk_ns is None:
+            clk_ns = 10.0
+        else:
+            clk_ns = float(clk_ns)
         fclk_mhz = 1 / (clk_ns * 0.001)
         # TODO change according to PYNQ board?
         driver = driver.replace("$CLK_NAME$", "fclk0_mhz")
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index 91f6bd2c4ab19c736fcf21322979cac17a163f24..5e45d6f230503668a15d784e3c6afa45560fe004 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -67,6 +67,16 @@ class MakePYNQProject(Transformation):
             raise Exception(
                 "No vlnv for stitched IP found, apply CreateStitchedIP first."
             )
+        vivado_stitch_ifnames = model.get_metadata_prop("vivado_stitch_ifnames")
+        if vivado_stitch_ifnames is None:
+            raise Exception("No IF name metadata found, apply CreateStitchedIP first.")
+        vivado_stitch_ifnames = eval(vivado_stitch_ifnames)
+        # recover interface names from dict
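+        # expected dict shape (interface names here are just examples):
+        # {"clk": ["ap_clk"], "rst": ["ap_rst_n"], "s_axis": ["s_axis_0"],
+        #  "m_axis": ["m_axis_0"], "axilite": ["s_axi_control_0"], ...}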
+        self.clk_name = vivado_stitch_ifnames["clk"][0]
+        self.rst_name = vivado_stitch_ifnames["rst"][0]
+        self.s_axis_if_name = vivado_stitch_ifnames["s_axis"][0]
+        self.m_axis_if_name = vivado_stitch_ifnames["m_axis"][0]
+        self.s_aximm_if_name = vivado_stitch_ifnames["axilite"][0]
 
         # collect list of all IP dirs
         ip_dirs = ["list"]
@@ -105,11 +115,11 @@ class MakePYNQProject(Transformation):
         multiple of 8."""
         in_bytes = i_bits_per_cycle_padded / 8
         out_bytes = o_bits_per_cycle_padded / 8
-        in_if_name = "in0_V_V_0"
-        out_if_name = "out_r_0"
-        clk_name = "ap_clk_0"
-        nrst_name = "ap_rst_n_0"
-        axi_lite_if_name = "s_axi_control_0"
+        in_if_name = self.s_axis_if_name
+        out_if_name = self.m_axis_if_name
+        clk_name = self.clk_name
+        nrst_name = self.rst_name
+        axi_lite_if_name = self.s_aximm_if_name
         vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="")
 
         # create a temporary folder for the project
@@ -118,6 +128,8 @@ class MakePYNQProject(Transformation):
         # filename for the synth utilization report
         synth_report_filename = vivado_pynq_proj_dir + "/synth_report.xml"
         model.set_metadata_prop("vivado_synth_rpt", synth_report_filename)
+        # set platform attribute for correct remote execution
+        model.set_metadata_prop("platform", "zynq")
 
         # get metadata property clk_ns to calculate clock frequency
         clk_ns = float(model.get_metadata_prop("clk_ns"))
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
new file mode 100644
index 0000000000000000000000000000000000000000..1558d7399fe5399053c3f347cd06c4e0d76753e7
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -0,0 +1,318 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import subprocess
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import get_by_name, make_build_dir
+from finn.util.basic import get_num_default_workers
+from finn.util.basic import pynq_part_map
+
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from shutil import copy
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+
+from . import templates
+
+
+def collect_ip_dirs(model, ipstitch_path):
+    # collect list of all IP dirs
+    ip_dirs = []
+    for node in model.graph.node:
+        ip_dir_attribute = get_by_name(node.attribute, "ip_path")
+        assert (
+            ip_dir_attribute is not None
+        ), """Node attribute "ip_path" is
+        empty. Please run transformation HLSSynthIP first."""
+        ip_dir_value = ip_dir_attribute.s.decode("UTF-8")
+        assert os.path.isdir(
+            ip_dir_value
+        ), """The directory that should
+        contain the generated ip blocks doesn't exist."""
+        ip_dirs += [ip_dir_value]
+    ip_dirs += [ipstitch_path + "/ip"]
+    return ip_dirs
+
+
+class MakeZYNQProject(Transformation):
+    """Create a Vivado overlay project (including the shell infrastructure)
+    from the already-stitched IP block for this graph.
+    All nodes in the graph must have the fpgadataflow backend attribute,
+    and the CreateStitchedIP transformation must have been previously run on
+    the graph. This is functionally equivalent to MakePYNQProject but does
+    not use the PYNQ shell infrastructure; instead it creates a fully custom
+    block design. However, this transform requires DMAs in the accelerator design.
+
+    Outcome if successful: sets the vivado_pynq_proj attribute in the ONNX
+    ModelProto's metadata_props field, with the created project dir as the
+    value.
+    """
+
+    def __init__(self, platform, enable_debug=False):
+        super().__init__()
+        self.platform = platform
+        self.enable_debug = 1 if enable_debug else 0
+
+    def apply(self, model):
+
+        # create a config file and empty list of xo files
+        config = []
+        idma_idx = 0
+        odma_idx = 0
+        aximm_idx = 0
+        axilite_idx = 0
+        global_clk_ns = 0
+        instance_names = {}
+        for node in model.graph.node:
+            assert node.op_type == "StreamingDataflowPartition", "Invalid link graph"
+            sdp_node = getCustomOp(node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            kernel_model = ModelWrapper(dataflow_model_filename)
+
+            ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj")
+            if ipstitch_path is None or (not os.path.isdir(ipstitch_path)):
+                raise Exception(
+                    "No stitched IPI design found for %s, apply CreateStitchedIP first."
+                    % node.name
+                )
+
+            vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv")
+            if vivado_stitch_vlnv is None:
+                raise Exception(
+                    "No vlnv found for %s, apply CreateStitchedIP first." % node.name
+                )
+
+            ip_dirs = ["list"]
+            ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path)
+            ip_dirs_str = "[%s]" % (" ".join(ip_dirs))
+            config.append(
+                "set_property ip_repo_paths "
+                "[concat [get_property ip_repo_paths [current_project]] %s] "
+                "[current_project]" % ip_dirs_str
+            )
+            config.append("update_ip_catalog -rebuild -scan_changes")
+
+            # get metadata property clk_ns to calculate clock frequency
+            clk_ns = float(kernel_model.get_metadata_prop("clk_ns"))
+            if clk_ns > global_clk_ns:
+                global_clk_ns = clk_ns
+
+            # gather info on connectivity
+            # assume each node connected to outputs/inputs is DMA:
+            # has axis, aximm and axilite
+            # everything else is axis-only
+            # assume only one connection from each ip to the next
+            # all aximm allocated to DDR[0]
+            # all kernels allocated to SLR0
+            producer = model.find_producer(node.input[0])
+            consumer = model.find_consumers(node.output[0])
+            # define kernel instances
+            # name kernels connected to graph inputs as idmaxx
+            # name kernels connected to graph outputs as odmaxx
+            if producer is None or consumer is None:
+                if producer is None:
+                    instance_names[node.name] = "idma" + str(idma_idx)
+                    idma_idx += 1
+                elif consumer is None:
+                    instance_names[node.name] = "odma" + str(odma_idx)
+                    odma_idx += 1
+                config.append(
+                    "create_bd_cell -type ip -vlnv %s %s"
+                    % (vivado_stitch_vlnv, instance_names[node.name])
+                )
+                config.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/m_axi_gmem0] "
+                    "[get_bd_intf_pins smartconnect_0/S%02d_AXI]"
+                    % (instance_names[node.name], aximm_idx)
+                )
+                config.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/s_axi_control] "
+                    "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]"
+                    % (instance_names[node.name], axilite_idx)
+                )
+                aximm_idx += 1
+                axilite_idx += 1
+            else:
+                instance_names[node.name] = node.name
+                config.append(
+                    "create_bd_cell -type ip -vlnv %s %s"
+                    % (vivado_stitch_vlnv, instance_names[node.name])
+                )
+            config.append(
+                "connect_bd_net [get_bd_pins %s/ap_clk] "
+                "[get_bd_pins smartconnect_0/aclk]" % instance_names[node.name]
+            )
+            config.append(
+                "connect_bd_net [get_bd_pins %s/ap_rst_n] "
+                "[get_bd_pins smartconnect_0/aresetn]" % instance_names[node.name]
+            )
+            # connect streams
+            if producer is not None:
+                for i in range(len(node.input)):
+                    producer = model.find_producer(node.input[i])
+                    if producer is not None:
+                        j = list(producer.output).index(node.input[i])
+                        config.append(
+                            "connect_bd_intf_net [get_bd_intf_pins %s/s_axis_%d] "
+                            "[get_bd_intf_pins %s/m_axis_%d]"
+                            % (
+                                instance_names[node.name],
+                                i,
+                                instance_names[producer.name],
+                                j,
+                            )
+                        )
+
+        # create a temporary folder for the project
+        vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_")
+        model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir)
+
+        fclk_mhz = int(1 / (global_clk_ns * 0.001))
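+        # e.g. global_clk_ns = 10.0 -> fclk_mhz = 100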
+
+        # create a TCL recipe for the project
+        ipcfg = vivado_pynq_proj_dir + "/ip_config.tcl"
+        config = "\n".join(config) + "\n"
+        with open(ipcfg, "w") as f:
+            f.write(
+                templates.custom_zynq_shell_template
+                % (
+                    fclk_mhz,
+                    axilite_idx,
+                    aximm_idx,
+                    self.platform,
+                    pynq_part_map[self.platform],
+                    config,
+                    self.enable_debug,
+                    get_num_default_workers(),
+                )
+            )
+
+        # create a shell script that launches Vivado on the TCL recipe
+        synth_project_sh = vivado_pynq_proj_dir + "/synth_project.sh"
+        working_dir = os.environ["PWD"]
+        with open(synth_project_sh, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(vivado_pynq_proj_dir))
+            f.write("vivado -mode tcl -source %s\n" % ipcfg)
+            f.write("cd {}\n".format(working_dir))
+
+        # call the synthesis script
+        bash_command = ["bash", synth_project_sh]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        bitfile_name = (
+            vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit"
+        )
+        if not os.path.isfile(bitfile_name):
+            raise Exception("Synthesis failed, no bitfile found")
+        deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit"
+        copy(bitfile_name, deploy_bitfile_name)
+        # set bitfile attribute
+        model.set_metadata_prop("vivado_pynq_bitfile", deploy_bitfile_name)
+        # set platform attribute for correct remote execution
+        model.set_metadata_prop("platform", "zynq-iodma")
+        hwh_name = (
+            vivado_pynq_proj_dir
+            + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh"
+        )
+        if not os.path.isfile(hwh_name):
+            raise Exception("Synthesis failed, no hardware handoff file found")
+        deploy_hwh_name = vivado_pynq_proj_dir + "/resizer.hwh"
+        copy(hwh_name, deploy_hwh_name)
+        # filename for the synth utilization report
+        synth_report_filename = vivado_pynq_proj_dir + "/synth_report.xml"
+        model.set_metadata_prop("vivado_synth_rpt", synth_report_filename)
+        return (model, False)
+
+
+class ZynqBuild(Transformation):
+    """Best-effort attempt at building the accelerator for Zynq."""
+
+    def __init__(self, platform, period_ns, enable_debug=False):
+        super().__init__()
+        self.fpga_part = pynq_part_map[platform]
+        self.period_ns = period_ns
+        self.platform = platform
+        self.enable_debug = enable_debug
+
+    def apply(self, model):
+        # first infer layouts
+        model = model.transform(InferDataLayouts())
+        # prepare at global level, then break up into kernels
+        prep_transforms = [
+            MakePYNQDriver(platform="zynq-iodma"),
+            InsertIODMA(64),
+            InsertDWC(),
+            Floorplan(),
+            CreateDataflowPartition(),
+        ]
+        for trn in prep_transforms:
+            model = model.transform(trn)
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(GiveReadableTensorNames())
+        # Build each kernel individually
+        sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition")
+        for sdp_node in sdp_nodes:
+            sdp_node = getCustomOp(sdp_node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            kernel_model = ModelWrapper(dataflow_model_filename)
+            kernel_model = kernel_model.transform(InsertFIFO())
+            kernel_model = kernel_model.transform(GiveUniqueNodeNames())
+            kernel_model.save(dataflow_model_filename)
+            kernel_model = kernel_model.transform(
+                PrepareIP(self.fpga_part, self.period_ns)
+            )
+            kernel_model = kernel_model.transform(HLSSynthIP())
+            kernel_model = kernel_model.transform(ReplaceVerilogRelPaths())
+            kernel_model = kernel_model.transform(
+                CreateStitchedIP(
+                    self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True
+                )
+            )
+            kernel_model.save(dataflow_model_filename)
+        # Assemble design from IPs
+        model = model.transform(
+            MakeZYNQProject(self.platform, enable_debug=self.enable_debug)
+        )
+        return (model, False)
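+
+
+# Usage sketch, assuming a board name present in pynq_part_map (e.g. "Pynq-Z1"):
+#     model = model.transform(ZynqBuild(platform="Pynq-Z1", period_ns=10.0))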
diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
index a1524322ec03a4e96ef41f999144e3eed349c5af..6eae560e1191642cfaf85d92c6d0fcf644630973 100644
--- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
@@ -29,9 +29,12 @@
 import os
 
 import finn.custom_op.registry as registry
-from finn.transformation import Transformation
 from finn.util.basic import make_build_dir
 from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.transformation import Transformation
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
+import copy
 
 
 def _codegen_single_node(node, model):
@@ -66,8 +69,39 @@ class PrepareCppSim(Transformation):
     that contains generated C++ code that can be used to simulate node using cppsim.
     The subsequent transformation is CompileCppSim"""
 
+    def __init__(self, num_workers=None):
+        super().__init__()
+        if num_workers is None:
+            self._num_workers = get_num_default_workers()
+        else:
+            self._num_workers = num_workers
+        assert self._num_workers >= 0, "Number of workers must be nonnegative."
+        if self._num_workers == 0:
+            self._num_workers = mp.cpu_count()
+
+    def prepareCppSim_node(self, node):
+        if is_fpgadataflow_node(node) is True:
+            _codegen_single_node(node, self.model)
+        return (node, False)
+
     def apply(self, model):
-        for node in model.graph.node:
-            if is_fpgadataflow_node(node) is True:
-                _codegen_single_node(node, model)
-        return (model, False)
+        # Remove old nodes from the current model
+        self.model = copy.deepcopy(model)
+        old_nodes = []
+        for _ in range(len(model.graph.node)):
+            old_nodes.append(model.graph.node.pop())
+
+        # Execute transformation in parallel
+        with mp.Pool(self._num_workers) as p:
+            new_nodes_and_bool = p.map(self.prepareCppSim_node, old_nodes, chunksize=1)
+
+        # extract nodes and check if the transformation needs to run again
+        # Note: .pop() had initially reversed the node order
+        run_again = False
+        for node, run in reversed(new_nodes_and_bool):
+            # Reattach new nodes to old model
+            model.graph.node.append(node)
+            if run is True:
+                run_again = True
+
+        return (model, run_again)
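+
+
+# Usage sketch: num_workers=None falls back to get_num_default_workers(),
+# num_workers=0 uses all available CPU cores:
+#     model = model.transform(PrepareCppSim(num_workers=4))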
diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fd7e4724ef7f255b1435d5ab5e680d155d39487
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/synth_ooc.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+from shutil import copy2
+
+from finn.transformation import Transformation
+from finn.util.vivado import out_of_context_synth
+from finn.util.basic import make_build_dir
+
+
+class SynthOutOfContext(Transformation):
+    """Run out-of-context Vivado synthesis on a stitched IP design."""
+
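+    # Usage sketch (hypothetical FPGA part string, 10 ns clock period):
+    #     model = model.transform(SynthOutOfContext("xc7z020clg400-1", 10.0))
+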
+    def __init__(self, part, clk_period_ns, clk_name="ap_clk"):
+        super().__init__()
+        self.part = part
+        self.clk_period_ns = clk_period_ns
+        self.clk_name = clk_name
+
+    def apply(self, model):
+        def file_to_basename(x):
+            return os.path.basename(os.path.realpath(x))
+
+        vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
+        assert vivado_stitch_proj_dir is not None, "Need stitched IP to run."
+        top_module_name = model.get_metadata_prop("wrapper_filename")
+        # use splitext rather than strip(".v"), which also eats leading v chars
+        top_module_name = os.path.splitext(file_to_basename(top_module_name))[0]
+        build_dir = make_build_dir("synth_out_of_context_")
+        with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
+            all_verilog_srcs = f.read().split()
+        for file in all_verilog_srcs:
+            if file.endswith(".v"):
+                copy2(file, build_dir)
+        ret = out_of_context_synth(
+            build_dir, top_module_name, self.part, self.clk_name, self.clk_period_ns
+        )
+        model.set_metadata_prop("res_total_ooc_synth", str(ret))
+        return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index ab9fd03251819aee72f74cc0c1fa17b99b1e05a4..3bd74ec6a2071db820a35a9440eedd74092354e1 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -104,9 +104,10 @@ from finn.core.datatype import DataType
 from pynq.ps import Clocks
 
 class FINNAccelDriver():
-    def __init__(self, N, bitfile):
+    def __init__(self, N, bitfile, platform="$PLATFORM$"):
         \"\"\"Instantiate the FINN accelerator driver.
         Gets batchsize (N) as integer and path to bitfile as string.\"\"\"
+        self.platform = platform
         self.N = N
         # input FINN DataType
         self.idt = $INPUT_FINN_DATATYPE$
@@ -119,21 +120,37 @@ class FINNAccelDriver():
         self.oshape_folded = $OUTPUT_SHAPE_FOLDED$
         self.ishape_packed = $INPUT_SHAPE_PACKED$   # datatype np.uint8
         self.oshape_packed = $OUTPUT_SHAPE_PACKED$  # datatype np.uint8
-        # clock frequency
-        self.fclk_mhz = $CLOCK_FREQ_MHZ$
         # load bitfile and set up accelerator
         self.ol = Overlay(bitfile)
-        # set the clock frequency as specified by user during transformations
-        Clocks.$CLK_NAME$ = self.fclk_mhz
-        self.dma = self.ol.axi_dma_0
-        self.ctrl_regs = self.ol.resize_accel_0
         # neuron folding factor of output = iterations per sample
         self.itersPerSample = self.oshape_packed[-2]
-        # AXI lite register offset for number of iterations
-        # used by TLastMarker to signal end of transmission for AXI CDMA
-        self.REG_OFFSET_NUM_ITERS = 0x10
-        # set up TLastMarker with correct num. samples
-        self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample)
+        if self.platform == "zynq":
+            # clock frequency
+            self.fclk_mhz = $CLOCK_FREQ_MHZ$
+            # set the clock frequency as specified by user during transformations
+            if self.fclk_mhz > 0:
+                Clocks.$CLK_NAME$ = self.fclk_mhz
+            self.dma = self.ol.axi_dma_0
+            self.ctrl_regs = self.ol.resize_accel_0
+
+            # AXI lite register offset for number of iterations
+            # used by TLastMarker to signal end of transmission for AXI CDMA
+            self.REG_OFFSET_NUM_ITERS = 0x10
+            # set up TLastMarker with correct num. samples
+            self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample)
+        elif self.platform == "alveo":
+            self.idma = self.ol.idma0
+            self.odma = self.ol.odma0
+        elif self.platform == "zynq-iodma":
+            self.idma = self.ol.idma0
+            self.odma = self.ol.odma0
+            # clock frequency
+            self.fclk_mhz = $CLOCK_FREQ_MHZ$
+            # set the clock frequency as specified by user during transformations
+            if self.fclk_mhz > 0:
+                Clocks.$CLK_NAME$ = self.fclk_mhz
+        else:
+            raise ValueError("Supported platforms are zynq zynq-iodma alveo")
 
         # allocate a PYNQ buffer for the packed input and buffer
         self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
@@ -176,19 +193,42 @@ class FINNAccelDriver():
         np.copyto(self.ibuf_packed_device, data)
 
     def execute(self):
-        \"\"\"Executes accelerator by setting up the DMA and
-        waiting until all transfers complete. Uses only member variables and
+        \"\"\"Executes accelerator by setting up the DMA(s) and
+        waiting until all transfers/calls complete. Uses only member variables and
         returns nothing.\"\"\"
-        dma = self.dma
-        dma.sendchannel.transfer(self.ibuf_packed_device)
-        dma.recvchannel.transfer(self.obuf_packed_device)
-        dma.sendchannel.wait()
-        dma.recvchannel.wait()
+        if self.platform == "zynq":
+            dma = self.dma
+            dma.sendchannel.transfer(self.ibuf_packed_device)
+            dma.recvchannel.transfer(self.obuf_packed_device)
+            dma.sendchannel.wait()
+            dma.recvchannel.wait()
+        elif self.platform == "zynq-iodma":
+            # manually launch IODMAs since signatures are missing
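+            # assumed HLS-style AXI-Lite control map of the IODMA kernels:
+            #   0x00: ap_ctrl (bit 0 = ap_start, bit 1 = ap_done)
+            #   0x10: buffer address, 0x1c: number of samples (numReps)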
+            self.idma.write(0x10, self.ibuf_packed_device.device_address)
+            self.idma.write(0x1c, self.N)
+            self.odma.write(0x10, self.obuf_packed_device.device_address)
+            self.odma.write(0x1c, self.N)
+            self.idma.write(0x00, 1)
+            self.odma.write(0x00, 1)
+            # wait until output IODMA is finished
+            status = self.odma.read(0x00)
+            while status & 0x2 == 0:
+                status = self.odma.read(0x00)
+
+        elif self.platform == "alveo":
+            self.ibuf_packed_device.sync_to_device()
+            self.idma.start(self.ibuf_packed_device, self.N)
+            self.odma.start(self.obuf_packed_device, self.N)
+            self.idma.wait()
+            self.odma.wait()
+            self.obuf_packed_device.sync_from_device()
+
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name')
     parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute")
+    parser.add_argument('--platform', help='Target platform: zynq zynq-iodma alveo', default="zynq")
     parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1)
     parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
     parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy")
@@ -196,13 +236,14 @@ if __name__ == "__main__":
     # parse arguments
     args = parser.parse_args()
     exec_mode = args.exec_mode
+    platform = args.platform
     N = args.batchsize
     bitfile = args.bitfile
     inputfile = args.inputfile
     outputfile = args.outputfile
 
     # instantiate FINN accelerator driver and pass batchsize and bitfile
-    finnDriver = FINNAccelDriver(N, bitfile)
+    finnDriver = FINNAccelDriver(N, bitfile, platform)
 
     # for the remote execution the data from the input npy file has to be loaded,
     # packed and copied to the PYNQ buffer
@@ -258,3 +299,126 @@ if __name__ == "__main__":
 
 
 """
+
+custom_zynq_shell_template = """
+set FREQ_MHZ %s
+set NUM_AXILITE %d
+if {$NUM_AXILITE > 9} {
+    error "Maximum 10 AXI-Lite interfaces supported"
+}
+set NUM_AXIMM %d
+set BOARD %s
+set FPGA_PART %s
+create_project finn_zynq_link ./ -part $FPGA_PART
+
+# set board part repo paths to find PYNQ-Z1/Z2
+set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]]
+set paths_param [get_param board.repoPaths]
+lappend paths_prop /workspace/finn/board_files
+lappend paths_param /workspace/finn/board_files
+set_property BOARD_PART_REPO_PATHS $paths_prop [current_project]
+set_param board.repoPaths $paths_param
+
+if {$BOARD == "ZCU104"} {
+    set_property board_part xilinx.com:zcu104:part0:1.1 [current_project]
+    set ZYNQ_TYPE "zynq_us+"
+} elseif {$BOARD == "Ultra96"} {
+    set ZYNQ_TYPE "zynq_us+"
+} elseif {$BOARD == "Pynq-Z2"} {
+    set ZYNQ_TYPE "zynq_7000"
+} elseif {$BOARD == "Pynq-Z1"} {
+    set ZYNQ_TYPE "zynq_7000"
+    set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project]
+} else {
+    puts "Unrecognized board"
+}
+
+create_bd_design "top"
+if {$ZYNQ_TYPE == "zynq_us+"} {
+    create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ps
+    apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" }  [get_bd_cells zynq_ps]
+    #activate one slave port, deactivate the second master port
+    set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {1}] [get_bd_cells zynq_ps]
+    set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {0}] [get_bd_cells zynq_ps]
+    #set frequency of PS clock (this can't always be exactly met)
+    set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps]
+} elseif {$ZYNQ_TYPE == "zynq_7000"} {
+    create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 zynq_ps
+    apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells zynq_ps]
+    set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps]
+    set_property -dict [list CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps]
+} else {
+    puts "Unrecognized Zynq type"
+}
+
+#instantiate axi interconnect, axi smartconnect
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0
+create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0
+#set number of axilite interfaces, and number of axi master interfaces
+set_property -dict [list CONFIG.NUM_SI $NUM_AXILITE] [get_bd_cells smartconnect_0]
+set_property -dict [list CONFIG.NUM_MI $NUM_AXIMM] [get_bd_cells axi_interconnect_0]
+
+#create reset controller and connect interconnects to PS
+if {$ZYNQ_TYPE == "zynq_us+"} {
+    connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0_FPD]
+    connect_bd_intf_net [get_bd_intf_pins zynq_ps/M_AXI_HPM0_FPD] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/S00_AXI]
+    #connect interconnect clocks and resets
+    apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins axi_interconnect_0/ACLK]
+    apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins axi_interconnect_0/S00_ACLK]
+    apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins zynq_ps/saxihp0_fpd_aclk]
+} elseif {$ZYNQ_TYPE == "zynq_7000"} {
+    connect_bd_intf_net -boundary_type upper [get_bd_intf_pins zynq_ps/M_AXI_GP0] [get_bd_intf_pins axi_interconnect_0/S00_AXI]
+    connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0]
+    apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins axi_interconnect_0/ACLK]
+    apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins axi_interconnect_0/S00_ACLK]
+    apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins zynq_ps/S_AXI_HP0_ACLK]
+}
+connect_bd_net [get_bd_pins axi_interconnect_0/ARESETN] [get_bd_pins smartconnect_0/aresetn]
+
+#custom IP instantiations/connections start here
+%s
+
+# set up debug
+if {%d == 1} {
+    set_property HDL_ATTRIBUTE.DEBUG true [get_bd_intf_nets {idma0_m_axis_0}]
+    set_property HDL_ATTRIBUTE.DEBUG true [get_bd_intf_nets {StreamingDataflowPartition_1_m_axis_0}]
+    set_property HDL_ATTRIBUTE.DEBUG true [get_bd_intf_nets {smartconnect_0_M00_AXI}]
+    apply_bd_automation -rule xilinx.com:bd_rule:debug -dict [list \
+                                                              [get_bd_intf_nets smartconnect_0_M00_AXI] {AXI_R_ADDRESS "Data and Trigger" AXI_R_DATA "Data and Trigger" AXI_W_ADDRESS "Data and Trigger" AXI_W_DATA "Data and Trigger" AXI_W_RESPONSE "Data and Trigger" CLK_SRC "/zynq_ps/FCLK_CLK0" SYSTEM_ILA "Auto" APC_EN "0" } \
+                                                              [get_bd_intf_nets idma0_m_axis_0] {AXIS_SIGNALS "Data and Trigger" CLK_SRC "/zynq_ps/FCLK_CLK0" SYSTEM_ILA "Auto" APC_EN "0" } \
+                                                              [get_bd_intf_nets StreamingDataflowPartition_1_m_axis_0] {AXIS_SIGNALS "Data and Trigger" CLK_SRC "/zynq_ps/FCLK_CLK0" SYSTEM_ILA "Auto" APC_EN "0" } \
+                                                             ]
+}
+
+#finalize clock and reset connections for interconnects
+set i 0
+while {$i < $NUM_AXILITE} {
+    apply_bd_automation -quiet -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins axi_interconnect_0/M0${i}_ACLK]
+    incr i
+}
+
+save_bd_design
+assign_bd_address
+validate_bd_design
+
+set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [ get_files top.bd ]
+make_wrapper -files [get_files top.bd] -import -fileset sources_1 -top
+
+set_property strategy Flow_PerfOptimized_high [get_runs synth_1]
+set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1]
+set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1]
+set_property strategy Performance_ExtraTimingOpt [get_runs impl_1]
+set_property STEPS.OPT_DESIGN.ARGS.DIRECTIVE Explore [get_runs impl_1]
+set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1]
+set_property STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1]
+set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED true [get_runs impl_1]
+
+# out-of-context synth can't be used for bitstream generation
+# set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} -value {-mode out_of_context} -objects [get_runs synth_1]
+launch_runs -to_step write_bitstream impl_1 -jobs %d
+wait_on_run [get_runs impl_1]
+
+# generate synthesis report
+open_run synth_1 -name synth_1
+report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -format xml
+"""
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
new file mode 100644
index 0000000000000000000000000000000000000000..2df58c537250c102ee85a685fc32904ee879e38f
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -0,0 +1,322 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import subprocess
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation import Transformation
+from finn.custom_op.registry import getCustomOp
+
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.util.basic import make_build_dir
+from finn.transformation.infer_data_layouts import InferDataLayouts
+
+
+def _check_vitis_envvars():
+    assert "VITIS_PATH" in os.environ, "VITIS_PATH must be set for Vitis"
+    assert (
+        "PLATFORM_REPO_PATHS" in os.environ
+    ), "PLATFORM_REPO_PATHS must be set for Vitis"
+    assert (
+        "XILINX_XRT" in os.environ
+    ), "XILINX_XRT must be set for Vitis, ensure the XRT env is sourced"
+
+
+class CreateVitisXO(Transformation):
+    """Create a Vitis object file from a stitched FINN ip.
+
+    Outcome if successful: sets the vitis_xo attribute in the ONNX
+    ModelProto's metadata_props field with the full path of the object file
+    as value. The object file is created in the stitched IP project directory.
+    """
+
+    def __init__(self, ip_name="finn_design"):
+        super().__init__()
+        self.ip_name = ip_name
+
+    def apply(self, model):
+        _check_vitis_envvars()
+        vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
+        stitched_ip_dir = vivado_proj_dir + "/ip"
+        args_string = []
+        m_axis_idx = 0
+        s_axis_idx = 0
+        # NOTE: this assumes the graph is Vitis-compatible:
+        # at most one AXI-Lite interface
+        # developed from instructions in UG1393 (v2019.2) and package_xo documentation
+        # package_xo is responsible for generating the kernel xml
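+        # each arg string appears to follow the package_xo field order
+        # {name:addressQualifier:id:port:size:offset:type:memSize}, where the
+        # address qualifier is 0 for scalars, 1 for global memory pointers and
+        # 4 for AXI streams (see the package_xo documentation for details);
+        # the ap_uint&lt;N> types are XML-escaped since package_xo embeds
+        # these argument descriptions in the generated kernel xml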
+        for node in model.graph.node:
+            node_inst = getCustomOp(node)
+            arg_id = 0
+            if node.op_type == "TLastMarker":
+                stream_width = node_inst.get_nodeattr("StreamWidth")
+                # add a stream input or output port, based on direction
+                if node_inst.get_nodeattr("Direction") == "in":
+                    args_string.append(
+                        "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
+                        % (str(arg_id), s_axis_idx, str(stream_width))
+                    )
+                    s_axis_idx += 1
+                else:
+                    args_string.append(
+                        "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
+                        % (str(arg_id), m_axis_idx, str(stream_width))
+                    )
+                    m_axis_idx += 1
+                arg_id += 1
+                # if dynamic, add an AXI-Lite port carrying the count parameter
+                if node_inst.get_nodeattr("DynIters") == 1:
+                    args_string.append(
+                        "{numReps:0:%s:s_axi_control:0x4:0x10:uint:0}" % str(arg_id)
+                    )
+                    arg_id += 1
+            elif node.op_type == "IODMA":
+                port_width = node_inst.get_nodeattr("intfWidth")
+                # add an address parameter
+                # add a count parameter
+                args_string.append(
+                    "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint&lt;%s>*:0}"
+                    % (str(arg_id), str(port_width))
+                )
+                arg_id += 1
+                args_string.append(
+                    "{numReps:0:%s:s_axi_control:0x4:0x1C:uint:0}" % str(arg_id)
+                )
+                arg_id += 1
+
+        # save kernel xml then run package_xo
+        xo_name = self.ip_name + ".xo"
+        xo_path = vivado_proj_dir + "/" + xo_name
+        model.set_metadata_prop("vitis_xo", xo_path)
+
+        # generate the package_xo command in a tcl script
+        package_xo_string = (
+            "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s"
+            % (xo_path, self.ip_name, stitched_ip_dir)
+        )
+        for arg in args_string:
+            package_xo_string += " -kernel_xml_args " + arg
+        with open(vivado_proj_dir + "/gen_xo.tcl", "w") as f:
+            f.write(package_xo_string)
+
+        # create a shell script and call Vivado
+        package_xo_sh = vivado_proj_dir + "/gen_xo.sh"
+        working_dir = os.environ["PWD"]
+        with open(package_xo_sh, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(vivado_proj_dir))
+            f.write("vivado -mode batch -source gen_xo.tcl\n")
+            f.write("cd {}\n".format(working_dir))
+        bash_command = ["bash", package_xo_sh]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        assert os.path.isfile(xo_path), (
+            "Vitis .xo file not created, check logs under %s" % vivado_proj_dir
+        )
+        return (model, False)
+
+
+class VitisLink(Transformation):
+    """Create an XCLBIN with Vitis.
+
+    Outcome if successful: sets the vitis_xclbin attribute in the ONNX
+    ModelProto's metadata_props field with the XCLBIN full path as value.
+    """
+
+    def __init__(self, platform, f_mhz=200):
+        super().__init__()
+        self.platform = platform
+        self.f_mhz = f_mhz
+
+    def apply(self, model):
+        _check_vitis_envvars()
+        # create a config file and empty list of xo files
+        config = ["[connectivity]"]
+        object_files = []
+        idma_idx = 0
+        odma_idx = 0
+        instance_names = {}
+        for node in model.graph.node:
+            assert node.op_type == "StreamingDataflowPartition", "Invalid link graph"
+            sdp_node = getCustomOp(node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            kernel_model = ModelWrapper(dataflow_model_filename)
+            kernel_xo = kernel_model.get_metadata_prop("vitis_xo")
+            object_files.append(kernel_xo)
+            # gather info on connectivity
+            # assume each node connected to outputs/inputs is DMA:
+            # has axis, aximm and axilite
+            # everything else is axis-only
+            # assume only one connection from each ip to the next
+            # all aximm allocated to DDR[0]
+            # all kernels allocated to SLR0
+            producer = model.find_producer(node.input[0])
+            consumer = model.find_consumers(node.output[0])
+            # define kernel instances
+            # name kernels connected to graph inputs as idmaxx
+            # name kernels connected to graph outputs as odmaxx
+            if producer is None:
+                instance_names[node.name] = "idma" + str(idma_idx)
+                config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+                idma_idx += 1
+            elif consumer is None:
+                instance_names[node.name] = "odma" + str(odma_idx)
+                config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+                odma_idx += 1
+            else:
+                instance_names[node.name] = node.name
+                config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+            # assign SLRs
+            config.append("slr=%s:SLR0" % instance_names[node.name])
+            # assign memory banks
+            if producer is None or consumer is None:
+                config.append(
+                    "sp=%s.m_axi_gmem0:DDR[%d]" % (instance_names[node.name], 0)
+                )
+            # connect streams
+            if producer is not None:
+                for i in range(len(node.input)):
+                    producer = model.find_producer(node.input[i])
+                    if producer is not None:
+                        j = list(producer.output).index(node.input[i])
+                        config.append(
+                            "stream_connect=%s.m_axis_%d:%s.s_axis_%d"
+                            % (
+                                instance_names[producer.name],
+                                j,
+                                instance_names[node.name],
+                                i,
+                            )
+                        )
+
+        # create a temporary folder for the project
+        link_dir = make_build_dir(prefix="vitis_link_proj_")
+        model.set_metadata_prop("vitis_link_proj", link_dir)
+
+        config = "\n".join(config) + "\n"
+        with open(link_dir + "/config.txt", "w") as f:
+            f.write(config)
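+        # for illustration, a single-partition design whose node has no
+        # producer might yield a config.txt like:
+        #   [connectivity]
+        #   nk=StreamingDataflowPartition_0:1:idma0
+        #   slr=idma0:SLR0
+        #   sp=idma0.m_axi_gmem0:DDR[0]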
+
+        # create a shell script and call Vitis
+        script = link_dir + "/run_vitis_link.sh"
+        working_dir = os.environ["PWD"]
+        with open(script, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(link_dir))
+            f.write(
+                "v++ -t hw --platform %s --link %s"
+                " --kernel_frequency %d --config config.txt --optimize 2"
+                " --save-temps -R2\n"
+                % (self.platform, " ".join(object_files), self.f_mhz)
+            )
+            f.write("cd {}\n".format(working_dir))
+        bash_command = ["bash", script]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        # TODO rename xclbin appropriately here?
+        xclbin = link_dir + "/a.xclbin"
+        assert os.path.isfile(xclbin), (
+            "Vitis .xclbin file not created, check logs under %s" % link_dir
+        )
+        model.set_metadata_prop("vitis_xclbin", xclbin)
+        return (model, False)
+
+
+class VitisBuild(Transformation):
+    """Best-effort attempt at building the accelerator with Vitis."""
+
+    def __init__(self, fpga_part, period_ns, platform):
+        super().__init__()
+        self.fpga_part = fpga_part
+        self.period_ns = period_ns
+        self.platform = platform
+
+    def apply(self, model):
+        _check_vitis_envvars()
+        # first infer layouts
+        model = model.transform(InferDataLayouts())
+        # prepare at global level, then break up into kernels
+        prep_transforms = [
+            MakePYNQDriver(platform="alveo"),
+            InsertIODMA(512),
+            InsertDWC(),
+            Floorplan(),
+            CreateDataflowPartition(),
+        ]
+        for trn in prep_transforms:
+            model = model.transform(trn)
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(GiveReadableTensorNames())
+        # Build each kernel individually
+        sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition")
+        for sdp_node in sdp_nodes:
+            sdp_node = getCustomOp(sdp_node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            kernel_model = ModelWrapper(dataflow_model_filename)
+            kernel_model = kernel_model.transform(InsertFIFO())
+            kernel_model = kernel_model.transform(
+                InsertTLastMarker(both=True, external=False, dynamic=False)
+            )
+            kernel_model = kernel_model.transform(GiveUniqueNodeNames())
+            kernel_model.save(dataflow_model_filename)
+            kernel_model = kernel_model.transform(
+                PrepareIP(self.fpga_part, self.period_ns)
+            )
+            kernel_model = kernel_model.transform(HLSSynthIP())
+            kernel_model = kernel_model.transform(ReplaceVerilogRelPaths())
+            kernel_model = kernel_model.transform(
+                CreateStitchedIP(
+                    self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True
+                )
+            )
+            kernel_model = kernel_model.transform(
+                CreateVitisXO(sdp_node.onnx_node.name)
+            )
+            kernel_model.save(dataflow_model_filename)
+        # Assemble design from kernels
+        model = model.transform(VitisLink(self.platform, round(1000 / self.period_ns)))
+        # set platform attribute for correct remote execution
+        model.set_metadata_prop("platform", "alveo")
+
+        return (model, False)
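+
+
+# Usage sketch (illustrative; the part and platform strings are placeholders
+# for whatever Alveo device/shell is installed under PLATFORM_REPO_PATHS):
+#
+#   model = model.transform(
+#       VitisBuild("xcu250-figd2104-2L-e", 5.0, "xilinx_u250_xdma_201830_2")
+#   )
+#   xclbin = model.get_metadata_prop("vitis_xclbin")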
diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py
index 488391740fc25f1f7caa657adc9ed55bdc2f9722..4303eb17f39a9949f5729e895e449bbb6a633033 100644
--- a/src/finn/transformation/general.py
+++ b/src/finn/transformation/general.py
@@ -31,6 +31,55 @@ from finn.transformation import Transformation
 from toposort import toposort_flatten
 
 
+class RemoveUnusedTensors(Transformation):
+    """Remove any unused tensors in the graph by removing any initializers,
+    ValueInfo and tensor annotations associated with them. Unused tensors do
+    not appear as an input or output of any graph node.
+    """
+
+    def apply(self, model):
+        graph_modified = False
+        onnx_graph = model.model.graph
+        # build a set of tensors that we actually use in the graph nodes
+        used_tensors = set()
+        for node in model.graph.node:
+            for i in node.input:
+                used_tensors.add(i)
+            for o in node.output:
+                used_tensors.add(o)
+        # remove initializers, value_info and annotations that are not in the
+        # used set of tensors, as determined by the graph node i/o
+        # iterate over copies, since removing from a repeated protobuf field
+        # while iterating over it directly can skip elements
+        for init in list(onnx_graph.initializer):
+            if init.name not in used_tensors:
+                onnx_graph.initializer.remove(init)
+                graph_modified = True
+        for vi in list(onnx_graph.value_info):
+            if vi.name not in used_tensors:
+                onnx_graph.value_info.remove(vi)
+                graph_modified = True
+        for qa in list(onnx_graph.quantization_annotation):
+            if qa.tensor_name not in used_tensors:
+                onnx_graph.quantization_annotation.remove(qa)
+                graph_modified = True
+
+        return (model, graph_modified)
+
+
+class RemoveStaticGraphInputs(Transformation):
+    "Remove any top-level graph inputs that have initializers."
+
+    def apply(self, model):
+        graph_modified = False
+        # iterate over a copy, since the loop removes from graph.input
+        for i in list(model.graph.input):
+            if model.get_initializer(i.name) is not None:
+                # move ValueInfo to internal (value_info) container
+                model.graph.value_info.append(i)
+                model.graph.input.remove(i)
+                graph_modified = True
+
+        return (model, graph_modified)
+
+
 class GiveUniqueNodeNames(Transformation):
     """Give unique names to each node in the graph using enumeration."""
 
@@ -121,24 +170,23 @@ class GiveUniqueParameterTensors(Transformation):
 
 class SortGraph(Transformation):
     """ Returns the model with its node list sorted topologically.
-    Any ONNX graph to be executed must have a topologically sorted node list, as dictated
-    by the ONNX standard.
+    Any ONNX graph to be executed must have a topologically sorted node list,
+    as dictated by the ONNX standard.
     """
-    
+
     # Notes on SortGraph performance:
-    #         benchmark in  tests/transformation/test_sort_graph.py
-    # 
-    #         The algorithm doesn't move initializers so its performance should only depend on
-    #         the number of nodes
-    # 
-    #         Relative order of magnitudes for time per step:
-    #             - Gather graph structure:       base
-    #             - Sort nodes:                   0.1 of base
-    #             - Remove and insert in order :  0.001 of base
-    # 
-    #     Notes:
-    #         Remove nodes and insert them in order:
-    #           Probably this is faster than copying initializers and more robust in general
+    # benchmark in  tests/transformation/test_sort_graph.py
+    # The algorithm doesn't move initializers so its performance should only depend on
+    # the number of nodes
+    #
+    # Relative order of magnitudes for time per step:
+    # - Gather graph structure:       base
+    # - Sort nodes:                   0.1 of base
+    # - Remove and insert in order :  0.001 of base
+    #
+    # Notes:
+    # Remove nodes and insert them in order:
+    # Probably this is faster than copying initializers and more robust in general
 
     def apply(self, model):
         # Gather graph structure
diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py
index 9ac75578ffb911cc44cfddc2b2119b55e6abf2dd..e7a6b88239a1735d5379e165333f8356ae6f88a1 100644
--- a/src/finn/transformation/infer_data_layouts.py
+++ b/src/finn/transformation/infer_data_layouts.py
@@ -38,7 +38,7 @@ def _dims_to_layout(model, node, ndims):
         return DataLayout.NC
     else:
         if node.domain == "finn":
-            if node.op_type == "MultiThreshold":
+            if node.op_type == "MultiThreshold" or node.op_type == "QuantAvgPool2d":
                 mt_inst = registry.getCustomOp(node)
                 layout = mt_inst.get_nodeattr("data_layout")
                 if layout == "NHWC" and ndims == 4:
diff --git a/src/finn/transformation/infer_datatypes.py b/src/finn/transformation/infer_datatypes.py
index 1acd4e3abe2d77248810cf15c15475e806a3bd32..39b7a787be8c725e7b6d474757dd96fc4848dfe0 100644
--- a/src/finn/transformation/infer_datatypes.py
+++ b/src/finn/transformation/infer_datatypes.py
@@ -71,7 +71,13 @@ def _infer_node_datatype(model, node):
         else:
             # unknown, assume node produces float32 outputs
             for o in node.output:
-                model.set_tensor_datatype(o, DataType.FLOAT32)
+                # check if output datatype is already set to a value != FLOAT32
+                odtype = model.get_tensor_datatype(o)
+                if odtype is not None and odtype != DataType.FLOAT32:
+                    # don't change data type
+                    model.set_tensor_datatype(o, odtype)
+                else:
+                    model.set_tensor_datatype(o, DataType.FLOAT32)
     # compare old and new output dtypes to see if anything changed
     new_odtypes = list(map(lambda x: model.get_tensor_datatype(x), node.output))
     graph_modified = new_odtypes != odtypes
diff --git a/src/finn/transformation/lower_convs_to_matmul.py b/src/finn/transformation/lower_convs_to_matmul.py
index 3da785d8dd21b2c6701bffc8ce3869fb14b237a9..e5a1f778d0cac48925ecd97ae8b970f7bdab9c4f 100644
--- a/src/finn/transformation/lower_convs_to_matmul.py
+++ b/src/finn/transformation/lower_convs_to_matmul.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
 from onnx import TensorProto
 from onnx import helper
 
@@ -54,12 +55,34 @@ class LowerConvsToMatMul(Transformation):
                 k = get_by_name(n.attribute, "kernel_shape").ints[-1]
                 pad = get_by_name(n.attribute, "pads").ints[-1]
                 stride = get_by_name(n.attribute, "strides").ints[-1]
+                group = get_by_name(n.attribute, "group").i
                 weight_name = n.input[1]
                 W_conv = model.get_initializer(weight_name)
-                ifm_ch = W_conv.shape[1]
-                ofm_ch = W_conv.shape[0]
+                ifm_ch = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
+                ofm_ch = model.get_tensor_shape(n.output[0])[1]  # assume NCHW
                 ifm_dim = model.get_tensor_shape(n.input[0])[-1]  # assume NCHW
                 ofm_dim = model.get_tensor_shape(n.output[0])[-1]  # assume NCHW
+
+                # for a depthwise conv, expand the weights into a sparse
+                # [OFM][IFM][k][k] matrix and set the "dw" flag, which is later
+                # stored as an attribute on the created Im2Col node
+                dw = False
+                if group == ifm_ch and ofm_ch == ifm_ch:
+                    W_sparse = np.zeros((ofm_ch, ifm_ch, k, k))
+                    for ch in range(ifm_ch):
+                        W_sparse[ch][ch] = W_conv[ch][0]
+                    W_conv = W_sparse.astype(np.float32)
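+                    # e.g. a 3x3 depthwise conv with 4 channels has W_conv of
+                    # shape (4, 1, 3, 3); it is expanded to a sparse
+                    # (4, 4, 3, 3) matrix with W_sparse[c, c] = W_conv[c, 0]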
+                    # record the sparsity pattern of the weight matrix via the
+                    # sparsity annotation of the weight tensor
+                    sparsity = {"dw": {"kernel_shape": k}}
+                    model.set_tensor_sparsity(weight_name, sparsity)
+                    # flag the lowered Im2Col node as depthwise
+                    dw = True
+
                 # reuse conv weights for new matmul weights
                 # conv weights are [OFM][IFM][k][k]
                 # first convert to [OFM][k][k][IFM] (to remain compatible with
@@ -70,6 +93,7 @@ class LowerConvsToMatMul(Transformation):
                 # transpose to get ONNX-compatible [k*k*IFM][OFM] matrix
                 W_matmul = W_matmul.T
                 model.set_initializer(weight_name, W_matmul)
+
                 # create new intermediate values
                 inp_trans_out = helper.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
@@ -80,14 +104,19 @@ class LowerConvsToMatMul(Transformation):
                 inp_trans_out = inp_trans_out.name
                 model.set_tensor_datatype(inp_trans_out, idt)
 
-                im2col_out = helper.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    (1, ofm_dim, ofm_dim, ifm_ch * k * k),
-                )
-                graph.value_info.append(im2col_out)
-                im2col_out = im2col_out.name
-                model.set_tensor_datatype(im2col_out, idt)
+                # a 1x1 kernel with no padding and unit stride needs no Im2Col
+                need_im2col = not (k == 1 and pad == 0 and stride == 1)
+
+                if need_im2col:
+                    im2col_out = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, ofm_dim, ofm_dim, ifm_ch * k * k),
+                    )
+                    graph.value_info.append(im2col_out)
+                    im2col_out = im2col_out.name
+                    model.set_tensor_datatype(im2col_out, idt)
 
                 matmul_out = helper.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
@@ -104,19 +133,24 @@ class LowerConvsToMatMul(Transformation):
                     "Transpose", [cnv_input], [inp_trans_out], perm=[0, 2, 3, 1]
                 )
                 # lower input tensor
-                im2col_node = helper.make_node(
-                    "Im2Col",
-                    [inp_trans_out],
-                    [im2col_out],
-                    domain="finn",
-                    stride=stride,
-                    kernel_size=k,
-                    pad_amount=pad,
-                    input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
-                )
+                matmul_input = inp_trans_out
+                if need_im2col:
+                    matmul_input = im2col_out
+                    im2col_node = helper.make_node(
+                        "Im2Col",
+                        [inp_trans_out],
+                        [im2col_out],
+                        domain="finn",
+                        stride=stride,
+                        kernel_size=k,
+                        pad_amount=pad,
+                        input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                        depthwise=dw,
+                    )
+
                 # do matmul
                 matmul_node = helper.make_node(
-                    "MatMul", [im2col_out, weight_name], [matmul_out]
+                    "MatMul", [matmul_input, weight_name], [matmul_out]
                 )
                 # NHWC -> NCHW
                 out_trans_node = helper.make_node(
@@ -124,9 +158,13 @@ class LowerConvsToMatMul(Transformation):
                 )
                 # insert nodes where the conv is to preserve topological ordering
                 graph.node.insert(node_ind, inp_trans_node)
-                graph.node.insert(node_ind + 1, im2col_node)
-                graph.node.insert(node_ind + 2, matmul_node)
-                graph.node.insert(node_ind + 3, out_trans_node)
+                if need_im2col:
+                    graph.node.insert(node_ind + 1, im2col_node)
+                    graph.node.insert(node_ind + 2, matmul_node)
+                    graph.node.insert(node_ind + 3, out_trans_node)
+                else:
+                    graph.node.insert(node_ind + 1, matmul_node)
+                    graph.node.insert(node_ind + 2, out_trans_node)
                 # remove old nodes
                 graph.node.remove(n)
         model = model.transform(InferShapes())
diff --git a/src/finn/transformation/merge_onnx_models.py b/src/finn/transformation/merge_onnx_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc6127ed189311c72a119932394aca4745e3608
--- /dev/null
+++ b/src/finn/transformation/merge_onnx_models.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import copy
+import warnings
+from onnx import helper
+
+from finn.transformation import Transformation
+from finn.core.modelwrapper import ModelWrapper
+import finn.util.basic as util
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    GiveUniqueParameterTensors,
+)
+
+
+class MergeONNXModels(Transformation):
+    """Merges two models. The model passed in the transformation will be inserted before
+    the model the transformation is applied on, the resulting model is returned.
+    This transformation will try to connect graph.output[0] of the pre model and
+    graph.input[0] of the post model.
+    If more than one input or output exists, a warning is raised."""
+
+    def __init__(self, pre_model):
+        super().__init__()
+        # deep-copy the model to be prepended so the original stays unchanged
+        self.pre_model = copy.deepcopy(pre_model)
+
+    def apply(self, model):
+        graph_modified = False
+        pre_model = self.pre_model
+        post_model = copy.deepcopy(model)
+
+        # check for dynamic outputs of pre model
+        dyn_outp = []
+        for outp in pre_model.graph.output:
+            init_val = pre_model.get_initializer(outp.name)
+            if init_val is None:
+                dyn_outp.append(outp)
+
+        if len(dyn_outp) != 1:
+            warnings.warn(
+                "The pre model has more than one dynamic output! The transformation "
+                "tries to connect the first dynamic output to the first dynamic input "
+                "of the post model."
+            )
+
+        # check for dynamic inputs of post model
+        dyn_inp = []
+        for inp in post_model.graph.input:
+            init_val = post_model.get_initializer(inp.name)
+            if init_val is None:
+                dyn_inp.append(inp)
+
+        if len(dyn_inp) != 1:
+            warnings.warn(
+                "The post model has more than one dynamic input! The transformation "
+                "tries to connect the first dynamic input to the first dynamic output "
+                "of the pre model."
+            )
+
+        # erase all node names to avoid conflict
+        for n in pre_model.graph.node:
+            n.name = ""
+        for n in post_model.graph.node:
+            n.name = ""
+
+        # randomize all tensor names
+        names1 = pre_model.get_all_tensor_names()
+        names2 = post_model.get_all_tensor_names()
+        used_names = names1 + names2
+
+        # pre_model
+        for tensor_name in names1:
+            new_name = util.random_string()
+            while new_name in used_names:
+                new_name = util.random_string()
+            pre_model.rename_tensor(tensor_name, new_name)
+            used_names.append(new_name)
+
+        # post_model
+        for tensor_name in names2:
+            new_name = util.random_string()
+            while new_name in used_names:
+                new_name = util.random_string()
+            post_model.rename_tensor(tensor_name, new_name)
+            used_names.append(new_name)
+
+        # check if models can be merged
+        output_model_a = dyn_outp[0].name
+        input_model_b = dyn_inp[0].name
+        output_a_shape = pre_model.get_tensor_shape(output_model_a)
+        input_b_shape = post_model.get_tensor_shape(input_model_b)
+        assert (
+            output_a_shape == input_b_shape
+        ), "Models can't be merged! Shapes don't match."
+
+        # connect output of one model to input of the other
+        for n in pre_model.graph.node:
+            if output_model_a == n.output[0]:
+                n.output[0] = input_model_b
+
+        # extract information for new model
+
+        # nodes
+        node_list_a = pre_model.graph.node
+        node_list_b = post_model.graph.node
+
+        node_list = node_list_a
+        for node in node_list_b:
+            node_list.append(node)
+
+        # graph input and output
+        inp = pre_model.graph.input[0]
+        outp = post_model.graph.output[0]
+
+        # create new graph and model
+        new_graph = helper.make_graph(
+            nodes=node_list,
+            name="fuse-graph",
+            inputs=[inp],
+            outputs=[outp],
+            value_info=[],
+        )
+
+        new_model = helper.make_model(new_graph, producer_name="fuse_model")
+        new_model = ModelWrapper(new_model)
+
+        # add value info from both models to new model
+        # pre model
+        vi_pre = [x for x in pre_model.graph.input]
+        vi_pre += [x for x in pre_model.graph.output]
+        vi_pre += [x for x in pre_model.graph.value_info]
+        for vi in vi_pre:
+            # preserve initializers, quantization/sparsity annotations, etc.
+            # initializer
+            init_val = pre_model.get_initializer(vi.name)
+            if init_val is not None:
+                new_model.set_initializer(vi.name, init_val)
+            # FINN datatype
+            dtype = pre_model.get_tensor_datatype(vi.name)
+            new_model.set_tensor_datatype(vi.name, dtype)
+            # data layout
+            data_layout = pre_model.get_tensor_layout(vi.name)
+            if data_layout is not None:
+                new_model.set_tensor_layout(vi.name, data_layout)
+            # sparsity
+            sparsity = pre_model.get_tensor_sparsity(vi.name)
+            if sparsity is not None:
+                new_model.set_tensor_sparsity(vi.name, sparsity)
+            # graph input should not be part of graph.value_info, so don't insert
+            # if current vi == inp, but the quantization annotation is preserved
+            if vi == inp:
+                continue
+            new_model.graph.value_info.append(vi)
+
+        # post model
+        vi_model = [x for x in post_model.graph.input]
+        vi_model += [x for x in post_model.graph.output]
+        vi_model += [x for x in post_model.graph.value_info]
+        for vi in vi_model:
+            # preserve initializers, quantization/sparsity annotations, etc.
+            # initializer
+            init_val = post_model.get_initializer(vi.name)
+            if init_val is not None:
+                new_model.set_initializer(vi.name, init_val)
+            # FINN datatype
+            dtype = post_model.get_tensor_datatype(vi.name)
+            new_model.set_tensor_datatype(vi.name, dtype)
+            # data layout
+            data_layout = post_model.get_tensor_layout(vi.name)
+            if data_layout is not None:
+                new_model.set_tensor_layout(vi.name, data_layout)
+            # sparsity
+            sparsity = post_model.get_tensor_sparsity(vi.name)
+            if sparsity is not None:
+                new_model.set_tensor_sparsity(vi.name, sparsity)
+            # graph output should not be part of graph.value_info, so don't insert
+            # if current vi == outp, but the quantization annotation is preserved
+            if vi == outp:
+                continue
+            new_model.graph.value_info.append(vi)
+
+        # tidy-up new model
+        model = new_model
+        model = model.transform(InferShapes())
+        model = model.transform(InferDataTypes())
+        model = model.transform(InferDataLayouts())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveUniqueParameterTensors())
+        model = model.transform(GiveReadableTensorNames())
+
+        return (model, graph_modified)
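+
+
+# Usage sketch (illustrative): prepend a preprocessing model to a network,
+# where pre_model and model are ModelWrapper instances whose output/input
+# shapes match:
+#
+#   model = model.transform(MergeONNXModels(pre_model))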
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
index 2ddaf4f840f449d3f5ec5cb83eaf461d624eb7a2..9943d371dad79a977b61810bcddafdcba505d6cc 100644
--- a/src/finn/transformation/move_reshape.py
+++ b/src/finn/transformation/move_reshape.py
@@ -36,5 +36,15 @@ class RemoveCNVtoFCFlatten(Transformation):
                             graph_modified = True
                             consumer.input[0] = n.input[0]
                             graph.node.remove(n)
+                    elif producer.op_type == "Transpose":
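+                        # handles the (fpgadataflow) -> Transpose -> Flatten
+                        # -> (fpgadataflow) pattern: both the Transpose and
+                        # the Flatten node are removed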
+                        transp_node = producer
+                        producer = model.find_producer(transp_node.input[0])
+                        if _is_fpgadataflow_node(producer) is True:
+                            consumer = model.find_consumer(n.output[0])
+                            if _is_fpgadataflow_node(consumer) is True:
+                                graph_modified = True
+                                consumer.input[0] = transp_node.input[0]
+                                graph.node.remove(n)
+                                graph.node.remove(transp_node)
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py
index c9c73fa4c8303ee28bc1cc6aee879d633740e01e..d7686eaadcbc800542ab96c5f45145857412b773 100644
--- a/src/finn/transformation/streamline/__init__.py
+++ b/src/finn/transformation/streamline/__init__.py
@@ -41,6 +41,7 @@ from finn.transformation.streamline.absorb import (
     FactorOutMulSignMagnitude,
     Absorb1BitMulIntoMatMul,
     Absorb1BitMulIntoConv,
+    AbsorbSignBiasIntoMultiThreshold,
 )
 
 from finn.transformation.streamline.collapse_repeated import (
@@ -52,13 +53,14 @@ from finn.transformation.streamline.reorder import (
     MoveAddPastMul,
     MoveScalarMulPastMatMul,
     MoveScalarAddPastMatMul,
-    MoveScalarAddPastConv,
+    MoveAddPastConv,
     MoveScalarMulPastConv,
 )
 
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.transformation.streamline.sign_to_thres import ConvertSignToThres
 from finn.transformation.batchnorm_to_affine import BatchNormToAffine
+from finn.transformation.streamline.remove import RemoveIdentityOps
 
 
 class Streamline(Transformation):
@@ -70,9 +72,10 @@ class Streamline(Transformation):
             ConvertDivToMul(),
             BatchNormToAffine(),
             ConvertSignToThres(),
+            AbsorbSignBiasIntoMultiThreshold(),
             MoveAddPastMul(),
             MoveScalarAddPastMatMul(),
-            MoveScalarAddPastConv(),
+            MoveAddPastConv(),
             MoveScalarMulPastMatMul(),
             MoveScalarMulPastConv(),
             MoveAddPastMul(),
@@ -87,6 +90,7 @@ class Streamline(Transformation):
         ]
         for trn in streamline_transformations:
             model = model.transform(trn)
+            model = model.transform(RemoveIdentityOps())
             model = model.transform(GiveUniqueNodeNames())
             model = model.transform(GiveReadableTensorNames())
             model = model.transform(InferDataTypes())
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index dbcf97361017144174f9fbfca35a84361b5abd26..8398a277443530e84632d26fbfca6d90ea4b0b9e 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -28,14 +28,81 @@
 
 import numpy as np
 from onnx import helper as oh
+import warnings
 
 from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 
 
+class AbsorbSignBiasIntoMultiThreshold(Transformation):
+    """Absorb scalar bias originating from signed int export back into
+    MultiThreshold and re-evaluate the output datatype."""
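+
+    # Worked example (illustrative): a MultiThreshold with 255 threshold steps
+    # (outputs 0..255) followed by Add(-128) collapses to out_bias = -128,
+    # giving an output range of -128..127, i.e. out_dtype INT8.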
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            # search for (MultiThreshold, Add) pair
+            node_ind += 1
+            if (
+                n.op_type == "MultiThreshold"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if consumer is not None and consumer.op_type == "Add":
+                    mt_node = n
+                    add_node = consumer
+                    threshold_name = mt_node.input[1]
+                    add_weight_name = add_node.input[1]
+                    T = model.get_initializer(threshold_name)
+                    A = model.get_initializer(add_weight_name)
+                    if (A is None) or (T is None):
+                        warnings.warn("Threshold or add bias not constant, skipping")
+                        continue
+                    end_name = add_node.output[0]
+                    # we can only absorb scalar adds
+                    is_scalar = A.ndim == 0 or all(x == 1 for x in A.shape)
+                    if not is_scalar:
+                        continue
+                    bias = A.flatten()[0]
+                    # set MultiThreshold bias property
+                    mt_inst = getCustomOp(mt_node)
+                    bias += mt_inst.get_nodeattr("out_bias")
+                    mt_inst.set_nodeattr("out_bias", bias)
+                    graph_modified = True
+                    # compute new DataType for MultiThreshold output
+                    steps = T.shape[-1]
+                    new_min = bias
+                    new_max = steps + bias
+                    odt = DataType.get_smallest_possible(steps).name.replace(
+                        "UINT", "INT"
+                    )
+                    odt = DataType[odt]
+                    assert odt.allowed(new_max) and odt.allowed(new_min), (
+                        "Could not compute new MultiThreshold DataType "
+                        "(min = %d max = %d)" % (new_min, new_max)
+                    )
+                    mt_inst.set_nodeattr("out_dtype", odt.name)
+                    # remove Add node, rewire MultiThreshold
+                    graph.node.remove(add_node)
+                    mt_node.output[0] = end_name
+                    # set datatype
+                    model.set_tensor_datatype(end_name, odt)
+        if graph_modified:
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class AbsorbAddIntoMultiThreshold(Transformation):
     """Absorb preceding Add ops into MultiThreshold by updating the threshold
     values. Only scalar/1D add vectors can be absorbed."""
@@ -250,11 +317,13 @@ class AbsorbTransposeIntoMultiThreshold(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Transpose":
+            if n.op_type == "Transpose" and not model.is_fork_node(n):
                 perms = list(get_by_name(n.attribute, "perm").ints)
                 if perms == [0, 3, 1, 2]:
                     mt_cand = model.find_consumer(n.output[0])
-                    if mt_cand.op_type == "MultiThreshold":
+                    if mt_cand.op_type == "MultiThreshold" and not model.is_fork_node(
+                        mt_cand
+                    ):
                         final_t_cand = model.find_consumer(mt_cand.output[0])
                         if final_t_cand.op_type == "Transpose":
                             perms = list(
@@ -290,3 +359,182 @@ class AbsorbTransposeIntoMultiThreshold(Transformation):
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class AbsorbTransposeIntoFlatten(Transformation):
+    """Absorb transpose node into succeeding flatten node, if H=W=1 and the first
+    dimension stays the same. Can also be applied if flatten is implemented implicitly
+    by a reshape node with shape [1, -1] and the first input dimension is 1."""
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Reshape"
+                and (model.get_initializer(n.input[1]) == [1, -1]).all()
+            ) or n.op_type == "Flatten":
+                prod = model.find_producer(n.input[0])
+                if (
+                    prod is not None
+                    and prod.op_type == "Transpose"
+                    # we ensure that the first dimension is not changed from the
+                    # transpose operation
+                    and get_by_name(prod.attribute, "perm").ints[0] == 0
+                ):
+                    data_layout = model.get_tensor_layout(prod.input[0])
+                    # check for the data layout to interpret input shape correctly
+                    if data_layout is None:
+                        warnings.warn(
+                            """Data layout for input tensor of Transpose node is not set.
+                                To use AbsorbTransposeIntoFlatten transformation
+                                please set tensor data layout."""
+                        )
+                        continue
+                    elif data_layout == DataLayout.NCHW:
+                        (b, c, h, w) = model.get_tensor_shape(prod.input[0])
+                        # if h=w=1 the transposition can be absorbed, otherwise
+                        # the absorption would lead to an error in the behavior
+                        if h != 1 or w != 1:
+                            continue
+                        # the ONNX Flatten node keeps the first dim by
+                        # default and flattens the rest, so this
+                        # transformation can only handle b != 1 if the model
+                        # already contains a Flatten node rather than a
+                        # Reshape node with shape = [1, -1]: for a first dim
+                        # != 1, Flatten and Reshape([1, -1]) give different
+                        # results
+                        if n.op_type == "Reshape" and b != 1:
+                            continue
+                    elif data_layout == DataLayout.NHWC:
+                        (b, h, w, c) = model.get_tensor_shape(prod.input[0])
+                        if h != 1 or w != 1:
+                            continue
+                        if n.op_type == "Reshape" and b != 1:
+                            continue
+                    # create single flatten node and remove obsolete nodes
+                    node = oh.make_node("Flatten", [prod.input[0]], [n.output[0]])
+                    graph.node.remove(n)
+                    graph.node.remove(prod)
+                    graph.node.insert(node_ind, node)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class AbsorbScalarMulIntoTopK(Transformation):
+    """Absorb a mul node into a suceeding topk node if the mul is scalar."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "TopK":
+                prod = model.find_producer(n.input[0])
+                if prod is not None and prod.op_type == "Mul":
+                    prod_input = prod.input[0]
+                    param_name = prod.input[1]
+                    A = model.get_initializer(param_name)
+                    if A is None:
+                        warnings.warn("Param is not constant, skipping")
+                        continue
+                    if all(x == 1 for x in A.shape) and A > 0:
+                        # if the mul is scalar and positive, we can just
+                        # delete the mul node and rewire the TopK node:
+                        # TopK ranks values relative to each other, and that
+                        # ranking is unchanged when every value is multiplied
+                        # by the same positive scalar
+                        graph.node.remove(prod)
+                        n.input[0] = prod_input
+                        # the datatype is set to float32 to avoid errors
+                        model.set_tensor_datatype(n.input[0], DataType.FLOAT32)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class AbsorbConsecutiveTransposes(Transformation):
+    """Remove (Transpose -> Transpose) patterns when the input and output
+    of the pattern have the same layout."""
+
+    def Are_opposite_permutations(self, perms1, perms2):
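+        # two permutations are opposite iff composing them gives the
+        # identity, e.g. [0, 2, 3, 1] (NCHW->NHWC) and [0, 3, 1, 2]
+        # (NHWC->NCHW)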
+        if len(perms1) != len(perms2):
+            return False
+        assert 0 <= max(perms2) < len(perms2), "invalid permutation"
+        assert 0 <= max(perms1) < len(perms1), "invalid permutation"
+
+        for i, p in enumerate(perms2):
+            if perms1[p] != i:
+                return False
+
+        return True
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        for n in graph.node:
+            if n.op_type == "Transpose":
+                if model.is_fork_node(n):
+                    next_nodes = model.find_direct_successors(n)
+                    perms1 = list(get_by_name(n.attribute, "perm").ints)
+
+                    # check if all nodes after fork are opposite transposes
+                    all_opposite_transposes = True
+                    for next_node in next_nodes:
+                        if next_node is not None and next_node.op_type == "Transpose":
+                            perms2 = list(get_by_name(next_node.attribute, "perm").ints)
+                            if not self.Are_opposite_permutations(perms1, perms2):
+                                all_opposite_transposes = False
+                                break
+                        else:
+                            all_opposite_transposes = False
+                            break
+
+                    if not all_opposite_transposes:
+                        continue
+
+                    prod = model.find_producer(n.input[0])
+                    for next_node in next_nodes:
+                        # connect next_node's consumer input to n's producer output
+                        # TODO implement this to allow for forks as producers and
+                        # joins as consumers
+                        cons = model.find_consumer(next_node.output[0])
+                        cons.input[0] = prod.output[0]
+
+                        # remove consumer transpose
+                        graph.node.remove(next_node)
+
+                    # remove producer transpose
+                    graph.node.remove(n)
+                    graph_modified = True
+
+                else:
+                    next_node = model.find_consumer(n.output[0])
+                    if next_node is not None and next_node.op_type == "Transpose":
+                        perms1 = list(get_by_name(n.attribute, "perm").ints)
+                        perms2 = list(get_by_name(next_node.attribute, "perm").ints)
+                        if self.Are_opposite_permutations(perms1, perms2):
+
+                            # connect next_node's consumer input to n's producer output
+                            # TODO implement this to allow for forks as producers
+                            consumers = model.find_direct_successors(next_node)
+                            prod = model.find_producer(n.input[0])
+                            for cons in consumers:
+                                for cons_in in cons.input:
+                                    if cons_in == next_node.output[0]:
+                                        prod.output[0] = cons_in
+                                        break
+                            # remove both transposes
+                            graph.node.remove(n)
+                            graph.node.remove(next_node)
+
+                            graph_modified = True
+        if graph_modified:
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py
index 67824ad4f633983b93e3178d03118927a1ddd85b..769bed841ce07c1c9c62f762de4b2c0937a6d68f 100644
--- a/src/finn/transformation/streamline/collapse_repeated.py
+++ b/src/finn/transformation/streamline/collapse_repeated.py
@@ -30,6 +30,7 @@ from onnx import helper as oh
 
 from finn.transformation import Transformation
 from finn.transformation.infer_shapes import InferShapes
+from finn.core.datatype import DataType
 
 
 class CollapseRepeatedOp(Transformation):
@@ -83,6 +84,9 @@ class CollapseRepeatedOp(Transformation):
                     graph.node.insert(node_ind, new_node)
                     # replace parameter value
                     model.set_initializer(new_node_param_name, new_param)
+                    # be conservative with param/output DataTypes
+                    model.set_tensor_datatype(new_node_param_name, DataType.FLOAT32)
+                    model.set_tensor_datatype(end_name, DataType.FLOAT32)
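+                    # (the collapsed parameter is a product/sum of the old
+                    # ones and may no longer fit their annotated DataTypes)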
                     # remove old nodes
                     graph.node.remove(n)
                     graph.node.remove(consumer)
diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddc4233ddafbc70c4d20d316ea72ea6bba1b82a8
--- /dev/null
+++ b/src/finn/transformation/streamline/remove.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+from finn.transformation import Transformation
+from finn.transformation.infer_shapes import InferShapes
+import numpy as np
+
+
+class RemoveIdentityOps(Transformation):
+    """Remove identity ops like Add/Sub with zero or Mul/Div with one"""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type in ["Add", "Sub"]
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                A = model.get_initializer(n.input[1])
+                if A is not None and (A == np.zeros_like(A)).all():
+                    producer = model.find_producer(n.input[0])
+                    # remove node and wire output tensor to
+                    # output of producer node
+                    producer.output[0] = n.output[0]
+                    graph.node.remove(n)
+                    graph_modified = True
+
+            elif (
+                n.op_type in ["Mul", "Div"]
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                A = model.get_initializer(n.input[1])
+                if A is not None and (A == np.ones_like(A)).all():
+                    producer = model.find_producer(n.input[0])
+                    # remove node and wire output tensor to
+                    # output of producer node
+                    producer.output[0] = n.output[0]
+                    graph.node.remove(n)
+                    graph_modified = True
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 0b6259a61d3eb67b7b38d4c6939019ce2893a875..b47f269dd6f2671c3d98c9316954483c0e72f14f 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -27,12 +27,19 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
+import warnings
 from onnx import helper as oh
+from onnx import TensorProto
 
 from finn.transformation import Transformation
+import finn.core.data_layout as DataLayout
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.core.datatype import DataType
 from finn.core.onnx_exec import execute_node
 from finn.util.basic import get_by_name
+from finn.custom_op.registry import getCustomOp
 
 
 class MoveAddPastMul(Transformation):
@@ -65,8 +72,11 @@ class MoveAddPastMul(Transformation):
                     add_weight_name = n.input[1]
                     A = model.get_initializer(mul_weight_name)
                     B = model.get_initializer(add_weight_name)
-                    assert A is not None, "Initializer for mul weights is not set."
-                    assert B is not None, "Initializer for add weights is not set."
+                    if (A is None) or (B is None):
+                        warnings.warn(
+                            "Mul or add does not have constant params, skipping"
+                        )
+                        continue
                     start_name = n.input[0]
                     middle_name = n.output[0]
                     end_name = consumer.output[0]
@@ -121,8 +131,9 @@ class MoveScalarMulPastMatMul(Transformation):
                     matmul_weight_name = consumer.input[1]
                     A = model.get_initializer(mul_weight_name)
                     W = model.get_initializer(matmul_weight_name)
-                    assert A is not None, "Initializer for mul weights is not set."
-                    assert W is not None, "Initializer for matmul weights is not set."
+                    if (A is None) or (W is None):
+                        warnings.warn("MatMul or Mul params are not constant, skipping")
+                        continue
                     start_name = n.input[0]
                     middle_name = n.output[0]
                     end_name = consumer.output[0]
@@ -178,8 +189,9 @@ class MoveScalarAddPastMatMul(Transformation):
                     matmul_weight_name = consumer.input[1]
                     A = model.get_initializer(add_weight_name)
                     W = model.get_initializer(matmul_weight_name)
-                    assert A is not None, "Initializer for add weights is not set."
-                    assert W is not None, "Initializer for matmul weights is not set."
+                    if (A is None) or (W is None):
+                        warnings.warn("MatMul or Add params are not constant, skipping")
+                        continue
                     start_name = n.input[0]
                     middle_name = n.output[0]
                     end_name = consumer.output[0]
@@ -213,8 +225,8 @@ class MoveScalarAddPastMatMul(Transformation):
         return (model, graph_modified)
 
 
-class MoveScalarAddPastConv(Transformation):
-    """Move scalar add operations past conv operations. We want to have adds
+class MoveAddPastConv(Transformation):
+    """Move scalar and channelwise add operations past conv operations. We want to have adds
     next to each other such that they can be collapsed into a single add."""
 
     def apply(self, model):
@@ -239,8 +251,12 @@ class MoveScalarAddPastConv(Transformation):
                     add_weight_name = n.input[1]
                     conv_in_name = consumer.input[0]
                     conv_in_shape = model.get_tensor_shape(conv_in_name)
+                    # assume datalayout to be NCHW
+                    channels = conv_in_shape[1]
                     A = model.get_initializer(add_weight_name)
-                    assert A is not None, "Initializer for add weights is not set."
+                    if A is None:
+                        warnings.warn("Add param is not constant, skipping")
+                        continue
                     start_name = n.input[0]
                     end_name = consumer.output[0]
                     conv_out_shape = model.get_tensor_shape(end_name)
@@ -249,11 +265,17 @@ class MoveScalarAddPastConv(Transformation):
                     pads = list(get_by_name(consumer.attribute, "pads").ints)
                     if sum(pads) == 0:
                         using_padding = False
-                    if all(x == 1 for x in A.shape) and not using_padding:
+                    if (
+                        all(x == 1 for x in A.shape) or A.shape == (1, channels, 1, 1)
+                    ) and not using_padding:
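+                        # note: with zero-padding, the moved add would need a
+                        # spatially varying correction at the borders, which
+                        # is why padded convs are skipped above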
                         # create a tensor filled with the add constant, in
                         # the shape expected by the convolution
                         conv_in_const = np.zeros(conv_in_shape, dtype=np.float32)
-                        conv_in_const.fill(A.item())
+                        if A.shape == (1, channels, 1, 1):
+                            for ch in range(channels):
+                                conv_in_const[0][ch].fill(A[0][ch].item())
+                        else:
+                            conv_in_const.fill(A.item())
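+                        # the conv is executed on this constant input below;
+                        # its output gives the equivalent add to apply after
+                        # the conv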
                         # create an execution context and put in const input
                         exec_ctx = model.make_empty_exec_context()
                         exec_ctx[conv_in_name] = conv_in_const
@@ -308,7 +330,9 @@ class MoveScalarMulPastConv(Transformation):
                 ):
                     mul_weight_name = n.input[1]
                     A = model.get_initializer(mul_weight_name)
-                    assert A is not None, "Initializer for mul weights is not set."
+                    if A is None:
+                        warnings.warn("Mul param is not constant, skipping")
+                        continue
                     conv_node = consumer
                     mul_node = n
                     start_name = mul_node.input[0]
@@ -336,6 +360,71 @@ class MoveScalarMulPastConv(Transformation):
         return (model, graph_modified)
 
 
+class MoveMulPastDWConv(Transformation):
+    """Move channelwise mul operations past depthwise conv operations. We want to have muls
+    next to each other such that they can be collapsed into a single mul."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Mul"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if (
+                    consumer is not None
+                    and consumer.op_type == "Conv"
+                    and not model.is_join_node(consumer)
+                ):
+                    mul_weight_name = n.input[1]
+                    A = model.get_initializer(mul_weight_name)
+                    if A is None:
+                        warnings.warn(
+                            """Mul weight tensor is not set. If it is a constant,
+                                please use set_initializer to set the tensor."""
+                        )
+                        continue
+                    conv_node = consumer
+                    mul_node = n
+                    start_name = mul_node.input[0]
+                    conv_in_name = conv_node.input[0]
+                    conv_in_shape = model.get_tensor_shape(conv_in_name)
+                    ifm_ch = conv_in_shape[1]
+                    group_attribute = get_by_name(consumer.attribute, "group")
+                    if group_attribute is None:
+                        continue
+                    group_attribute = group_attribute.i
+                    conv_out_name = conv_node.output[0]
+                    conv_out_shape = model.get_tensor_shape(conv_out_name)
+                    if A.shape == (1, ifm_ch, 1, 1) and ifm_ch == group_attribute:
+                        # if the mul is channelwise and conv is depthwise,
+                        # we can simply swap the order of ops
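+                        # (each depthwise output channel depends only on its
+                        # matching input channel, so a channelwise scale
+                        # commutes with the conv)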
+                        # rewire mul input to be conv input
+                        conv_node.input[0] = start_name
+                        model.set_tensor_shape(start_name, conv_in_shape)
+                        model.set_tensor_datatype(start_name, DataType.FLOAT32)
+                        # use old conv input tensor as conv output
+                        conv_node.output[0] = conv_in_name
+                        model.set_tensor_shape(conv_in_name, conv_out_shape)
+                        model.set_tensor_datatype(conv_in_name, DataType.FLOAT32)
+                        # use new conv output as new mul node input
+                        mul_node.input[0] = conv_in_name
+                        # use old conv output as new mul node output
+                        mul_node.output[0] = conv_out_name
+                        model.set_tensor_datatype(conv_out_name, DataType.FLOAT32)
+                        # move mul node past conv node
+                        graph.node.remove(mul_node)
+                        graph.node.insert(node_ind, mul_node)
+                        graph_modified = True
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
+
+
 class MoveLinearPastEltwiseAdd(Transformation):
     """Move linear operations (mul, add) past elementwise add operations where possible.
       Specifically, matches and transforms the following patterns:
@@ -413,6 +502,73 @@ class MoveLinearPastEltwiseAdd(Transformation):
         return (model, graph_modified)
 
 
+class MoveScalarLinearPastInvariants(Transformation):
+    """Move scalar linear operations (mul, add) past functions which are invariant
+       to them. Specifically, matches and transforms the following patterns:
+       f(x*C) -> f(x) * C
+       f(x+C) -> f(x) + C
+       where x is a dynamic input, C is a constant tensor.
+       Known functions f that obey this property are: Reshape, Flatten,
+       Transpose, GlobalAveragePool.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        nodes = [n for n in graph.node]
+        for n in nodes:
+            node_ind += 1
+            if (
+                n.op_type == "GlobalAveragePool"
+                or n.op_type == "Reshape"
+                or n.op_type == "Transpose"
+                or n.op_type == "Flatten"
+            ):
+                in0 = n.input[0]
+                if in0 is None:
+                    continue
+                # find and check producer on our input
+                prod0 = model.find_producer(in0)
+                if prod0 is None:
+                    continue
+
+                if prod0.op_type == "Mul" or prod0.op_type == "Add":
+                    # check if second input of producer is an initializer
+                    init0 = model.get_initializer(prod0.input[1])
+                    # if either initializer is None, skip
+                    if init0 is None:
+                        continue
+                    # if initializer is not scalar, skip
+                    if np.prod(init0.shape) != 1:
+                        continue
+                    # move prod0 from the input to the output side of n
+                    old_prod0_in = prod0.input[0]
+                    old_prod0_out = prod0.output[0]
+                    scalar_op_odt = model.get_tensor_datatype(old_prod0_out)
+                    old_n_out = n.output[0]
+                    in_shape = model.get_tensor_shape(n.input[0])
+                    out_shape = model.get_tensor_shape(n.output[0])
+                    n.input[0] = old_prod0_in
+                    n.output[0] = old_prod0_out
+                    prod0.input[0] = old_prod0_out
+                    prod0.output[0] = old_n_out
+                    model.set_tensor_shape(n.input[0], in_shape)
+                    model.set_tensor_shape(n.output[0], out_shape)
+                    model.set_tensor_shape(prod0.output[0], out_shape)
+                    model.set_tensor_datatype(prod0.output[0], scalar_op_odt)
+                    model.set_tensor_datatype(n.output[0], DataType.FLOAT32)
+                    graph.node.remove(prod0)
+                    graph.node.insert(node_ind - 1, prod0)
+                    graph_modified = True
+                else:
+                    continue
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class MakeMaxPoolNHWC(Transformation):
     """Convert (MaxPool, NHWCTranpose) into (MaxPoolNHWC)."""
 
@@ -531,3 +687,281 @@ class MoveMulPastFork(MoveOpPastFork):
 class MoveLinearPastFork(MoveOpPastFork):
     def __init__(self):
         super().__init__(["Add", "Mul"])
+
+
+class MoveMaxPoolPastMultiThreshold(Transformation):
+    """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        nodes = [n for n in graph.node]
+        for n in nodes:
+            node_ind += 1
+            if n.op_type == "MaxPool" and not model.is_fork_node(n):
+                consumer = model.find_consumer(n.output[0])
+                pads = get_by_name(n.attribute, "pads")
+                has_padding = False
+                if pads is not None:
+                    pads = list(pads.ints)
+                    has_padding = np.prod(pads) != 0
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    mt_out = consumer.output[0]
+                    mt_odt = model.get_tensor_datatype(mt_out)
+                    if mt_odt.signed() and has_padding:
+                        warnings.warn(
+                            "Skipping padded MaxPool + signed-output MultiThreshold"
+                        )
+                        continue
+                    # check for non-decreasing thresholds and nonnegative
+                    # scale factor in MultiThreshold
+                    # otherwise we cannot do the reordering
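+                    # (with sorted thresholds and nonnegative out_scale,
+                    # MultiThreshold is elementwise and non-decreasing, so
+                    # f(max(x)) == max(f(x)) and the swap is safe)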
+                    T = model.get_initializer(consumer.input[1])
+                    T_sorted = np.sort(T, axis=1)
+                    assert (
+                        T == T_sorted
+                    ).all(), "MultiThreshold must have non-decreasing thresholds"
+                    mt_inst = getCustomOp(consumer)
+                    if mt_inst.get_nodeattr("out_scale") < 0:
+                        warnings.warn("Skipping MultiThreshold with negative out_scale")
+                        continue
+
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+
+                    # swap connections
+                    group_in = n.input[0]
+                    # new tensor because dims change
+                    group_middle = model.make_new_valueinfo_name()
+                    group_out = consumer.output[0]
+
+                    consumer.input[0] = group_in
+                    consumer.output[0] = group_middle
+
+                    n.input[0] = group_middle
+                    n.output[0] = group_out
+
+                    # insert them back in
+                    graph.node.insert(node_ind - 1, consumer)
+                    graph.node.insert(node_ind, n)
+
+                    graph_modified = True
+
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
+
+
+class MoveFlattenPastTopK(Transformation):
+    """Move flatten node past a succeeding topk node, if the "axis" attribute in topk
+    is set to -1 and the data layout before the flatten is NHWC with H=W=1."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Flatten":
+                consumer = model.find_consumer(n.output[0])
+                if consumer is not None and consumer.op_type == "TopK":
+                    axis = get_by_name(consumer.attribute, "axis")
+                    if axis is None or axis.i != -1:
+                        continue
+                    start_name = n.input[0]
+                    data_layout = model.get_tensor_layout(start_name)
+                    if data_layout != DataLayout.NHWC:
+                        warnings.warn(
+                            """Transformation can't be applied. The input
+                            to flatten has to have DataLayout.NHWC"""
+                        )
+                        continue
+                    (b, h, w, c) = model.get_tensor_shape(start_name)
+                    if h != 1 or w != 1:
+                        continue
+                    # get parameter k from topk
+                    k = model.get_tensor_shape(consumer.output[1])[-1]
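+                    # (TopK output[1] holds the indices; its last dim is k)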
+
+                    # swap connections
+                    # new tensor because dims change
+                    middle_name = model.make_new_valueinfo_name()
+                    topk_indices = oh.make_tensor_value_info(
+                        middle_name, TensorProto.INT64, [b, h, w, k]
+                    )
+                    end_name = consumer.output[1]
+                    graph.value_info.append(topk_indices)
+
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+
+                    # set inputs and outputs correctly
+                    consumer.input[0] = start_name
+                    consumer.output[1] = middle_name
+                    model.set_tensor_shape(consumer.output[0], (b, h, w, k))
+
+                    n.input[0] = middle_name
+                    n.output[0] = end_name
+
+                    # insert them back in
+                    graph.node.insert(node_ind - 1, consumer)
+                    graph.node.insert(node_ind, n)
+
+                    graph_modified = True
+
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
+
+
+class MoveFlattenPastAffine(Transformation):
+    """Moves a node that implements a (1, -1) reshape past a MatMul, Mul or Add node."""
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Flatten"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if (
+                    consumer is not None
+                    and (
+                        consumer.op_type == "MatMul"
+                        or consumer.op_type == "Mul"
+                        or consumer.op_type == "Add"
+                    )
+                    and not model.is_join_node(consumer)
+                ):
+                    # move flatten past operation and rewire tensors
+                    start_name = n.input[0]
+                    # check if data layout is set to NHWC and H=W=1
+                    datalayout = model.get_tensor_layout(start_name)
+                    if datalayout == DataLayout.NHWC:
+                        (b, h, w, c) = model.get_tensor_shape(start_name)
+                        if h != 1 or w != 1:
+                            warnings.warn(
+                                "Transformation can only be applied if H=W=1."
+                            )
+                            continue
+                    else:
+                        warnings.warn(
+                            """The Transformation can only be performed on
+                            operations that operate on data layout NHWC."""
+                        )
+                        continue
+                    middle_name = n.output[0]
+                    end_name = consumer.output[0]
+                    op_param_name = consumer.input[1]
+                    A = model.get_initializer(op_param_name)
+                    if A is None:
+                        warnings.warn("Param is not constant, skipping")
+                        continue
+                    op_in_dt = model.get_tensor_datatype(consumer.input[0])
+                    op_out_dt = model.get_tensor_datatype(consumer.output[0])
+                    start_shape = model.get_tensor_shape(start_name)
+                    dummy_in = np.random.uniform(low=0, high=1, size=(start_shape))
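+                    # execute the op on dummy data to derive the shape of the
+                    # tensor between the moved op and the new Flatten node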
+
+                    if consumer.op_type == "MatMul":
+                        dummy_out = np.matmul(dummy_in, A)
+                    elif consumer.op_type == "Mul":
+                        dummy_out = dummy_in * A
+                    elif consumer.op_type == "Add":
+                        dummy_out = dummy_in + A
+
+                    new_op = oh.make_node(
+                        consumer.op_type,
+                        [start_name, op_param_name],
+                        [middle_name],
+                        name=consumer.name,
+                    )
+                    new_flatten = oh.make_node("Flatten", [middle_name], [end_name])
+                    graph.node.insert(node_ind, new_op)
+                    graph.node.insert(node_ind + 1, new_flatten)
+                    model.set_tensor_shape(middle_name, dummy_out.shape)
+                    # Flatten does not change the datatype, so the datatypes
+                    # of the op node can be reused for all rewired tensors
+                    model.set_tensor_datatype(start_name, op_in_dt)
+                    model.set_tensor_datatype(middle_name, op_out_dt)
+                    model.set_tensor_datatype(end_name, op_out_dt)
+                    # set datalayout
+                    model.set_tensor_layout(start_name, DataLayout.NHWC)
+                    model.set_tensor_layout(middle_name, DataLayout.NHWC)
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+                    graph_modified = True
+
+        model = model.transform(InferShapes())
+        model = model.transform(InferDataTypes())
+        model = model.transform(InferDataLayouts())
+        return (model, graph_modified)
+
+
+class MoveTransposePastScalarMul(Transformation):
+    """Moves a Transpose node past a scalar Mul node"""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Transpose"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if (
+                    consumer is not None
+                    and consumer.op_type == "Mul"
+                    and not model.is_join_node(consumer)
+                ):
+                    mul_weight_name = consumer.input[1]
+                    A = model.get_initializer(mul_weight_name)
+                    if A is None:
+                        warnings.warn("Mul param is not constant, skipping")
+                        continue
+                    transp_node = n
+                    mul_node = consumer
+                    start_name = transp_node.input[0]
+                    middle_name = transp_node.output[0]
+                    end_name = mul_node.output[0]
+                    transp_in_shape = model.get_tensor_shape(start_name)
+                    transp_out_shape = model.get_tensor_shape(middle_name)
+                    transp_in_layout = model.get_tensor_layout(start_name)
+                    transp_out_layout = model.get_tensor_layout(middle_name)
+                    if transp_in_layout is None or transp_out_layout is None:
+                        warnings.warn(
+                            """Datalayout is not set for tensors.
+                            Transformation can't be applied."""
+                        )
+                        continue
+                    if all(x == 1 for x in A.shape):
+                        # if the mul is scalar, we can simply swap the order of ops
+                        # rewire transpose input to be mul input
+                        mul_node.input[0] = start_name
+                        model.set_tensor_shape(start_name, transp_in_shape)
+                        model.set_tensor_layout(start_name, transp_in_layout)
+                        mul_node.output[0] = middle_name
+                        model.set_tensor_shape(middle_name, transp_in_shape)
+                        model.set_tensor_layout(middle_name, transp_in_layout)
+                        transp_node.input[0] = middle_name
+                        transp_node.output[0] = end_name
+                        model.set_tensor_shape(end_name, transp_out_shape)
+                        model.set_tensor_layout(end_name, transp_out_layout)
+                        graph.node.remove(transp_node)
+                        graph.node.insert(node_ind, transp_node)
+                        graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferDataLayouts())
+            model = model.transform(InferShapes())
+        return (model, graph_modified)
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 0d4db225801065099b9bbed2cbcd570fa6558990..6c92e9b2765b1c2be6f95ee148964bccfb3cd7be 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -31,6 +31,7 @@ import random
 import string
 import subprocess
 import tempfile
+import warnings
 
 import numpy as np
 
@@ -52,6 +53,19 @@ pynq_native_port_width["Ultra96"] = 128
 pynq_native_port_width["ZCU102"] = 128
 pynq_native_port_width["ZCU104"] = 128
 
+# Alveo device and platform mappings
+alveo_part_map = dict()
+alveo_part_map["U50"] = "xcu50-fsvh2104-2L-e"
+alveo_part_map["U200"] = "xcu200-fsgd2104-2-e"
+alveo_part_map["U250"] = "xcu250-figd2104-2L-e"
+alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e"
+
+alveo_default_platform = dict()
+alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3"
+alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2"
+alveo_default_platform["U250"] = "xilinx_u250_xdma_201830_2"
+alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3"
+
 
 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
@@ -72,6 +86,16 @@ def get_rtlsim_trace_depth():
         return 1
 
 
+def get_remote_vivado():
+    """Return the address of the remote Vivado synthesis server as set by the,
+    REMOTE_VIVADO environment variable, otherwise return None"""
+
+    try:
+        return os.environ["REMOTE_VIVADO"]
+    except KeyError:
+        return None
+
+
 def get_num_default_workers():
     """Return the number of workers for parallel transformations. Controllable
     via the NUM_DEFAULT_WORKERS environment variable. If the env.var. is
@@ -97,6 +121,25 @@ def get_finn_root():
         )
 
 
+def get_execution_error_thresh():
+    "Return the max error that is allowed for rounding in FINN execution."
+    try:
+        return float(os.environ["ERROR_THRESH"])
+    except KeyError:
+        return 1e-2
+
+
+def get_sanitize_quant_tensors():
+    """Return whether tensors with quantization annotations should be sanitized.
+    Enabled by default, disabling will yield faster ONNX execution but may give
+    incorrect results. Use with caution."""
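+    # e.g. launch with SANITIZE_QUANT_TENSORS=0 to disable sanitization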
+    try:
+        return int(os.environ["SANITIZE_QUANT_TENSORS"])
+    except KeyError:
+        # enabled by default
+        return 1
+
+
 def make_build_dir(prefix=""):
     """Creates a temporary folder with given prefix to be used as a build dir.
     Use this function instead of tempfile.mkdtemp to ensure any generated files
@@ -256,6 +299,69 @@ def calculate_signed_dot_prod_range(dt_a, dt_b, len):
     return (min_prod, max_prod)
 
 
+def sanitize_quant_values(model, node_tensors, execution_context, check_values=False):
+    """ Sanitize given list of tensors in execution_context by rounding values
+    that are supposed to be integers (as indicated by their quantization
+    annotation). Will raise an Exception if the amount of rounding is too large.
+    Returns the sanitized execution context.
+
+    If check_values is specified, an extra DataType.allowed() check will be
+    performed on any rounded tensors.
+
+    Background:
+    FINN uses floating point tensors as a carrier data type to represent
+    integers. Floating point arithmetic can introduce rounding errors, e.g.
+    (int_num * float_scale) / float_scale is not always equal to int_num.
+    We use this function to ensure that the values that are supposed to be
+    integers are indeed integers.
+    """
+
+    for tensor in node_tensors:
+        dtype = model.get_tensor_datatype(tensor)
+        # floats don't need sanitization, skip to the next tensor
+        # (this also avoids unnecessary runtime overhead)
+        if dtype == DataType.FLOAT32:
+            continue
+        current_values = execution_context[tensor]
+        updated_values = current_values
+        has_to_be_rounded = False
+        # TODO: vectorize with numpy
+        for value in np.nditer(current_values):
+            if not dtype.allowed(value):
+                has_to_be_rounded = True
+                break
+        if has_to_be_rounded:
+            updated_values = np.round(current_values)
+            warnings.warn(
+                "The values of tensor {} can't be represented "
+                "with the set FINN datatype ({}), they will be rounded to match the "
+                "FINN datatype.".format(tensor, dtype)
+            )
+        # check if rounded values are not too far from original values
+        max_error = max(np.abs(current_values - updated_values).flatten())
+        if max_error <= get_execution_error_thresh():
+            if check_values is True:
+                # check again if values can now be represented with set finn datatype
+                # TODO: vectorize with numpy
+                for value in np.nditer(updated_values):
+                    if not dtype.allowed(value):
+                        raise Exception(
+                            """Values can't be represented with set
+                                finn datatype ({}) for input {}""".format(
+                                dtype, tensor
+                            )
+                        )
+            execution_context[tensor] = updated_values
+        else:
+            raise Exception(
+                """Rounding error is too high to match set FINN
+            datatype ({}) for input {}""".format(
+                    dtype, tensor
+                )
+            )
+    return execution_context
+
+
 class CppBuilder:
     """Builds the g++ compiler command to produces the executable of the c++ code
     in code_gen_dir which is passed to the function build() of this class."""
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
new file mode 100644
index 0000000000000000000000000000000000000000..853cdd0d44a05426b34bf1db3caa58d9289b2e9e
--- /dev/null
+++ b/src/finn/util/create.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+from finn.core.modelwrapper import ModelWrapper
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+
+
+def hls_random_mlp_maker(layer_spec):
+    """Create an MLP of given specification using HLSCustomOp instances.
+    Generate random weights/thresholds of appropriate size."""
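+    # each entry of layer_spec is a dict with keys mw, mh, simd, pe, idt,
+    # wdt and act, e.g. {"mw": 4, "mh": 4, "simd": 2, "pe": 2,
+    # "idt": DataType.INT2, "wdt": DataType.INT2, "act": DataType.INT2}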
+    ret = []
+    for l in layer_spec:
+        idt = l["idt"]
+        wdt = l["wdt"]
+        mw = l["mw"]
+        mh = l["mh"]
+        act = l["act"]
+        l["W"] = gen_finn_dt_tensor(wdt, (mw, mh))
+        if act is None:
+            # no activation, produce accumulators
+            T = None
+            tdt = None
+            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+                odt = DataType.UINT32
+            else:
+                odt = DataType.INT32
+        else:
+            odt = act
+            (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+            n_steps = act.get_num_possible_values() - 1
+            T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+            # provide non-decreasing thresholds
+            T = np.sort(T, axis=1)
+            # generate thresholds for activation
+            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+                tdt = DataType.UINT32
+                # bias thresholds to be positive
+                T = np.ceil((T + mw) / 2)
+                assert (T >= 0).all()
+            else:
+                tdt = DataType.INT32
+        l["T"] = T
+        l["tdt"] = tdt
+        l["odt"] = odt
+        ret.append(l)
+
+    return hls_mlp_maker(ret)
+
+
+def hls_mlp_maker(layer_spec):
+    """Create an MLP of given specification using HLSCustomOp instances."""
+
+    current_in_name = ""
+    current_out_name = ""
+    i = 0
+
+    graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[])
+
+    model = helper.make_model(graph, producer_name="finn")
+    model = ModelWrapper(model)
+
+    for l in layer_spec:
+        current_W_name = "W_%d" % i
+        current_T_name = "T_%d" % i
+        current_in_name = "act_%d" % i
+        current_out_name = "act_%d" % (i + 1)
+
+        W = l["W"]
+        (mw, mh) = W.shape
+        T = l["T"]
+        pe = l["pe"]
+        simd = l["simd"]
+        wdt = l["wdt"]
+        idt = l["idt"]
+        tdt = l["tdt"]
+        odt = l["odt"]
+
+        if i == 0:
+            global_in = helper.make_tensor_value_info(
+                current_in_name, TensorProto.FLOAT, [1, mw]
+            )
+            model.graph.input.append(global_in)
+
+        if i == len(layer_spec) - 1:
+            global_out = helper.make_tensor_value_info(
+                current_out_name, TensorProto.FLOAT, [1, mh]
+            )
+            model.graph.output.append(global_out)
+
+        # there are two ways to implement bipolar weights and inputs for
+        # StreamingFC:
+        # - specify their datatypes as such
+        # - specify their datatypes as BINARY and use binaryXnorMode
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            # we'll internally convert weights/inputs to binary and specify the
+            # datatypes as such, and also set the binaryXnorMode attribute to 1
+            export_wdt = DataType.BINARY
+            export_idt = DataType.BINARY
+            binary_xnor_mode = 1
+        else:
+            export_wdt = wdt
+            export_idt = idt
+            binary_xnor_mode = 0
+
+        if T is not None:
+            no_act = 0
+            node_inp_list = [current_in_name, current_W_name, current_T_name]
+            if odt == DataType.BIPOLAR:
+                actval = 0
+            else:
+                actval = odt.min()
+        else:
+            # no thresholds
+            node_inp_list = [current_in_name, current_W_name]
+            actval = 0
+            no_act = 1
+        FCLayer_node = helper.make_node(
+            "StreamingFCLayer_Batch",
+            node_inp_list,
+            [current_out_name],
+            domain="finn",
+            backend="fpgadataflow",
+            resType="ap_resource_lut()",
+            MW=mw,
+            MH=mh,
+            SIMD=simd,
+            PE=pe,
+            inputDataType=export_idt.name,
+            weightDataType=export_wdt.name,
+            outputDataType=odt.name,
+            ActVal=actval,
+            binaryXnorMode=binary_xnor_mode,
+            noActivation=no_act,
+        )
+
+        model.graph.node.append(FCLayer_node)
+        model.set_tensor_datatype(current_in_name, idt)
+        model.set_tensor_datatype(current_out_name, odt)
+        model.set_tensor_datatype(current_W_name, wdt)
+        if binary_xnor_mode:
+            # convert bipolar to binary
+            model.set_initializer(current_W_name, (W + 1) / 2)
+        else:
+            model.set_initializer(current_W_name, W)
+        if T is not None:
+            model.set_tensor_datatype(current_T_name, tdt)
+            model.set_initializer(current_T_name, T)
+        i += 1
+
+    return model
diff --git a/src/finn/util/onnx.py b/src/finn/util/onnx.py
index b9932111d86d7206b23e1d0e49a6aa8451f8ba24..4d7cdd126ededac887639a932c2021ef5f081c02 100644
--- a/src/finn/util/onnx.py
+++ b/src/finn/util/onnx.py
@@ -28,6 +28,7 @@
 
 import numpy as np
 import onnx
+import finn.core.data_layout as DataLayout
 
 
 def valueinfo_to_tensor(vi):
@@ -37,3 +38,38 @@ def valueinfo_to_tensor(vi):
     return np.zeros(
         dims, dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[vi.type.tensor_type.elem_type]
     )
+
+
+def nchw_to_nhwc(t, model, idx, reverse=False):
+    """Converts between NCHW <-> NHWC layouts for tensor t by inserting a transpose. 
+    If reverse=False, t is assumed NCHW and we insert transpose to convert NCHW -> NHWC
+    If reverse=True, t is assumed NHWC and we insert transpose to convert NHWC -> NCHW.
+    """
+    graph = model.graph
+    # create new NHWC tensor
+    t_shape = model.get_tensor_shape(t)
+    bs = t_shape[0]
+    ch = t_shape[1]
+    height = t_shape[2]
+    width = t_shape[3]
+    t_trans = onnx.helper.make_tensor_value_info(
+        model.make_new_valueinfo_name(),
+        onnx.TensorProto.FLOAT,
+        (bs, height, width, ch),  # NHWC
+    )
+    graph.value_info.append(t_trans)
+    dt = model.get_tensor_datatype(t)
+    t_trans = t_trans.name
+    model.set_tensor_datatype(t_trans, dt)
+    model.set_tensor_layout(t_trans, DataLayout.NHWC)
+    # NCHW <-> NHWC transpose
+    if reverse:
+        t_trans_node = onnx.helper.make_node(
+            "Transpose", [t_trans], [t], perm=[0, 3, 1, 2]
+        )
+    else:
+        t_trans_node = onnx.helper.make_node(
+            "Transpose", [t], [t_trans], perm=[0, 2, 3, 1]
+        )
+    graph.node.insert(idx, t_trans_node)
+    return t_trans
diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py
index d9e244422065314ceb790dc6719b57688ff76828..a4400f7bd7e75549189f081ce255fd67c49b3746 100644
--- a/src/finn/util/vcd.py
+++ b/src/finn/util/vcd.py
@@ -162,16 +162,23 @@ def _get_stats(x):
     return (x[0], get_stream_if_stats(x[1], x[0]))
 
 
-def get_all_stream_if_stats(vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}"):
+def get_all_stream_if_stats(
+    vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}", num_workers=None
+):
     """Return a list of streaming interface stats, sorted by the percentage
-    for the given sort_by key. If stream_ifs is None, all streamin interface
+    for the given sort_by key. If stream_ifs is None, all streaming interface
     stats will be returned, otherwise treated as a list of interface names to
-    return the stats for."""
+    return the stats for.
+    By default, the number of parallel workers is taken from the environment
+    variable NUM_DEFAULT_WORKERS; this can be overridden on a per-call basis
+    via the optional num_workers parameter.
+    """
 
     if stream_ifs is None:
         stream_ifs = list_stream_if(vcd_file)
 
-    with mp.Pool(get_num_default_workers()) as p:
+    if num_workers is None:
+        num_workers = get_num_default_workers()
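+    # e.g. get_all_stream_if_stats("trace.vcd", num_workers=4) caps the pool
+    # at 4 worker processes regardless of NUM_DEFAULT_WORKERS (hypothetical
+    # trace file name)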
+
+    with mp.Pool(num_workers) as p:
         stream_ifs = map(lambda x: (x, vcd_file), stream_ifs)
         all_stats = p.map(_get_stats, stream_ifs)
 
diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b6df3940cfeeed292345382471719c49f725de6
--- /dev/null
+++ b/src/finn/util/vivado.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import subprocess
+import stat
+from finn.util.basic import get_remote_vivado
+
+
+def which(program):
+    "Python equivalent of the shell cmd 'which'."
+
+    # source:
+    # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
+    def is_exe(fpath):
+        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+
+    fpath, fname = os.path.split(program)
+    if fpath:
+        if is_exe(program):
+            return program
+    else:
+        for path in os.environ["PATH"].split(os.pathsep):
+            exe_file = os.path.join(path, program)
+            if is_exe(exe_file):
+                return exe_file
+
+    return None
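+
+# Usage sketch: which("vivado") returns the absolute path when vivado is on
+# PATH and None otherwise, mirroring the shell command `which vivado`.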
+
+
+def out_of_context_synth(
+    verilog_dir,
+    top_name,
+    fpga_part="xczu3eg-sbva484-1-e",
+    clk_name="ap_clk_0",
+    clk_period_ns=5.0,
+    remote_server=get_remote_vivado(),
+):
+    "Run out-of-context Vivado synthesis, return resources and slack."
+
+    # ensure that the OHMYXILINX envvar is set
+    if "OHMYXILINX" not in os.environ:
+        raise Exception("The environment variable OHMYXILINX is not defined.")
+    # ensure that vivado is in PATH: source $VIVADO_PATH/settings64.sh
+    if which("vivado") is None:
+        raise Exception("vivado is not in PATH, ensure settings64.sh is sourced.")
+    omx_path = os.environ["OHMYXILINX"]
+    if remote_server is None:
+        script = "vivadocompile.sh"
+    else:
+        script = "vivadoprojgen.sh"
+    # vivadocompile.sh <top-level-entity> <clock-name (optional)> <fpga-part (optional)>
+    call_omx = "zsh %s/%s %s %s %s %f" % (
+        omx_path,
+        script,
+        top_name,
+        clk_name,
+        fpga_part,
+        float(clk_period_ns),
+    )
+    call_omx = call_omx.split()
+    proc = subprocess.Popen(
+        call_omx, cwd=verilog_dir, stdout=subprocess.PIPE, env=os.environ
+    )
+    proc.communicate()
+
+    vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name)
+    res_counts_path = vivado_proj_folder + "/res.txt"
+    if remote_server is not None:
+        print("Using remote Vivado OOC synth, remote server %s" % remote_server)
+        run_synth = """
+#!/bin/bash
+which vivado;
+cd %s;
+vivado -mode tcl -source %s.tcl -tclargs %s;
+cat %s
+        """ % (
+            vivado_proj_folder,
+            top_name,
+            top_name,
+            res_counts_path,
+        )
+        with open(vivado_proj_folder + "/run.sh", "w") as f:
+            f.write(run_synth)
+        st = os.stat(vivado_proj_folder + "/run.sh")
+        os.chmod(vivado_proj_folder + "/run.sh", st.st_mode | stat.S_IEXEC)
+        # note that this assumes the same temp folder can be created on the
+        # remote server
+        # note we set target path as / due to use of -R (relative)
+        remote_server_uri = remote_server + ":/"
+        copy_files = "rsync -avzR %s %s" % (verilog_dir + "/", remote_server_uri)
+        copy_files = copy_files.split()
+        proc = subprocess.Popen(copy_files, cwd=verilog_dir, env=os.environ)
+        proc.communicate()
+        vivado_cmd = "bash -ic %s/run.sh" % vivado_proj_folder
+        run_vivado = ["ssh", "-t", remote_server, vivado_cmd]
+        proc = subprocess.Popen(run_vivado, cwd=verilog_dir, env=os.environ)
+        proc.communicate()
+        remote_server_result = remote_server + ":" + res_counts_path
+        copy_results = "rsync -avz %s %s" % (remote_server_result, res_counts_path)
+        copy_results = copy_results.split()
+        proc = subprocess.Popen(copy_results, cwd=verilog_dir, env=os.environ)
+        proc.communicate()
+
+    with open(res_counts_path, "r") as myfile:
+        res_data = myfile.read().split("\n")
+    ret = {}
+    ret["vivado_proj_folder"] = vivado_proj_folder
+    for res_line in res_data:
+        res_fields = res_line.split("=")
+        print(res_fields)
+        try:
+            ret[res_fields[0]] = float(res_fields[1])
+        except (ValueError, IndexError):
+            ret[res_fields[0]] = 0
+    if ret["WNS"] == 0:
+        ret["fmax_mhz"] = 0
+    else:
+        ret["fmax_mhz"] = 1000.0 / (clk_period_ns - ret["WNS"])
+    return ret
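+
+
+# Interpreting the returned dict (a sketch with a hypothetical path): the
+# resource keys mirror the name=value lines of oh-my-xilinx's res.txt, and
+# fmax follows from the worst negative slack as 1000 / (clk_period_ns - WNS).
+#
+#   ret = out_of_context_synth("/path/to/verilog", "my_top")
+#   print(ret["fmax_mhz"], ret["vivado_proj_folder"])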
diff --git a/tests/brevitas/test_brevitas_QConv2d.py b/tests/brevitas/test_brevitas_QConv2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..198f1e7961a9e160589989b8b34b45b5fda53817
--- /dev/null
+++ b/tests/brevitas/test_brevitas_QConv2d.py
@@ -0,0 +1,76 @@
+import pytest
+import os
+import numpy as np
+import torch
+import brevitas.onnx as bo
+from brevitas.nn import QuantConv2d
+from brevitas.core.restrict_val import RestrictValueType
+from brevitas.core.quant import QuantType
+from brevitas.core.scaling import ScalingImplType
+from brevitas.core.stats import StatsOp
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.onnx_exec as oxe
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import gen_finn_dt_tensor
+
+export_onnx_path = "test_brevitas_conv.onnx"
+
+
+@pytest.mark.parametrize("dw", [False, True])
+@pytest.mark.parametrize("in_channels", [32])
+def test_brevitas_QConv2d(dw, in_channels):
+    ishape = (1, 32, 111, 111)
+    if dw is True:
+        groups = in_channels
+        out_channels = in_channels
+        kernel_size = 3
+        padding = 1
+        stride = 1
+        w_shape = (32, 1, 3, 3)
+    else:
+        groups = 1
+        out_channels = 64
+        kernel_size = 1
+        padding = 0
+        stride = 1
+        w_shape = (64, 32, 1, 1)
+
+    b_conv = QuantConv2d(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        groups=groups,
+        kernel_size=kernel_size,
+        padding=padding,
+        stride=stride,
+        bias=False,
+        bias_quant_type=QuantType.FP,
+        compute_output_bit_width=False,
+        compute_output_scale=False,
+        weight_bit_width=4,
+        weight_quant_type=QuantType.INT,
+        weight_scaling_impl_type=ScalingImplType.STATS,
+        weight_scaling_stats_op=StatsOp.MAX,
+        weight_scaling_per_output_channel=True,
+        weight_restrict_scaling_type=RestrictValueType.LOG_FP,
+        weight_narrow_range=True,
+        weight_scaling_min_val=2e-16,
+    )
+    weight_tensor = gen_finn_dt_tensor(DataType.INT4, w_shape)
+    b_conv.weight = torch.nn.Parameter(torch.from_numpy(weight_tensor).float())
+
+    bo.export_finn_onnx(b_conv, ishape, export_onnx_path)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    inp_tensor = np.random.uniform(low=-1.0, high=1.0, size=ishape).astype(np.float32)
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    inp_tensor = torch.from_numpy(inp_tensor).float()
+    b_conv.eval()
+    expected = b_conv.forward(inp_tensor).detach().numpy()
+
+    assert np.isclose(produced, expected, atol=1e-3).all()
+    os.remove(export_onnx_path)
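+
+
+# Why the two w_shape values above hold (sketch): PyTorch/Brevitas conv
+# weights are laid out as (out_channels, in_channels // groups, kH, kW), so
+# the depthwise case gives (32, 1, 3, 3) and the pointwise case (64, 32, 1, 1).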
diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..e78812b21a03baa6963f1f0efaefdb4c73e4d0db
--- /dev/null
+++ b/tests/brevitas/test_brevitas_avg_pool_export.py
@@ -0,0 +1,103 @@
+import os
+
+import onnx  # noqa
+import torch
+import numpy as np
+import brevitas.onnx as bo
+from brevitas.nn import QuantAvgPool2d
+from brevitas.quant_tensor import pack_quant_tensor
+from brevitas.core.quant import QuantType
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.util.basic import gen_finn_dt_tensor
+import finn.core.onnx_exec as oxe
+
+import pytest
+
+export_onnx_path = "test_brevitas_avg_pool_export.onnx"
+
+
+@pytest.mark.parametrize("kernel_size", [2, 3])
+@pytest.mark.parametrize("stride", [1, 2])
+@pytest.mark.parametrize("signed", [False, True])
+@pytest.mark.parametrize("bit_width", [2, 4])
+@pytest.mark.parametrize("input_bit_width", [4, 8, 32])
+@pytest.mark.parametrize("channels", [2, 4])
+@pytest.mark.parametrize("idim", [7, 8])
+def test_brevitas_avg_pool_export(
+    kernel_size, stride, signed, bit_width, input_bit_width, channels, idim
+):
+    ishape = (1, channels, idim, idim)
+    ibw_tensor = torch.Tensor([input_bit_width])
+
+    b_avgpool = QuantAvgPool2d(
+        kernel_size=kernel_size,
+        stride=stride,
+        signed=signed,
+        min_overall_bit_width=bit_width,
+        max_overall_bit_width=bit_width,
+        quant_type=QuantType.INT,
+    )
+    # call forward pass manually once to cache scale factor and bitwidth
+    input_tensor = torch.from_numpy(np.zeros(ishape)).float()
+    scale = np.ones((1, channels, 1, 1))
+    output_scale = torch.from_numpy(scale).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor
+    )
+    bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor)
+    model = ModelWrapper(export_onnx_path)
+
+    # determine input FINN datatype
+    if signed is True:
+        prefix = "INT"
+    else:
+        prefix = "UINT"
+    dt_name = prefix + str(input_bit_width // 2)
+    dtype = DataType[dt_name]
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    # execution with input tensor using integers and scale = 1
+    # calculate golden output
+    inp = gen_finn_dt_tensor(dtype, ishape)
+    input_tensor = torch.from_numpy(inp).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor
+    )
+    b_avgpool.eval()
+    expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy()
+
+    # finn execution
+    idict = {model.graph.input[0].name: inp}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    assert (expected == produced).all()
+
+    # execution with input tensor using float and scale != 1
+    scale = np.random.uniform(low=0, high=1, size=(1, channels, 1, 1)).astype(
+        np.float32
+    )
+    inp_tensor = inp * scale
+    input_tensor = torch.from_numpy(inp_tensor).float()
+    input_scale = torch.from_numpy(scale).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=input_scale, bit_width=ibw_tensor
+    )
+    # export again to set the scale values correctly
+    bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    b_avgpool.eval()
+    expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy()
+    # finn execution
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+
+    assert np.isclose(expected, produced).all()
+
+    os.remove(export_onnx_path)
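+
+
+# A minimal NumPy reference for the average pooling checked above (a sketch;
+# QuantAvgPool2d additionally quantizes the result):
+#
+#   def avgpool2d_ref(x, k, s):
+#       n, c, h, w = x.shape
+#       oh, ow = (h - k) // s + 1, (w - k) // s + 1
+#       out = np.zeros((n, c, oh, ow), dtype=x.dtype)
+#       for i in range(oh):
+#           for j in range(ow):
+#               out[:, :, i, j] = x[
+#                   :, :, i * s : i * s + k, j * s : j * s + k
+#               ].mean(axis=(2, 3))
+#       return out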
diff --git a/tests/brevitas/test_brevitas_cnv.py b/tests/brevitas/test_brevitas_cnv.py
index c04e16ad1923609c81240235057cc7a190c90ffb..764671bee13710ef1d9fa21aab5ef600075b9b0d 100644
--- a/tests/brevitas/test_brevitas_cnv.py
+++ b/tests/brevitas/test_brevitas_cnv.py
@@ -38,11 +38,11 @@ import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs
 from finn.transformation.double_to_single_float import DoubleToSingleFloat
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_cnv.onnx"
+export_onnx_path = "test_brevitas_cnv.onnx"
 
 
 @pytest.mark.parametrize("abits", [1, 2])
@@ -57,6 +57,9 @@ def test_brevitas_cnv_export_exec(wbits, abits):
     model = model.transform(DoubleToSingleFloat())
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
+    model = model.transform(RemoveStaticGraphInputs())
+    assert len(model.graph.input) == 1
+    assert len(model.graph.output) == 1
     fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
     input_tensor = np.load(fn)["arr_0"].astype(np.float32)
     input_tensor = input_tensor / 255
diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py
index db18d91e3590e896e111c9e38bdc4de43872a98c..9369b25385080875efcb286c02291fc579a15a34 100644
--- a/tests/brevitas/test_brevitas_fc.py
+++ b/tests/brevitas/test_brevitas_fc.py
@@ -39,6 +39,7 @@ import torch
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import RemoveStaticGraphInputs
 from finn.transformation.infer_shapes import InferShapes
 from finn.util.basic import make_build_dir
 from finn.util.test import get_test_model_trained
@@ -63,6 +64,9 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits):
     model = ModelWrapper(finn_onnx)
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
+    model = model.transform(RemoveStaticGraphInputs())
+    assert len(model.graph.input) == 1
+    assert len(model.graph.output) == 1
     # load one of the test vectors
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
diff --git a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py b/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
index b66348a9902802bc65b2a35e8bc3e311cc81e0bc..9c7296b7b3b6d36cfb43b6d9e96e7fba6bbce49a 100644
--- a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
+++ b/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
@@ -12,7 +12,7 @@ import finn.core.onnx_exec as oxe
 from finn.transformation.infer_shapes import InferShapes
 from brevitas.core.quant import QuantType
 
-export_onnx_path = "test_act.onnx"
+export_onnx_path = "test_brevitas_non_scaled_QuantHardTanh_export.onnx"
 
 
 @pytest.mark.parametrize("abits", [1, 2, 4, 8])
diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py
index c9d8f2d812bc7bea1a2fd2598a7711099ad421e6..77974dacb51aa8746ce33f9a490becd35390db5a 100644
--- a/tests/brevitas/test_brevitas_relu_act_export.py
+++ b/tests/brevitas/test_brevitas_relu_act_export.py
@@ -12,7 +12,7 @@ from finn.core.modelwrapper import ModelWrapper
 import finn.core.onnx_exec as oxe
 from finn.transformation.infer_shapes import InferShapes
 
-export_onnx_path = "test_act.onnx"
+export_onnx_path = "test_brevitas_relu_act_export.onnx"
 
 
 @pytest.mark.parametrize("abits", [1, 2, 4, 8])
@@ -23,6 +23,7 @@ export_onnx_path = "test_act.onnx"
 def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type):
     min_val = -1.0
     ishape = (1, 15)
+
     b_act = QuantReLU(
         bit_width=abits,
         max_val=max_val,
@@ -67,3 +68,60 @@ scaling_impl.learned_value": torch.tensor(
 
     assert np.isclose(produced, expected, atol=1e-3).all()
     os.remove(export_onnx_path)
+
+
+@pytest.mark.parametrize("abits", [1, 2, 4, 8])
+@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)])
+@pytest.mark.parametrize("scaling_per_channel", [True, False])
+def test_brevitas_act_export_relu_imagenet(abits, max_val, scaling_per_channel):
+    out_channels = 32
+    ishape = (1, out_channels, 1, 1)
+    min_val = -1.0
+    b_act = QuantReLU(
+        bit_width=abits,
+        quant_type=QuantType.INT,
+        scaling_impl_type=ScalingImplType.PARAMETER,
+        scaling_per_channel=scaling_per_channel,
+        restrict_scaling_type=RestrictValueType.LOG_FP,
+        scaling_min_val=2e-16,
+        max_val=6.0,
+        return_quant_tensor=True,
+        per_channel_broadcastable_shape=(1, out_channels, 1, 1),
+    )
+    if scaling_per_channel is True:
+        rand_tensor = 2 * torch.rand((1, out_channels, 1, 1))
+    else:
+        rand_tensor = torch.tensor(1.2398)
+    checkpoint = {
+        "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\
+scaling_impl.learned_value": rand_tensor.type(
+            torch.FloatTensor
+        )
+    }
+    b_act.load_state_dict(checkpoint)
+    bo.export_finn_onnx(b_act, ishape, export_onnx_path)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(
+        np.float32
+    )
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    inp_tensor = torch.from_numpy(inp_tensor).float()
+    b_act.eval()
+    expected = b_act.forward(inp_tensor).tensor.detach().numpy()
+    if not np.isclose(produced, expected, atol=1e-3).all():
+        print(abits, max_val)
+        print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach())
+        if abits < 5:
+            print(
+                "thres:",
+                ", ".join(["{:8.4f}".format(x) for x in b_act.export_thres[0]]),
+            )
+        print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]]))
+        print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]]))
+        print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]]))
+
+    assert np.isclose(produced, expected, atol=1e-3).all()
+    os.remove(export_onnx_path)
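+
+
+# Sketch of why per_channel_broadcastable_shape is (1, out_channels, 1, 1):
+# the learned per-channel scale must broadcast against NCHW activations
+# along the channel axis, e.g. torch.rand(1, 32, 8, 8) / torch.rand(1, 32, 1, 1)
+# leaves the activation shape unchanged.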
diff --git a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py b/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
index d499f1517341477eca9915245da9ad12c346c5a9..e0ec82ebed44e2e984be9f62e02bc1721a7f9c33 100644
--- a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
+++ b/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
@@ -12,7 +12,7 @@ from finn.core.modelwrapper import ModelWrapper
 import finn.core.onnx_exec as oxe
 from finn.transformation.infer_shapes import InferShapes
 
-export_onnx_path = "test_act.onnx"
+export_onnx_path = "test_brevitas_scaled_QHardTanh_export.onnx"
 
 
 @pytest.mark.parametrize("abits", [2, 4, 8])
diff --git a/tests/core/test_basic_onnx_exec.py b/tests/core/test_basic_onnx_exec.py
index a7b6da9965aa5912870812a8c1f8d6da2ee0d181..ddb2cbfc40c7647970f0c51ecb95340e7d1dddae 100644
--- a/tests/core/test_basic_onnx_exec.py
+++ b/tests/core/test_basic_onnx_exec.py
@@ -35,6 +35,8 @@ import onnx.numpy_helper as np_helper
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
+from finn.core.datatype import DataType
+from finn.util.basic import gen_finn_dt_tensor
 
 
 def test_mnist_onnx_download_extract_run():
@@ -47,9 +49,50 @@ def test_mnist_onnx_download_extract_run():
     raw_o = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/output_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     output_tensor = onnx.load_tensor_from_string(raw_o)
-    # run using FINN-based execution
+    # run using FINN-based execution (full graph)
     input_dict = {"Input3": np_helper.to_array(input_tensor)}
-    output_dict = oxe.execute_onnx(model, input_dict)
+    output_dict = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
     assert np.isclose(
         np_helper.to_array(output_tensor), output_dict["Plus214_Output_0"], atol=1e-3
     ).all()
+    # test subgraph execution
+    start_node = model.graph.node[1]
+    end_node = model.graph.node[3]
+    subgraph_i_dict = {start_node.input[0]: output_dict[start_node.input[0]]}
+    subgraph_o_dict = oxe.execute_onnx(
+        model,
+        subgraph_i_dict,
+        return_full_exec_context=True,
+        start_node=start_node,
+        end_node=end_node,
+    )
+    assert np.isclose(
+        subgraph_o_dict[end_node.output[0]], output_dict[end_node.output[0]], atol=1e-3
+    ).all()
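+    # note: execution runs from start_node through end_node inclusive, which
+    # is why end_node's output tensor is available in subgraph_o_dict above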
+
+
+def test_onnx_exec_internal_rounding():
+    inp0 = onnx.helper.make_tensor_value_info("inp0", onnx.TensorProto.FLOAT, [2, 2])
+    inp1 = onnx.helper.make_tensor_value_info("inp1", onnx.TensorProto.FLOAT, [1])
+    outp = onnx.helper.make_tensor_value_info("outp", onnx.TensorProto.FLOAT, [2, 2])
+    mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"])
+    graph = onnx.helper.make_graph(
+        nodes=[mul_node], name="mul_graph", inputs=[inp0, inp1], outputs=[outp]
+    )
+
+    model = onnx.helper.make_model(graph, producer_name="mul-model")
+    model = ModelWrapper(model)
+    idt = DataType.INT2
+    model.set_tensor_datatype("inp0", idt)
+    model.set_tensor_datatype("inp1", idt)
+    model = model.transform(InferShapes())
+
+    mul_value = np.asarray([-1], dtype=np.float32)
+    inp_int = gen_finn_dt_tensor(idt, [2, 2])
+    scale = np.random.uniform(low=0, high=1, size=(2, 2)).astype(np.float32)
+    inp_rounded = (inp_int * scale) / (scale + 1e-7)
+    input_dict = {"inp0": inp_rounded, "inp1": mul_value}
+    output_dict = oxe.execute_onnx(model, input_dict)
+    produced = output_dict["outp"]
+    expected = np.multiply(inp_int, mul_value)
+    assert (produced == expected).all()
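+
+
+# What this test relies on (sketch): FINN execution rounds tensors annotated
+# with an integer DataType back to integers before running a node, so the
+# small perturbation from (inp_int * scale) / (scale + 1e-7) is undone:
+#
+#   x = inp_int * 1.0000001          # slightly off-integer values
+#   assert (np.round(x) == inp_int).all()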
diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py
index 5fa9b23bad5c5b67f65530c55f862f889c07b1ac..0fb7ae42f3bd556755f81a02be6c71fd73ffc519 100644
--- a/tests/core/test_modelwrapper.py
+++ b/tests/core/test_modelwrapper.py
@@ -36,7 +36,7 @@ import finn.core.data_layout as DataLayout
 from finn.core.modelwrapper import ModelWrapper
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_modelwrapper.onnx"
 
 
 def test_modelwrapper():
diff --git a/tests/custom_op/test_xnorpopcountmatmul.py b/tests/custom_op/test_xnorpopcountmatmul.py
index 37d9b7e5968bdb70023be9b70515410e941f51ce..745b782d418129d96e21c327a49de04d53aa7c48 100644
--- a/tests/custom_op/test_xnorpopcountmatmul.py
+++ b/tests/custom_op/test_xnorpopcountmatmul.py
@@ -47,7 +47,7 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline.sign_to_thres import ConvertSignToThres
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_xnorpopcountmatmul.onnx"
 
 
 def test_xnorpopcountmatmul():
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index c3359dcc82650bf0e9e8a5bc5276f5ca770ee96c..ebca224389550929cebd542cf4201cf62481a169 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -42,7 +42,12 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
 from finn.transformation.streamline import Streamline
 from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
@@ -72,6 +77,7 @@ from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.core.throughput_test import throughput_test_rtlsim
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -96,6 +102,7 @@ def test_end2end_cnv_w1a1_import_and_tidy():
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
+    model = model.transform(RemoveStaticGraphInputs())
     model.save(build_dir + "/end2end_cnv_w1a1_tidy.onnx")
 
 
@@ -107,6 +114,7 @@ def test_end2end_cnv_w1a1_streamline():
     model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
     model = model.transform(ConvertBipolarMatMulToXnorPopcount())
     model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
     model.save(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
 
 
@@ -142,15 +150,15 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker():
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # each tuple is (PE, SIMD, in_fifo_depth) for a layer
     folding = [
-        (16, 3, 128),
-        (32, 32, 128),
-        (16, 32, 128),
-        (16, 32, 128),
-        (4, 32, 81),
+        (16, 3, 256),
+        (32, 32, 256),
+        (16, 32, 256),
+        (16, 32, 256),
+        (4, 32, 214),
         (1, 32, 2),
-        (1, 4, 2),
-        (1, 8, 128),
-        (5, 1, 3),
+        (1, 4, 126),
+        (1, 8, 62),
+        (5, 1, 6),
     ]
     for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
         fcl_inst = getCustomOp(fcl)
@@ -159,10 +167,12 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker():
         fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
 
     swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
+    swg_idepth = [2, 51, 9, 106, 2, 2]
     for i in range(len(swg_layers)):
         swg_inst = getCustomOp(swg_layers[i])
         simd = folding[i][1]
         swg_inst.set_nodeattr("SIMD", simd)
+        swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
 
     model = model.transform(InsertDWC())
     model = model.transform(InsertFIFO())
@@ -221,6 +231,20 @@ def test_end2end_cnv_w1a1_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+@pytest.mark.vivado
+def test_end2end_cnv_w1a1_throughput_test_rtlsim():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx"
+    )
+    model.set_metadata_prop("rtlsim_trace", "rtlsim_trace.vcd")
+    # os.environ["RTLSIM_TRACE_DEPTH"] = "4"
+    # run through IP-stitched rtlsim with increasing batch sizes and
+    # check the number of cycles it takes to execute
+    ret = throughput_test_rtlsim(model, 10)
+    # TODO check for expected performance
+    assert ret["cycles"] > 0
+
+
 @pytest.mark.vivado
 def test_end2end_cnv_w1a1_verify_all():
     # use the streamlined model as the "golden" model for right answers
@@ -298,7 +322,7 @@ def test_end2end_cnv_w1a1_synth_pynq_project():
 
 def test_end2end_cnv_w1a1_make_driver():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_synth.onnx")
-    model = model.transform(MakePYNQDriver())
+    model = model.transform(MakePYNQDriver(platform="zynq"))
     model.save(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx")
 
 
diff --git a/tests/end2end/test_end2end_cnv_w2a2.py b/tests/end2end/test_end2end_cnv_w2a2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e34990007677ce1b8e0a9ae4a1781d4527ee040
--- /dev/null
+++ b/tests/end2end/test_end2end_cnv_w2a2.py
@@ -0,0 +1,384 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+
+import numpy as np
+
+# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
+# import pytorch before onnx, so we make sure to import onnx first
+import onnx  # NOQA
+
+import pytest
+import pkg_resources as pk
+from finn.custom_op.registry import getCustomOp
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
+from finn.transformation.streamline import Streamline
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.util.basic import pynq_part_map
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.core.throughput_test import throughput_test_rtlsim
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
+mem_mode = "decoupled"
+
+
+def test_end2end_cnv_w2a2_export():
+    import brevitas.onnx as bo
+
+    cnv = get_test_model_trained("CNV", 2, 2)
+    bo.export_finn_onnx(
+        cnv, (1, 3, 32, 32), build_dir + "/end2end_cnv_w2a2_export.onnx"
+    )
+
+
+def test_end2end_cnv_w2a2_import_and_tidy():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_export.onnx")
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(RemoveStaticGraphInputs())
+    model.save(build_dir + "/end2end_cnv_w2a2_tidy.onnx")
+
+
+def test_end2end_cnv_w2a2_streamline():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_tidy.onnx")
+    model = model.transform(Streamline())
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(MakeMaxPoolNHWC())
+    model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
+    model.save(build_dir + "/end2end_cnv_w2a2_streamlined.onnx")
+
+
+def test_end2end_cnv_w2a2_convert_to_hls_layers():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_streamlined.onnx"
+    )
+    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferConvInpGen())
+    model = model.transform(to_hls.InferStreamingMaxPool())
+    model = model.transform(RemoveCNVtoFCFlatten())
+    model.save(build_dir + "/end2end_cnv_w2a2_hls_layers.onnx")
+
+
+def test_end2end_cnv_w2a2_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_hls_layers.onnx"
+    )
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx")
+
+
+def test_end2end_cnv_w2a2_fold_and_tlastmarker():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx"
+    )
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # each tuple is (PE, SIMD, in_fifo_depth, ram_style) for a layer
+    folding = [
+        (8, 3, 256, "auto"),
+        (16, 16, 256, "auto"),
+        (8, 16, 256, "auto"),
+        (8, 16, 256, "block"),
+        (4, 8, 214, "auto"),
+        (1, 8, 2, "auto"),
+        (1, 2, 126, "distributed"),
+        (2, 2, 62, "block"),
+        (5, 1, 6, "distributed"),
+    ]
+    for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+
+    swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
+    swg_idepth = [2, 51, 9, 106, 2, 2]
+    for i in range(len(swg_layers)):
+        swg_inst = getCustomOp(swg_layers[i])
+        simd = folding[i][1]
+        swg_inst.set_nodeattr("SIMD", simd)
+        swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
+
+    model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO())
+    model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(AnnotateResources("estimate"))
+    model.save(build_dir + "/end2end_cnv_w2a2_folded.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_gen_hls_ip():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_folded.onnx")
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(AnnotateResources("hls"))
+    model.save(build_dir + "/end2end_cnv_w2a2_ipgen.onnx")
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_ip_stitch():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen.onnx")
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model.save(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx")
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_verify_dataflow_part():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx")
+    x = np.zeros((1, 32, 32, 3), dtype=np.float32)
+    inp_name = model.graph.input[0].name
+    out_name = model.graph.output[0].name
+    inp_dict = {inp_name: x}
+    # cppsim
+    model = model.transform(PrepareCppSim())
+    model = model.transform(CompileCppSim())
+    model = model.transform(SetExecMode("cppsim"))
+    model.save(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx")
+    ret_cppsim = execute_onnx(model, inp_dict, True)
+    res_cppsim = ret_cppsim[out_name]
+    # node-by-node rtlsim
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareRTLSim())
+    model.save(build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx")
+    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
+    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
+    # whole-network (ip-stitched) rtlsim
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.save(build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx")
+    # this is a particularly long-running test, set liveness thr. to unlimited
+    os.environ["LIVENESS_THRESHOLD"] = "-1"
+    ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
+    res_rtlsim_whole = ret_rtlsim_whole[out_name]
+    assert np.isclose(res_cppsim, res_rtlsim_nodebynode).all()
+    assert np.isclose(res_cppsim, res_rtlsim_whole).all()
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_throughput_test_rtlsim():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx"
+    )
+    model.set_metadata_prop("rtlsim_trace", "rtlsim_trace.vcd")
+    # os.environ["RTLSIM_TRACE_DEPTH"] = "4"
+    # run through IP-stitched rtlsim with increasing batch sizes and
+    # check the number of cycles it takes to execute
+    ret = throughput_test_rtlsim(model, 10)
+    # TODO check for expected performance
+    assert ret["cycles"] > 0
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_verify_all():
+    # use the streamlined model as the "golden" model for right answers
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    # load one of the test vectors
+    fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
+    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    input_tensor = input_tensor / 255
+    assert input_tensor.shape == (1, 3, 32, 32)
+    x = input_tensor
+    # x = np.zeros(ishape, dtype=np.float32)
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    # produce results with cppsim
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx")
+    sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx")
+    ret_cppsim = execute_onnx(parent_model, {iname: x}, True)
+    y_cppsim = ret_cppsim[oname]
+    # produce results with node-by-node rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx"
+    )
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx"
+    )
+    ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
+    # produce results with whole-network (stitched ip) rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx"
+    )
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx"
+    )
+    # this is a particularly long-running test, set liveness thr. to unlimited
+    os.environ["LIVENESS_THRESHOLD"] = "-1"
+    ret_whole_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_whole_rtlsim = ret_whole_rtlsim[oname]
+    assert np.isclose(y_golden, y_cppsim).all()
+    assert np.isclose(y_golden, y_nodebynode_rtlsim).all()
+    assert np.isclose(y_golden, y_whole_rtlsim).all()
+    assert np.argmax(y_golden) == 3
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_make_pynq_proj():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx")
+    model = model.transform(MakePYNQProject(test_pynq_board))
+    model.save(build_dir + "/end2end_cnv_w2a2_pynq_project.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_synth_pynq_project():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_pynq_project.onnx"
+    )
+    model = model.transform(SynthPYNQProject())
+    model = model.transform(AnnotateResources("synth"))
+    model.save(build_dir + "/end2end_cnv_w2a2_synth.onnx")
+
+
+def test_end2end_cnv_w2a2_make_driver():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_synth.onnx")
+    model = model.transform(MakePYNQDriver(platform="zynq"))
+    model.save(build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx")
+
+
+def test_end2end_cnv_w2a2_deploy_on_pynq():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx"
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # no fault for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_cnv_w2a2_run_on_pynq():
+    # use the streamlined model as the "golden" model for right answers
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    # load one of the test vectors
+    fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
+    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    input_tensor = input_tensor / 255
+    assert input_tensor.shape == (1, 3, 32, 32)
+    x = input_tensor
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # produce results with cppsim
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx")
+        sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx")
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+        assert np.argmax(y) == 3
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py
index 13758e01e1df96a79658f5ebc7501c9fb43d0882..b827cbb1c31cc84de9fa5d4df4d6b23e02a02a5f 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -63,7 +63,12 @@ from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
 )
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
@@ -72,6 +77,7 @@ from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.core.throughput_test import throughput_test_rtlsim
 import finn.util.vcd as vcd
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
@@ -97,12 +103,14 @@ def test_end2end_tfc_w1a1_import_and_tidy():
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
+    model = model.transform(RemoveStaticGraphInputs())
     model.save(build_dir + "/end2end_tfc_w1a1_tidy.onnx")
 
 
 def test_end2end_tfc_w1a1_streamline():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_tidy.onnx")
     model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
     model.save(build_dir + "/end2end_tfc_w1a1_streamlined.onnx")
 
 
@@ -225,6 +233,21 @@ def test_end2end_tfc_w1a1_verify_fifo_fullness():
     )
 
 
+@pytest.mark.vivado
+def test_end2end_tfc_w1a1_throughput_test_rtlsim():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx"
+    )
+    # run through IP-stitched rtlsim with increasing batch sizes and
+    # check the number of cycles it takes to execute
+    ret = throughput_test_rtlsim(model, 1)
+    assert ret["cycles"] == 205
+    ret = throughput_test_rtlsim(model, 10)
+    assert ret["cycles"] == 844
+    ret = throughput_test_rtlsim(model, 100)
+    assert ret["cycles"] == 7234
+
+
 @pytest.mark.vivado
 def test_end2end_tfc_w1a1_verify_all():
     # use the streamlined model as the "golden" model for right answers
@@ -296,7 +319,7 @@ def test_end2end_tfc_w1a1_synth_pynq_project():
 
 def test_end2end_tfc_w1a1_make_driver():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_synth.onnx")
-    model = model.transform(MakePYNQDriver())
+    model = model.transform(MakePYNQDriver(platform="zynq"))
     model.save(build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx")
 
 
diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py
index d4c005a86580fb36e735beb00717fcfdffff21e5..755650e3d4da6947a93495fd5bbe0464cf485193 100644
--- a/tests/end2end/test_end2end_tfc_w1a2.py
+++ b/tests/end2end/test_end2end_tfc_w1a2.py
@@ -61,7 +61,12 @@ from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
 )
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
@@ -93,12 +98,14 @@ def test_end2end_tfc_w1a2_import_and_tidy():
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
+    model = model.transform(RemoveStaticGraphInputs())
     model.save(build_dir + "/end2end_tfc_w1a2_tidy.onnx")
 
 
 def test_end2end_tfc_w1a2_streamline():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_tidy.onnx")
     model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
     model.save(build_dir + "/end2end_tfc_w1a2_streamlined.onnx")
 
 
@@ -268,7 +275,7 @@ def test_end2end_tfc_w1a2_synth_pynq_project():
 
 def test_end2end_tfc_w1a2_make_driver():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_synth.onnx")
-    model = model.transform(MakePYNQDriver())
+    model = model.transform(MakePYNQDriver(platform="zynq"))
     model.save(build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx")
 
 
diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py
index 19d3f86e046658c4080d71984df1cff74008adab..4b2dd9ef01850897d95ede1214f87e9aa5b79f63 100644
--- a/tests/end2end/test_end2end_tfc_w2a2.py
+++ b/tests/end2end/test_end2end_tfc_w2a2.py
@@ -61,7 +61,12 @@ from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
 )
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
@@ -93,12 +98,14 @@ def test_end2end_tfc_w2a2_import_and_tidy():
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
+    model = model.transform(RemoveStaticGraphInputs())
     model.save(build_dir + "/end2end_tfc_w2a2_tidy.onnx")
 
 
 def test_end2end_tfc_w2a2_streamline():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_tidy.onnx")
     model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
     model.save(build_dir + "/end2end_tfc_w2a2_streamlined.onnx")
 
 
@@ -268,7 +275,7 @@ def test_end2end_tfc_w2a2_synth_pynq_project():
 
 def test_end2end_tfc_w2a2_make_driver():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_synth.onnx")
-    model = model.transform(MakePYNQDriver())
+    model = model.transform(MakePYNQDriver(platform="zynq"))
     model.save(build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx")
 
 
diff --git a/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py b/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py
new file mode 100644
index 0000000000000000000000000000000000000000..25cafcfd4c552fb368cbaca2d1d2714cf2d14011
--- /dev/null
+++ b/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py
@@ -0,0 +1,246 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import pytest
+import numpy as np
+
+# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
+# import pytorch before onnx, so we make sure to import onnx first
+import onnx  # NOQA
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.streamline.absorb as absorb
+from finn.core.onnx_exec import execute_onnx
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+from finn.transformation.fold_constants import FoldConstants
+
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline import Streamline
+from finn.util.basic import pynq_part_map
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+import pkg_resources as pk
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
+mem_mode = "decoupled"
+
+
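+# The tests below are stages of a single end-to-end flow; each stage loads the
+# ONNX checkpoint saved by the previous one (or skips if it is missing).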
+def test_end2end_zynqbuild_cnv_w1a1_export():
+    import brevitas.onnx as bo
+
+    cnv = get_test_model_trained("CNV", 1, 1)
+    bo.export_finn_onnx(
+        cnv, (1, 3, 32, 32), build_dir + "/end2end_zynqbuild_cnv_w1a1_export.onnx"
+    )
+
+
+def test_end2end_zynqbuild_cnv_w1a1_import_and_tidy():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_export.onnx"
+    )
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(RemoveStaticGraphInputs())
+    model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_tidy.onnx")
+
+
+def test_end2end_zynqbuild_cnv_w1a1_streamline():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_tidy.onnx"
+    )
+    model = model.transform(Streamline())
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(MakeMaxPoolNHWC())
+    model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(ConvertBipolarMatMulToXnorPopcount())
+    model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
+    model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_streamlined.onnx")
+
+
+def test_end2end_zynqbuild_cnv_w1a1_convert_to_hls_layers():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_streamlined.onnx"
+    )
+    model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferConvInpGen())
+    model = model.transform(to_hls.InferStreamingMaxPool())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(RemoveCNVtoFCFlatten())
+    model = model.transform(InferDataLayouts())
+    model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_hls_layers.onnx")
+
+
+def test_end2end_zynqbuild_cnv_w1a1_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_hls_layers.onnx"
+    )
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_model.onnx")
+
+
+def test_end2end_zynqbuild_cnv_w1a1_fold():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_model.onnx"
+    )
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # each tuple is (PE, SIMD, in_fifo_depth) for a layer
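+    # (PE = neuron/output parallelism, SIMD = synapse/input parallelism per
+    # StreamingFCLayer; the FIFO depths size the inter-layer stream buffers)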
+    folding = [
+        (16, 3, 256),
+        (32, 32, 256),
+        (16, 32, 256),
+        (16, 32, 256),
+        (4, 32, 214),
+        (1, 32, 2),
+        (1, 4, 126),
+        (1, 8, 62),
+        (5, 1, 6),
+    ]
+    for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
+
+    swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
+    swg_idepth = [2, 51, 9, 106, 2, 2]
+    for i in range(len(swg_layers)):
+        swg_inst = getCustomOp(swg_layers[i])
+        simd = folding[i][1]
+        swg_inst.set_nodeattr("SIMD", simd)
+        swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
+    model = model.transform(AnnotateResources("estimate"))
+    model = model.transform(AnnotateCycles())
+    model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_folded.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_zynqbuild_cnv_w1a1_build():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_folded.onnx"
+    )
+    model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns))
+    model = model.transform(AnnotateResources("synth"))
+    model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_build.onnx")
+
+
+def test_end2end_zynqbuild_cnv_w1a1_deploy_on_pynq():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_build.onnx"
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_zynqbuild_cnv_w1a1_run_on_pynq():
+    # use the streamlined model as the reference ("golden") model
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    # load one of the test vectors
+    fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
+    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    input_tensor = input_tensor / 255
+    assert input_tensor.shape == (1, 3, 32, 32)
+    x = input_tensor
+    # x = np.zeros(ishape, dtype=np.float32)
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up the parent+child graph to test; the model from the previous step
+    # serves as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_cnv_w1a1_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # link the deployed child model into the parent graph for remote execution
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(
+            build_dir + "/end2end_zynqbuild_cnv_w1a1_pynq_deploy.onnx"
+        )
+        sdp_node.set_nodeattr(
+            "model", build_dir + "/end2end_zynqbuild_cnv_w1a1_pynq_deploy.onnx"
+        )
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+        assert np.argmax(y) == 3
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py b/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff2af70731d9248dd2593db5be9e465fa86157dd
--- /dev/null
+++ b/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+from pkgutil import get_data
+
+import pytest
+
+import numpy as np
+
+# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
+# import pytorch before onnx, so we make sure to import onnx first
+import onnx  # NOQA
+import onnx.numpy_helper as nph
+
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.streamline.absorb as absorb
+from finn.core.onnx_exec import execute_onnx
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+from finn.transformation.fold_constants import FoldConstants
+
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline import Streamline
+from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
+from finn.util.basic import pynq_part_map
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
+mem_mode = "decoupled"
+
+
+def test_end2end_zynqbuild_tfc_w1a1_export():
+    import brevitas.onnx as bo
+
+    tfc = get_test_model_trained("TFC", 1, 1)
+    bo.export_finn_onnx(
+        tfc, (1, 1, 28, 28), build_dir + "/end2end_zynqbuild_tfc_w1a1_export.onnx"
+    )
+
+
+def test_end2end_zynqbuild_tfc_w1a1_import_and_tidy():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_export.onnx"
+    )
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+    model = model.transform(RemoveStaticGraphInputs())
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_tidy.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_streamline():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_tidy.onnx"
+    )
+    model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_streamlined.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_convert_to_hls_layers():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_streamlined.onnx"
+    )
+    model = model.transform(ConvertBipolarMatMulToXnorPopcount())
+    model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
+    model = model.transform(absorb.AbsorbMulIntoMultiThreshold())
+    model = model.transform(RoundAndClipThresholds())
+    model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
+    model = model.transform(InferDataLayouts())
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_hls_layers.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_hls_layers.onnx"
+    )
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_model.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_fold():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_model.onnx"
+    )
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
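+    # ram_style selects the memory primitive for the layer weights: "block"
+    # maps to BRAM, "distributed" to LUTRAM, "auto" lets the tool decide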
+    config = [
+        (16, 49, 16, 64, "block"),
+        (8, 8, 64, 64, "auto"),
+        (8, 8, 64, 64, "auto"),
+        (10, 8, 64, 10, "distributed"),
+    ]
+    for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("inFIFODepth", ififo)
+        fcl_inst.set_nodeattr("outFIFODepth", ofifo)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_folded.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_zynqbuild_tfc_w1a1_build():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_folded.onnx"
+    )
+    model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns))
+    model = model.transform(AnnotateResources("synth"))
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_build.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_deploy_on_pynq():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_build.onnx"
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_zynqbuild_tfc_w1a1_run_on_pynq():
+    # use the streamlined model as the reference ("golden") model
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
+    input_tensor = onnx.load_tensor_from_string(raw_i)
+    x = nph.to_array(input_tensor)
+    # x = np.zeros(ishape, dtype=np.float32)
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up the parent+child graph to test; the model from the previous step
+    # serves as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w1a1_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # link the deployed child model into the parent graph for remote execution
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(
+            build_dir + "/end2end_zynqbuild_tfc_w1a1_pynq_deploy.onnx"
+        )
+        sdp_node.set_nodeattr(
+            "model", build_dir + "/end2end_zynqbuild_tfc_w1a1_pynq_deploy.onnx"
+        )
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py b/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b28090854adbbcb6f400f73c2b6f6557f540e5e
--- /dev/null
+++ b/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py
@@ -0,0 +1,213 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+from pkgutil import get_data
+
+import pytest
+
+import numpy as np
+
+# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
+# import pytorch before onnx, so we make sure to import onnx first
+import onnx  # NOQA
+import onnx.numpy_helper as nph
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.core.onnx_exec import execute_onnx
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline import Streamline
+from finn.util.basic import pynq_part_map
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
+mem_mode = "decoupled"
+
+
+def test_end2end_zynqbuild_tfc_w2a2_export():
+    import brevitas.onnx as bo
+
+    tfc = get_test_model_trained("TFC", 2, 2)
+    bo.export_finn_onnx(
+        tfc, (1, 1, 28, 28), build_dir + "/end2end_zynqbuild_tfc_w2a2_export.onnx"
+    )
+
+
+def test_end2end_zynqbuild_tfc_w2a2_import_and_tidy():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_export.onnx"
+    )
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+    model = model.transform(RemoveStaticGraphInputs())
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_tidy.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_streamline():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_tidy.onnx"
+    )
+    model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_streamlined.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_convert_to_hls_layers():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_streamlined.onnx"
+    )
+    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_hls_layers.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_hls_layers.onnx"
+    )
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_model.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_fold():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_model.onnx"
+    )
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
+    config = [
+        (16, 49, 16, 64, "block"),
+        (8, 8, 64, 64, "auto"),
+        (8, 8, 64, 64, "auto"),
+        (10, 8, 64, 10, "distributed"),
+    ]
+    for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("inFIFODepth", ififo)
+        fcl_inst.set_nodeattr("outFIFODepth", ofifo)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_folded.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_zynqbuild_tfc_w2a2_build():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_folded.onnx"
+    )
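+    # ZynqBuild runs the full hardware flow (IP generation, stitching and
+    # Vivado synthesis) for the target board at the requested clock period,
+    # hence the slow/vivado marks on this test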
+    model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns))
+    model = model.transform(AnnotateResources("synth"))
+    model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_build.onnx")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_deploy_on_pynq():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_build.onnx"
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_zynqbuild_tfc_w2a2_run_on_pynq():
+    # use the streamlined model as the reference ("golden") model
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
+    input_tensor = onnx.load_tensor_from_string(raw_i)
+    x = nph.to_array(input_tensor)
+    # x = np.zeros(ishape, dtype=np.float32)
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up the parent+child graph to test; the model from the previous step
+    # serves as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_zynqbuild_tfc_w2a2_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # link the deployed child model into the parent graph for remote execution
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(
+            build_dir + "/end2end_zynqbuild_tfc_w2a2_pynq_deploy.onnx"
+        )
+        sdp_node.set_nodeattr(
+            "model", build_dir + "/end2end_zynqbuild_tfc_w2a2_pynq_deploy.onnx"
+        )
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09c64a1250f78604c1a0a362cf234712de2cf57
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
@@ -0,0 +1,115 @@
+import pytest
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+import numpy as np
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+def make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape):
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape)
+    p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape)
+
+    model = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[inp],
+            outputs=[outp],
+            value_info=[p0],
+            nodes=[helper.make_node(onnx_op_name, ["inp", "p0"], ["outp"])],
+        )
+    )
+
+    model = ModelWrapper(model)
+    model.set_initializer("p0", gen_finn_dt_tensor(pdt, pshape))
+    model.set_tensor_datatype("inp", idt)
+    model.transform(InferDataLayouts(), make_deepcopy=False)
+    model.transform(InferShapes(), make_deepcopy=False)
+    return model
+
+
+# parameter datatype
+@pytest.mark.parametrize("pdt", [DataType.BIPOLAR, DataType.UINT4, DataType.INT2])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT32, DataType.UINT4, DataType.INT4])
+# function
+@pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"])
+# vector parameter or scalar parameter (broadcast)
+@pytest.mark.parametrize("scalar_param", [True, False])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_convert_to_hls_channelwise_layer(
+    pdt, idt, onnx_op_name, scalar_param, exec_mode
+):
+    ifm_ch = 16
+    ifm_dim = 5
+    ishape = (1, ifm_ch, ifm_dim, ifm_dim)
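+    # a (1,) parameter exercises scalar broadcast; (1, ifm_ch, 1, 1) gives one
+    # parameter per channel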
+    if scalar_param:
+        pshape = (1,)
+    else:
+        pshape = (1, ifm_ch, 1, 1)
+
+    np.random.seed(0)
+    model = make_single_channelwise_modelwrapper(
+        onnx_op_name, ishape, idt, pdt, pshape
+    )
+
+    # Since there are no DataTypes with a bit width that is not a power of 2,
+    # there are cases where the input won't use its full range.
+    if idt == DataType.INT32:
+        x = gen_finn_dt_tensor(DataType.INT16, (1, ifm_ch, ifm_dim, ifm_dim))
+    elif idt == DataType.UINT32:
+        x = gen_finn_dt_tensor(DataType.UINT16, (1, ifm_ch, ifm_dim, ifm_dim))
+    else:
+        x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
+
+    input_dict = prepare_inputs(x)
+    y_expected = oxe.execute_onnx(model, input_dict)["outp"]
+
+    new_model = model.transform(to_hls.InferChannelwiseLinearLayer())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+        new_model = new_model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(ReplaceVerilogRelPaths())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    ctx_produced = oxe.execute_onnx(
+        new_model, input_dict, return_full_exec_context=True
+    )
+    y_produced = ctx_produced["outp"]
+
+    assert (y_produced == y_expected).all()
+    assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch"
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index ee65326ec57fb7fa7fa0490a8980dbabb8efc13c..9be9c904b0be0a8c1ab2421590922ae6cf2e1295 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -5,10 +5,15 @@ import pytest
 from finn.core.datatype import DataType
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
-from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames
 from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.util.basic import gen_finn_dt_tensor
@@ -17,47 +22,49 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
+# conv_config: (kernel_size, stride, pad)
 
-@pytest.mark.parametrize("padding", [True, False])
-@pytest.mark.parametrize("kernel_size", [3, 5])
+
+@pytest.mark.parametrize(
+    "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)]
+)
+@pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_conv_layer(padding, kernel_size):
-
-    assert (
-        kernel_size % 2 != 0
-    ), """test_convert_to_hls_conv_layer test only
-    supports odd kernel_size"""
-
+def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
+    kernel_size, stride, pad = conv_config
     np.random.seed(0)
-    padding = True
     idt = DataType.UINT4
 
     in_feature_dim = 7
-    in_chn = 3
+    in_chn = 16
 
-    stages = 1  # just one convolution
+    if depthwise:
+        group = out_chn = in_chn
+        conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
+    else:
+        group = 1
+        out_chn = 20
+        conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size]
 
-    out_feature_dim = (
-        in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages
-    )
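+    # standard conv output size: floor((in + 2*pad - k) / stride) + 1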
+    out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad)
 
     input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
-    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+    output_shape = [1, out_chn, out_feature_dim, out_feature_dim]
 
-    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+    conv_weight_dt = DataType.UINT4
 
     conv_config = {}
     conv_config["dilations"] = [1, 1]
-    conv_config["group"] = 1
+    conv_config["group"] = group
     conv_config["kernel_shape"] = [kernel_size, kernel_size]
-    if padding:
-        pad = kernel_size // 2
-        conv_config["pads"] = [pad, pad, pad, pad]
-    else:
-        conv_config["pads"] = [0, 0, 0, 0]
-    conv_config["strides"] = [1, 1]
+    conv_config["pads"] = [pad, pad, pad, pad]
+    conv_config["strides"] = [stride, stride]
 
     top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
     top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
@@ -80,27 +87,69 @@ def test_convert_to_hls_conv_layer(padding, kernel_size):
     model = ModelWrapper(modelproto)
     model.set_tensor_datatype("top_in", idt)
     model.set_tensor_datatype("top_out", idt)
-    model.set_tensor_datatype("p1", DataType.UINT4)
+    model.set_tensor_datatype("p1", conv_weight_dt)
+    model.set_initializer("p1", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape))
 
     model = model.transform(InferShapes())
-    model.set_initializer(
-        "p1", np.round(np.random.rand(*conv_param_shape).astype(np.float32) * 16)
-    )
-
-    model.set_tensor_datatype(model.graph.input[0].name, idt)
-    model = model.transform(InferShapes())
-    model = model.transform(InferDataLayouts())
-    model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
 
     new_model = model.transform(LowerConvsToMatMul())
     new_model = new_model.transform(to_hls.InferConvInpGen())
-
-    new_model = new_model.transform(PrepareCppSim())
-    new_model = new_model.transform(CompileCppSim())
-    new_model = new_model.transform(SetExecMode("cppsim"))
+    if depthwise:
+        new_model = new_model.transform(to_hls.InferVVAU())
+    else:
+        new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
+        fc_node = new_model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+        fc_inst = getCustomOp(fc_node)
+        mw = fc_inst.get_nodeattr("MW")
+        mh = fc_inst.get_nodeattr("MH")
+        pe_cands = list(filter(lambda x: mh % x == 0, range(2, mh + 1)))
+        simd_cands = list(filter(lambda x: mw % x == 0, range(2, mw + 1)))
+        fc_inst.set_nodeattr("PE", pe_cands[0])
+        fc_inst.set_nodeattr("SIMD", simd_cands[0])
+
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    new_model = new_model.transform(InferShapes())
+    new_model = new_model.transform(InferDataTypes())
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+        new_model = new_model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(ReplaceVerilogRelPaths())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
 
     x = gen_finn_dt_tensor(idt, input_shape)
     inp_dict = {model.graph.input[0].name: x}
     assert oxe.compare_execution(model, new_model, inp_dict)
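+    # a 1x1 kernel with stride > 1 and no padding needs no sliding-window
+    # generator, so InferConvInpGen lowers it to a DownSampler node instead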
+    if kernel_size == 1 and stride > 1 and pad == 0:
+        assert new_model.graph.node[1].op_type == "DownSampler"
+        if exec_mode == "rtlsim":
+            node = new_model.get_nodes_by_op_type("DownSampler")[0]
+            inst = getCustomOp(node)
+            cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+            exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
+            exp_cycles = exp_cycles_dict[node.name]
+            assert np.isclose(exp_cycles, cycles_rtlsim, atol=11)
+            assert exp_cycles != 0
+
+    if pad == 1:
+        padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        padding_inst = getCustomOp(padding_node)
+        assert padding_inst.get_nodeattr("SIMD") == in_chn
+
+    if depthwise and exec_mode == "rtlsim":
+        node = new_model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=11)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index 48803c9614f53a3a149c6eaac4289d10086513a5..20e3ee08d7ffdd013a89d26bb71d86ccc554a5b4 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -51,7 +51,7 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.custom_op.registry import getCustomOp
 
-export_onnx_path_cnv = "test_output_cnv.onnx"
+export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx"
 
 
 @pytest.mark.vivado
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
index e261a3114853bf24bdb4c931c46ff92eea4150dd..d77065ad9396d0cc8dd57a39ed823fffcb30ee47 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
@@ -52,8 +52,7 @@ from finn.transformation.streamline.round_thresholds import RoundAndClipThreshol
 from finn.util.test import get_test_model_trained
 
 
-export_onnx_path = "test_output_tfc.onnx"
-export_onnx_path_cnv = "test_output_cnv.onnx"
+export_onnx_path = "test_convert_to_hls_layers_fc.onnx"
 
 
 @pytest.mark.vivado
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d861929f3d421c431a27ccac5d513938aa7d726
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -0,0 +1,232 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import numpy as np
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    SortGraph,
+)
+from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.util.basic import gen_finn_dt_tensor
+from finn.util.test import soft_verify_topk
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+from finn.transformation.insert_topk import InsertTopK
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.streamline.absorb import (
+    AbsorbScalarMulIntoTopK,
+    AbsorbConsecutiveTransposes,
+)
+from finn.transformation.streamline.collapse_repeated import (
+    CollapseRepeatedMul,
+    CollapseRepeatedAdd,
+)
+from finn.transformation.streamline.reorder import MoveAddPastMul
+
+import pytest
+
+export_onnx_path = "test_output_synthetic.onnx"
+
+# construct a synthetic graph to test: TopK insertion, TopK conversion to HLS
+# and Add conversion to HLS; functionally the graph reduces to a scaled sum
+
+
+def make_model(ch, ifmdim):
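+    # graph: inp -> Add(0) -> two parallel Add/Mul branches -> elementwise Add
+    #        -> GlobalAveragePool -> Reshape to (1, ch)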
+    shape = [1, ch, ifmdim, ifmdim]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    inp1_add0_ct = helper.make_tensor_value_info("inp1_add0_ct", TensorProto.FLOAT, [1])
+    inp1_add = helper.make_tensor_value_info("inp1_add", TensorProto.FLOAT, shape)
+    inp1_add_ct = helper.make_tensor_value_info("inp1_add_ct", TensorProto.FLOAT, [1])
+    inp2_add = helper.make_tensor_value_info("inp2_add", TensorProto.FLOAT, shape)
+    inp2_add_ct = helper.make_tensor_value_info("inp2_add_ct", TensorProto.FLOAT, [1])
+    inp1_mul = helper.make_tensor_value_info("inp1_mul", TensorProto.FLOAT, shape)
+    inp1_mul_ct = helper.make_tensor_value_info("inp1_mul_ct", TensorProto.FLOAT, [1])
+    inp2_mul = helper.make_tensor_value_info("inp2_mul", TensorProto.FLOAT, shape)
+    inp2_mul_ct = helper.make_tensor_value_info("inp2_mul_ct", TensorProto.FLOAT, [1])
+    eltwise_add = helper.make_tensor_value_info("eltwise_add", TensorProto.FLOAT, shape)
+    pool = helper.make_tensor_value_info("pool", TensorProto.FLOAT, [1, ch, 1, 1])
+    reshape_ct = helper.make_tensor_value_info("reshape_ct", TensorProto.INT64, [2])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch])
+
+    add0_node = helper.make_node("Add", [inp.name, inp1_add0_ct.name], ["out_add0"])
+    add1_node = helper.make_node("Add", ["out_add0", inp1_add_ct.name], [inp1_add.name])
+    add2_node = helper.make_node("Add", ["out_add0", inp2_add_ct.name], [inp2_add.name])
+    mul1_node = helper.make_node(
+        "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]
+    )
+    mul2_node = helper.make_node(
+        "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]
+    )
+    eltwise_add_node = helper.make_node(
+        "Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name]
+    )
+    globalavgpool_node = helper.make_node(
+        "GlobalAveragePool", [eltwise_add.name], [pool.name]
+    )
+    reshape_node = helper.make_node(
+        "Reshape", [pool.name, reshape_ct.name], [outp.name]
+    )
+
+    graph = helper.make_graph(
+        nodes=[
+            add0_node,
+            add1_node,
+            add2_node,
+            mul1_node,
+            mul2_node,
+            eltwise_add_node,
+            globalavgpool_node,
+            reshape_node,
+        ],
+        name="graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="add-model")
+    model = ModelWrapper(model)
+
+    # set initializers for scalar add/mul nodes
+    model.set_initializer(add0_node.input[1], np.array([0.0]))
+    model.set_initializer(add1_node.input[1], np.array([7.0]))
+    model.set_initializer(add2_node.input[1], np.array([8.0]))
+    model.set_initializer(mul1_node.input[1], np.array([2.0]))
+    model.set_initializer(mul2_node.input[1], np.array([2.0]))
+    model.set_initializer(reshape_node.input[1], np.array([1, -1]))
+
+    return model
+
+
+# data types
+@pytest.mark.parametrize("idt", [DataType.UINT2])
+# channels
+@pytest.mark.parametrize("ch", [16])
+# ifmdim
+@pytest.mark.parametrize("ifmdim", [5])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt):
+    model = make_model(ch, ifmdim)
+    model.save(export_onnx_path)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataLayouts())
+    # model.save("golden.onnx")
+    # generate test vectors of correct shape
+    if ifmdim == -1:
+        input_tensor_shape = (1, ch)
+    else:
+        input_tensor_shape = (1, ch, ifmdim, ifmdim)
+
+    x = gen_finn_dt_tensor(idt, input_tensor_shape)
+
+    # generate expected value from streamlined net
+    input_dict = {model.graph.input[0].name: x}
+
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    produced_sum = output_dict[model.graph.output[0].name]
+    chw_mul = 1
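+    # the branches compute (x + 7) * 2 and (x + 8) * 2, summing to 2 * (2x + 15);
+    # GlobalAveragePool then averages over the ifmdim x ifmdim spatial dims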
+    expected_sum = chw_mul * np.sum(2 * (2 * x + 15.0), axis=(2, 3)) / (ifmdim * ifmdim)
+    assert (produced_sum.flatten() == expected_sum.flatten()).all()
+
+    model = model.transform(InferDataLayouts())
+
+    # convert to hls
+    model.set_tensor_datatype(model.graph.input[0].name, idt)
+    # extra streamlining
+    model = model.transform(MoveScalarLinearPastInvariants())
+    model = model.transform(MoveAddPastMul())
+    model = model.transform(CollapseRepeatedMul())
+    model = model.transform(CollapseRepeatedAdd())
+    # insert top-k node, which should absorb linear ops before it
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(InferDataTypes())
+
+    model = model.transform(to_hls.InferChannelwiseLinearLayer())
+    model = model.transform(to_hls.InferAddStreamsLayer())
+    model = model.transform(to_hls.InferGlobalAccPoolLayer())
+    model = model.transform(MoveScalarLinearPastInvariants())
+    model = model.transform(InsertTopK())
+    model = model.transform(AbsorbScalarMulIntoTopK())
+    model = model.transform(InferDataTypes())
+    model = model.transform(to_hls.InferLabelSelectLayer())
+    model = model.transform(AbsorbConsecutiveTransposes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(to_hls.InferLabelSelectLayer())
+    model = model.transform(to_hls.InferDuplicateStreamsLayer())
+
+    model = model.transform(SortGraph())
+
+    # model.save("golden_hls.onnx")
+    # check topology status
+
+    finn_nodes = model.get_finn_nodes()
+    assert len(finn_nodes) == 9
+    add_nodes = model.get_nodes_by_op_type("AddStreams_Batch")
+    assert len(add_nodes) == 1
+    pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_Batch")
+    assert len(pool_nodes) == 1
+    label_nodes = model.get_nodes_by_op_type("LabelSelect_Batch")
+    assert len(label_nodes) == 1
+    channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_Batch")
+    assert len(channelwise_nodes) == 5
+    dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_Batch")
+    assert len(dup_nodes) == 1
+
+    model = model.transform(PrepareCppSim())
+    model = model.transform(CompileCppSim())
+    model = model.transform(SetExecMode("cppsim"))
+
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    produced_topk_hls = output_dict[model.graph.output[0].name]
+    topk_input = output_dict[model.graph.node[-1].input[0]]
+    assert soft_verify_topk(topk_input, produced_topk_hls, 5)
+
+    os.remove(export_onnx_path)
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..86409feffd120b1baeeee471415e93f29d9e655a
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -0,0 +1,221 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from onnx import TensorProto, helper
+import numpy as np
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.custom_op.registry import getCustomOp
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+
+
+def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt):
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]
+    )
+
+    mp_node = helper.make_node(
+        "MaxPool",
+        ["inp"],
+        ["outp"],
+        kernel_shape=[k, k],
+        pads=[pad, pad, pad, pad],
+        strides=[stride, stride],
+    )
+    graph = helper.make_graph(
+        nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="mp-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+    model = model.transform(InferShapes())
+
+    return model
+
+
+def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt):
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]
+    )
+
+    mp_node = helper.make_node(
+        "QuantAvgPool2d",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        stride=stride,
+        kernel=k,
+        ibits=idt.bitwidth(),
+        obits=odt.bitwidth(),
+        signed=1 if idt.signed() else 0,
+        data_layout="NCHW",
+    )
+    graph = helper.make_graph(
+        nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="mp-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+    model = model.transform(InferShapes())
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4, DataType.INT8])
+# output datatype
+@pytest.mark.parametrize("odt", [DataType.UINT4, DataType.INT4])
+# pool configuration: (k, stride, pad, ifm_dim)
+@pytest.mark.parametrize("pool_config", [(7, 7, 0, 7), (3, 2, 1, 5)])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [1, 4])
+# number of output channels computed in parallel
+@pytest.mark.parametrize("pe", [1, 2, 4])
+# pool type
+@pytest.mark.parametrize("op_type", ["QuantAvgPool2d", "MaxPool"])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_convert_to_hls_pool_batch(
+    idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode
+):
+    k, stride, pad, ifm_dim = pool_config
+
+    if ifm_ch % pe != 0:
+        pytest.skip("ifm_ch%pe != 0. Skipping")
+
+    if pad != 0 and idt.signed():
+        pytest.skip("No support for pal_val != 0. Skipping")
+
+    np.random.seed(0)
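+    # standard sliding-window output size: ofm_dim = (ifm_dim + 2*pad - k) / stride + 1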
+    ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1)
+
+    x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
+    # prepare input data
+    input_dict = prepare_inputs(x)
+    if op_type == "MaxPool":
+        # if idt.signed():
+        #     pytest.skip("""No support for signed input (see accu initialization
+        #         in Pool_batch HLSLIB function). Skipping""")
+
+        if idt != odt:
+            pytest.skip("Skipping Maxpool with idt != odt")
+
+        model = make_single_maxpool_modelwrapper(
+            k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt
+        )
+    elif op_type == "QuantAvgPool2d":
+        if pad != 0:
+            pytest.skip("No padding support for QuantAvgPool2d. Skipping")
+
+        if idt.signed() != odt.signed():
+            pytest.skip("Skipping QuantAvgPool2d with idt.signed() != odt.signed()")
+        model = make_single_quantavpool_modelwrapper(
+            k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt
+        )
+    else:
+        assert False, "{} is not a supported op_type".format(op_type)
+
+    y_expected = oxe.execute_onnx(model, input_dict)["outp"]
+
+    new_model = model.transform(to_hls.InferPool_Batch())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+
+    if ifm_ch != pe:
+        new_model = new_model.transform(to_hls.InferConvInpGen())
+        # Folding
+        for n in new_model.graph.node:
+            if n.op_type == "ConvolutionInputGenerator":
+                inst = getCustomOp(n)
+                inst.set_nodeattr("SIMD", pe)
+            elif n.op_type == "Pool_Batch":
+                inst = getCustomOp(n)
+                inst.set_nodeattr("PE", pe)
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(SetExecMode("cppsim"))
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # execute new_model
+    y_produced = oxe.execute_onnx(new_model, input_dict)["outp"]
+    assert (y_produced == y_expected).all()
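+    # expected structure: Transpose -> (FMPadding) -> SWG/Im2Col -> Pool_Batch -> Transpose;
+    # if stride > k, InferPool_Batch does not apply and the original node remains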
+    if stride <= k:
+        if pad == 0 or ifm_ch == pe:
+            assert len(new_model.graph.node) == 4
+        else:
+            assert len(new_model.graph.node) == 5
+    else:
+        assert len(new_model.graph.node) == 1
+
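+    # cross-check the analytical cycle estimate against the cycles measured in rtlsim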
+    if exec_mode == "rtlsim":
+        node = new_model.get_nodes_by_op_type("Pool_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..f530926e46ac5c116c3f15688c7f2face7954a30
--- /dev/null
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -0,0 +1,249 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import onnx.helper as oh
+from onnx import TensorProto
+import numpy as np
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.fpgadataflow.convert_to_hls_layers import (
+    InferConvInpGen,
+    InferVVAU,
+)
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+import finn.core.onnx_exec as oxe
+from finn.custom_op.im2col import compute_conv_output_dim
+from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from finn.custom_op.registry import getCustomOp
+
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+
+
+def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
+
+    # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold)
+    ofm_ch = ifm_ch
+    ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad=padding)
+
+    if act is None:
+        odt = DataType.INT32
+    else:
+        odt = act
+        out_act = oh.make_tensor_value_info(
+            "out_act", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch]
+        )
+        T = oh.make_tensor_value_info("T", TensorProto.FLOAT, [ofm_ch, 15])
+        tdt = DataType.INT32
+        thresh_node = oh.make_node(
+            "MultiThreshold",
+            domain="finn",
+            inputs=["outp", "T"],
+            outputs=["out_act"],
+            data_layout="NHWC",
+            out_dtype=odt.name,
+            out_scale=1.0,
+            out_bias=0.0,
+        )
+
+    # set up onnx model
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch]
+    )
+
+    W_sparse = oh.make_tensor_value_info(
+        "W_sparse", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch]
+    )
+
+    im2col_node = oh.make_node(
+        "Im2Col",
+        domain="finn",
+        inputs=["inp"],
+        outputs=["im2col_out"],
+        kernel_size=k,
+        stride=stride,
+        pad_amount=padding,
+        input_shape="(1, {}, {}, {})".format(ifm_dim, ifm_dim, ifm_ch),
+        depthwise=1,
+    )
+
+    matmul_node = oh.make_node(
+        "MatMul", inputs=["im2col_out", "W_sparse"], outputs=["outp"]
+    )
+
+    if act is None:
+        node_list = [im2col_node, matmul_node]
+        global_out = outp
+        value_info = [W_sparse]
+    else:
+        node_list = [im2col_node, matmul_node, thresh_node]
+        global_out = out_act
+        value_info = [W_sparse, T]
+
+    graph = oh.make_graph(
+        nodes=node_list,
+        name="lowered_dw_cnv_graph",
+        inputs=[inp],
+        outputs=[global_out],
+        value_info=value_info,
+    )
+    model = oh.make_model(graph, producer_name="lowered_dw_cnv-model")
+    model = ModelWrapper(model)
+
+    # initialize model
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype(model.graph.output[0].name, odt)
+    model.set_tensor_datatype("W_sparse", wdt)
+
+    w_tensor = gen_finn_dt_tensor(wdt, [ofm_ch, 1, k, k])
+    # create sparse matrix
+    W_matrix = np.zeros((ofm_ch, ifm_ch, k, k))
+    for ch in range(ifm_ch):
+        W_matrix[ch][ch] = w_tensor[ch][0]
+    W_matrix = W_matrix.astype(np.float32)
+    W_matrix = W_matrix.transpose(0, 2, 3, 1)
+    W_matrix = W_matrix.reshape(ofm_ch, ifm_ch * k * k)
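+    # after transpose+reshape the columns match the (k, k, ifm_ch) Im2Col output order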
+
+    model.set_initializer("W_sparse", W_matrix.T)
+    sparsity = {"dw": {"kernel_shape": k}}
+    model.set_tensor_sparsity("W_sparse", sparsity)
+
+    if act is not None:
+        (min_val, max_val) = calculate_signed_dot_prod_range(idt, wdt, ifm_ch * k * k)
+        n_steps = odt.get_num_possible_values() - 1
+        T_values = np.random.randint(
+            min_val, max_val - 1, (ofm_ch, n_steps)
+        ).astype(np.float32)
+        # provide non-decreasing thresholds
+        T_values = np.sort(T_values, axis=1)
+        model.set_initializer("T", T_values)
+        model.set_tensor_datatype("T", tdt)
+
+    model = model.transform(InferShapes())
+
+    return model
+
+
+# PE
+@pytest.mark.parametrize("pe", [1, 2, 4])
+# Output activation
+@pytest.mark.parametrize("act", [None, DataType.UINT4])
+# kernel size
+@pytest.mark.parametrize("k", [2, 4])
+# stride
+@pytest.mark.parametrize("stride", [1, 2])
+# padding
+@pytest.mark.parametrize("padding", [0, 1])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
+    idt = wdt = DataType.INT4
+    ifm_dim = 6
+    ifm_ch = 4
+
+    # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold)
+    model = set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding)
+
+    input_tensor = gen_finn_dt_tensor(idt, [1, ifm_dim, ifm_dim, ifm_ch])
+    input_dict = {"inp": input_tensor}
+
+    new_model = model.transform(InferConvInpGen())
+    new_model = new_model.transform(InferVVAU())
+
+    # set SIMD in ConvInputGen node and PE in VVAU node
+    for n in new_model.graph.node:
+        if n.op_type == "ConvolutionInputGenerator":
+            convinputgen_node = getCustomOp(n)
+            convinputgen_node.set_nodeattr("SIMD", pe)
+        elif n.op_type == "Vector_Vector_Activate_Batch":
+            vvau_node = getCustomOp(n)
+            vvau_node.set_nodeattr("PE", pe)
+    new_model = new_model.transform(SetExecMode("cppsim"))
+    new_model = new_model.transform(PrepareCppSim())
+    new_model = new_model.transform(CompileCppSim())
+
+    assert oxe.compare_execution(model, new_model, input_dict)
+
+
+# PE
+@pytest.mark.parametrize("pe", [1, 2, 4])
+# Output activation
+@pytest.mark.parametrize("act", [None, DataType.UINT4])
+# kernel size
+@pytest.mark.parametrize("k", [2, 4])
+# stride
+@pytest.mark.parametrize("stride", [1, 2])
+# padding
+@pytest.mark.parametrize("padding", [0, 1])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding):
+    idt = wdt = DataType.INT4
+    ifm_dim = 6
+    ifm_ch = 4
+
+    # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold)
+    model = set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding)
+
+    input_tensor = gen_finn_dt_tensor(idt, [1, ifm_dim, ifm_dim, ifm_ch])
+    input_dict = {"inp": input_tensor}
+
+    new_model = model.transform(InferConvInpGen())
+    new_model = new_model.transform(InferVVAU())
+
+    # set SIMD in ConvInputGen node and PE in VVAU node
+    for n in new_model.graph.node:
+        if n.op_type == "ConvolutionInputGenerator":
+            convinputgen_node = getCustomOp(n)
+            convinputgen_node.set_nodeattr("SIMD", pe)
+        elif n.op_type == "Vector_Vector_Activate_Batch":
+            vvau_node = getCustomOp(n)
+            vvau_node.set_nodeattr("PE", pe)
+
+    new_model = new_model.transform(SetExecMode("rtlsim"))
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+    new_model = new_model.transform(HLSSynthIP())
+    new_model = new_model.transform(ReplaceVerilogRelPaths())
+    new_model = new_model.transform(PrepareRTLSim())
+
+    assert oxe.compare_execution(model, new_model, input_dict)
diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
index f94784457a43718516e76946269fc47119423b24..81456796a75c6bf6a01c0a1f83c38b0b39bf4c81 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
@@ -44,6 +45,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_addstreams_modelwrapper(ch, pe, idt):
@@ -125,3 +128,12 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode):
     y_produced = y_produced.reshape(y_expected.shape)
 
     assert (y_produced == y_expected).all(), exec_mode + " failed"
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("AddStreams_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..23ce8314e9c45196d7311ac58cb6bb5ef5267220
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+
+
+def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
+    NumChannels = C.shape[0]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, vecs + [NumChannels])
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, vecs + [NumChannels]
+    )
+
+    node_inp_list = ["inp", "const"]
+
+    node = helper.make_node(
+        "ChannelwiseOp_Batch",
+        node_inp_list,
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        NumChannels=NumChannels,
+        Func=func,
+        PE=pe,
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+        paramDataType=pdt.name,
+        numInputVectors=vecs,
+    )
+    graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp])
+
+    model = helper.make_model(graph, producer_name="model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    model.set_tensor_datatype("const", idt)
+    model.set_initializer("const", C)
+    return model
+
+
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType.INT8])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT4])
+# param datatype
+@pytest.mark.parametrize("pdt", [DataType.INT4])
+# folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1, 2])
+# number of input features
+@pytest.mark.parametrize("ich", [16])
+# vecs
+@pytest.mark.parametrize("vecs", [[1], [1, 7, 7]])
+# function
+@pytest.mark.parametrize("func", ["add", "mul"])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_mode):
+    if nf == -1:
+        nf = ich
+    pe = ich // nf
+    assert ich % pe == 0
+
+    # generate input and param data
+    x = gen_finn_dt_tensor(idt, tuple(vecs + [ich]))
+    # C = np.random.randint(idt.min(), idt.max() + 1, ich).astype(np.float32)
+    C = gen_finn_dt_tensor(pdt, (ich,))
+
+    odt = act
+
+    model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs)
+
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # package input data as dictionary
+    input_dict = {"inp": x}
+
+    oshape = model.get_tensor_shape("outp")
+
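+    # compute reference output by broadcasting the per-channel parameter across the input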
+    C_reshaped = np.broadcast_to(C.flatten(), x.shape)
+    if func == "add":
+        y = x + C_reshaped
+    elif func == "mul":
+        y = x * C_reshaped
+
+    y_expected = y.reshape(oshape)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+
+    y_produced = y_produced.reshape(y_expected.shape)
+
+    assert (y_produced == y_expected).all(), "cppsim failed"
+
+    if exec_mode == "rtlsim":
+        hls_synt_res_est = model.analysis(hls_synth_res_estimation)
+        assert "ChannelwiseOp_Batch_0" in hls_synt_res_est
+
+        node = model.get_nodes_by_op_type("ChannelwiseOp_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index b5fc85caf274edc9e7afc52df962862fa8a99ba3..020a2a545dadaf32c469789c90d0ea530688812c 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
@@ -42,6 +43,9 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+
 
 def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt):
     odt = idt
@@ -182,3 +186,12 @@ def test_fpgadataflow_slidingwindow(
         y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
         y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k)
         assert (y_produced == y_expected).all()
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 4fb84be59333ef0e696204c9064fcf77e35b5d9b..5066b9709cac922f6bd3670ec7199f3e0f8fd9a2 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -27,12 +27,15 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -44,6 +47,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_dupstreams_modelwrapper(ch, pe, idim, idt):
@@ -72,6 +77,9 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt):
 
     model.set_tensor_datatype("inp", idt)
 
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
     return model
 
 
@@ -125,3 +133,12 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode):
 
     assert (y0 == expected_y).all(), exec_mode + " failed"
     assert (y1 == expected_y).all(), exec_mode + " failed"
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index fc5cdb7745945bee99564ba9ab19423a66d8e035..37a1cc81ebd0824cdd8ac2c073298ad39424f57f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -49,6 +49,7 @@ from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -134,7 +135,7 @@ def prepare_inputs(input_tensor, idt, wdt):
 
 
-# mem_mode: const or decoupled
+# mem_mode: const, decoupled, or external
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
@@ -221,7 +222,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
 
-# mem_mode: const or decoupled
+# mem_mode: const, decoupled, or external
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
@@ -311,6 +312,14 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
     assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
 
+    node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+    inst = getCustomOp(node)
+    cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+    exp_cycles = exp_cycles_dict[node.name]
+    assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
+    assert exp_cycles != 0
+
 
 # mem_mode: const or decoupled
 @pytest.mark.parametrize("mem_mode", ["decoupled"])
@@ -329,7 +338,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [128])
 @pytest.mark.vivado
-def test_fpgadataflow_fclayer_large_depth_decoupled_mode(
+def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
     mem_mode, idt, wdt, act, nf, sf, mw, mh
 ):
     if nf == -1:
@@ -403,3 +412,11 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode(
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
     assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
+
+    node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+    inst = getCustomOp(node)
+    cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+    exp_cycles = exp_cycles_dict[node.name]
+    assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
+    assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index 94090a47ad64fc377530e6e21d35661e1d92b5a6..a0881e2c95a491c79bb86b9817fb81735eb63d81 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -99,28 +99,32 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
        input values anymore."""
     assert y.shape == tuple(Shape), """The output shape is incorrect."""
 
-    model = model.transform(ReplaceVerilogRelPaths())
-    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
-    model = model.transform(MakePYNQProject(test_pynq_board))
-    model = model.transform(SynthPYNQProject())
-    model = model.transform(MakePYNQDriver())
-    ip = os.environ["PYNQ_IP"]
-    username = os.getenv("PYNQ_USERNAME", "xilinx")
-    password = os.getenv("PYNQ_PASSWORD", "xilinx")
-    port = os.getenv("PYNQ_PORT", 22)
-    target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-    model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
-
-    res = throughput_test(model)
-    expected_dict = {}
-    expected_dict["runtime[ms]"] = []
-    expected_dict["throughput[images/s]"] = []
-    expected_dict["DRAM_in_bandwidth[Mb/s]"] = []
-    expected_dict["DRAM_out_bandwidth[Mb/s]"] = []
-    for key in expected_dict:
-        assert (
-            key in res
-        ), """Throughput test not successful, no value for {}
-        in result dictionary""".format(
-            key
-        )
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+        model = model.transform(MakePYNQProject(test_pynq_board))
+        model = model.transform(SynthPYNQProject())
+        model = model.transform(MakePYNQDriver(platform="zynq"))
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        res = throughput_test(model)
+        expected_dict = {}
+        expected_dict["runtime[ms]"] = []
+        expected_dict["throughput[images/s]"] = []
+        expected_dict["DRAM_in_bandwidth[Mb/s]"] = []
+        expected_dict["DRAM_out_bandwidth[Mb/s]"] = []
+        for key in expected_dict:
+            assert (
+                key in res
+            ), """Throughput test not successful, no value for {}
+            in result dictionary""".format(
+                key
+            )
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 9d6390b2673e5d2c0e72748183ac04ed222d078e..ef4f17998dbb09d31cdc9b3c89afafd10653fd28 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -15,6 +15,8 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 from finn.util.basic import pynq_part_map
 
@@ -23,7 +25,7 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
     assert pad_style == 2, "only pad_style == 2 supported in hlslib"
     assert padding > 0, "Output dim should be greater than input dim"
     odim = idim + padding
@@ -47,6 +49,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
         inputDataType=str(idt.name),
         PaddingStyle=pad_style,
         numInputVectors=1,
+        SIMD=simd,
     )
 
     graph = helper.make_graph(
@@ -63,11 +66,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 
 
 # input image dimension
-@pytest.mark.parametrize("idim", [8, 16])
+@pytest.mark.parametrize("idim", [8])
 # number of rows and number of cols to add
 @pytest.mark.parametrize("pad", [2, 3])
 # number of channels
-@pytest.mark.parametrize("num_ch", [1, 2])
+@pytest.mark.parametrize("num_ch", [2, 4])
+# Input parallelism
+@pytest.mark.parametrize("simd", [1, 2])
 # PaddingStyle: selects behavior when (odim-idim)%2 != 0
 @pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
@@ -76,14 +81,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
-
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
+    if num_ch % simd != 0:
+        pytest.skip(" num_ch % simd != 0, skipping")
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
     input_dict = {"inp": x}
     odim = idim + pad
 
-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style)
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
@@ -119,3 +125,12 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
     )
 
     assert (y_produced == y_expected).all()
+
+    if mode == "rtlsim":
+        node = model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
index b46391daf629e97c24c2950aefad3cbc5055c345..27f1a32a481f006818fbdd7e879bd9dd92242c80 100644
--- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
@@ -45,6 +45,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_accpool_modelwrapper(ch, pe, idim, idt):
@@ -121,3 +123,17 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode):
     expected_y = np.sum(x, axis=(1, 2)).flatten()
 
     assert (y == expected_y).all(), exec_mode + " failed"
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("GlobalAccPool_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        # commented out, needs performance debug:
+        # test_fpgadataflow_globalaccpool[rtlsim-7-1-64-DataType.UINT4]
+        # assert False where False =
+        # <function isclose at 0x7eff26d5ca60>(50, 103, atol=(0.1 * 103))
+        # assert np.isclose(exp_cycles, cycles_rtlsim, atol=0.1 * cycles_rtlsim)
+        assert exp_cycles != 0
+        assert cycles_rtlsim != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
similarity index 67%
rename from tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
rename to tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 16100522aa94fd25d234efa1d03edfdc866ca1bb..66b0ef921453e9e6fee9eb9be18cc556b2612f23 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -50,9 +50,21 @@ from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
 import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.util.basic import gen_finn_dt_tensor, pynq_part_map
+from finn.util.basic import (
+    gen_finn_dt_tensor,
+    pynq_part_map,
+    alveo_part_map,
+    alveo_default_platform,
+)
 from finn.util.fpgadataflow import pyverilate_stitched_ip
 from finn.util.test import load_test_checkpoint_or_skip
+from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.fpgadataflow.vitis_build import VitisBuild
+from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
@@ -114,7 +126,7 @@ def create_one_fc_model():
     return model
 
 
-def create_two_fc_model():
+def create_two_fc_model(mem_mode="decoupled"):
     # create a model with two StreamingFCLayer instances
     wdt = DataType.INT2
     idt = DataType.INT32
@@ -147,7 +159,7 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
-        mem_mode="decoupled",
+        mem_mode=mem_mode,
     )
 
     fc1 = helper.make_node(
@@ -167,7 +179,7 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
-        mem_mode="decoupled",
+        mem_mode=mem_mode,
     )
 
     graph = helper.make_graph(
@@ -242,35 +254,35 @@ def test_fpgadataflow_ipstitch_rtlsim():
     model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
     sim = pyverilate_stitched_ip(model)
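+    # expected top-level ports of the stitched IP: AXI-Stream in/out plus AXI-lite control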
     exp_io = [
-        "ap_clk_0",
-        "ap_rst_n_0",
-        "in0_V_V_0_tdata",
-        "in0_V_V_0_tready",
-        "in0_V_V_0_tvalid",
-        "out_r_0_tdata",
-        "out_r_0_tkeep",
-        "out_r_0_tlast",
-        "out_r_0_tready",
-        "out_r_0_tvalid",
-        "s_axi_control_0_araddr",
-        "s_axi_control_0_arready",
-        "s_axi_control_0_arvalid",
-        "s_axi_control_0_awaddr",
-        "s_axi_control_0_awready",
-        "s_axi_control_0_awvalid",
-        "s_axi_control_0_bready",
-        "s_axi_control_0_bresp",
-        "s_axi_control_0_bvalid",
-        "s_axi_control_0_rdata",
-        "s_axi_control_0_rready",
-        "s_axi_control_0_rresp",
-        "s_axi_control_0_rvalid",
-        "s_axi_control_0_wdata",
-        "s_axi_control_0_wready",
-        "s_axi_control_0_wstrb",
-        "s_axi_control_0_wvalid",
+        "ap_clk",
+        "ap_rst_n",
+        "s_axis_0_tdata",
+        "s_axis_0_tready",
+        "s_axis_0_tvalid",
+        "m_axis_0_tdata",
+        "m_axis_0_tkeep",
+        "m_axis_0_tlast",
+        "m_axis_0_tready",
+        "m_axis_0_tvalid",
+        "s_axi_control_araddr",
+        "s_axi_control_arready",
+        "s_axi_control_arvalid",
+        "s_axi_control_awaddr",
+        "s_axi_control_awready",
+        "s_axi_control_awvalid",
+        "s_axi_control_bready",
+        "s_axi_control_bresp",
+        "s_axi_control_bvalid",
+        "s_axi_control_rdata",
+        "s_axi_control_rready",
+        "s_axi_control_rresp",
+        "s_axi_control_rvalid",
+        "s_axi_control_wdata",
+        "s_axi_control_wready",
+        "s_axi_control_wstrb",
+        "s_axi_control_wvalid",
     ]
-    assert dir(sim.io) == exp_io
+    assert sorted(dir(sim.io)) == sorted(exp_io)
     model.set_metadata_prop("exec_mode", "rtlsim")
     idt = model.get_tensor_datatype("inp")
     ishape = model.get_tensor_shape("inp")
@@ -281,6 +293,27 @@ def test_fpgadataflow_ipstitch_rtlsim():
     assert (rtlsim_res == x).all()
 
 
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_ipstitch_synth_ooc():
+    model = load_test_checkpoint_or_skip(
+        ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx"
+    )
+    model = model.transform(SynthOutOfContext(test_fpga_part, 5))
+    ret = model.get_metadata_prop("res_total_ooc_synth")
+    assert ret is not None
+    # example expected output (details may differ based on Vivado version etc.):
+    # "{'vivado_proj_folder': ...,
+    # 'LUT': 708.0, 'FF': 1516.0, 'DSP': 0.0, 'BRAM': 0.0, 'WNS': 0.152, '': 0,
+    # 'fmax_mhz': 206.27062706270627}"
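+    # metadata props are stored as strings, so parse the dict repr back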
+    ret = eval(ret)
+    assert ret["LUT"] > 0
+    assert ret["FF"] > 0
+    assert ret["DSP"] == 0
+    assert ret["BRAM"] == 0
+    assert ret["fmax_mhz"] > 100
+
+
 @pytest.mark.vivado
 def test_fpgadataflow_ipstitch_pynq_projgen():
     model = load_test_checkpoint_or_skip(
@@ -310,7 +343,7 @@ def test_fpgadataflow_ipstitch_pynq_driver():
     model = load_test_checkpoint_or_skip(
         ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_synth.onnx"
     )
-    model = model.transform(MakePYNQDriver())
+    model = model.transform(MakePYNQDriver(platform="zynq"))
     driver_dir = model.get_metadata_prop("pynq_driver_dir")
     assert driver_dir is not None
     assert os.path.isdir(driver_dir)
@@ -368,3 +401,87 @@ def test_fpgadataflow_ipstitch_remote_execution():
         assert np.isclose(outp["outp"], x).all()
     except KeyError:
         pytest.skip("PYNQ board IP address not specified")
+
+
+def test_fpgadataflow_ipstitch_iodma_floorplan():
+    model = create_one_fc_model()
+    if model.graph.node[0].op_type == "StreamingDataflowPartition":
+        sdp_node = getCustomOp(model.graph.node[0])
+        assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
+        assert os.path.isfile(sdp_node.get_nodeattr("model"))
+        model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
+    model = model.transform(InferDataLayouts())
+    model = model.transform(InsertIODMA())
+    model = model.transform(Floorplan())
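+    # the FC layer and the two inserted IODMAs should land in distinct partitions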
+    assert getCustomOp(model.graph.node[0]).get_nodeattr("partition_id") == 0
+    assert getCustomOp(model.graph.node[1]).get_nodeattr("partition_id") == 2
+    assert getCustomOp(model.graph.node[2]).get_nodeattr("partition_id") == 1
+    model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_iodma_floorplan.onnx")
+
+
+# board
+@pytest.mark.parametrize("board", ["U250"])
+# clock period
+@pytest.mark.parametrize("period_ns", [5])
+# override mem_mode to external
+@pytest.mark.parametrize("extw", [True, False])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.vitis
+def test_fpgadataflow_ipstitch_vitis(board, period_ns, extw):
+    if "VITIS_PATH" not in os.environ:
+        pytest.skip("VITIS_PATH not set")
+    platform = alveo_default_platform[board]
+    fpga_part = alveo_part_map[board]
+    model = create_two_fc_model("external" if extw else "decoupled")
+    if model.graph.node[0].op_type == "StreamingDataflowPartition":
+        sdp_node = getCustomOp(model.graph.node[0])
+        assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
+        assert os.path.isfile(sdp_node.get_nodeattr("model"))
+        model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
+    model = model.transform(VitisBuild(fpga_part, period_ns, platform))
+    model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx")
+
+
+# board
+@pytest.mark.parametrize("board", ["Pynq-Z1"])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_ipstitch_zynqbuild(board):
+    model = create_two_fc_model()
+    if model.graph.node[0].op_type == "StreamingDataflowPartition":
+        sdp_node = getCustomOp(model.graph.node[0])
+        assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
+        assert os.path.isfile(sdp_node.get_nodeattr("model"))
+        model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
+    # generate inputs for remote exec
+    iname = "inp"
+    idt = model.get_tensor_datatype(iname)
+    ishape = model.get_tensor_shape(iname)
+    x = gen_finn_dt_tensor(idt, ishape)
+    # bitfile using ZynqBuild
+    model = model.transform(ZynqBuild(board, 10))
+    model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_customzynq.onnx")
+
+    bitfile_name = model.get_metadata_prop("vivado_pynq_bitfile")
+    assert bitfile_name is not None
+    assert os.path.isfile(bitfile_name)
+    # deployment
+    try:
+        ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
+        assert deployment_dir is not None
+        assert os.path.isdir(deployment_dir)
+        # remote exec
+        input_dict = {"global_in": x}
+        outp = execute_onnx(model, input_dict)
+        assert np.isclose(outp["global_out"], x).all()
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index 2df841728395229dafe33d2804c44a3489ef3e45..9bc77cd47fd6115823f9a35d98e8874ee3f98b2d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
@@ -70,7 +71,8 @@ def make_labelselect_modelwrapper(labels, pe, k, idt):
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
-    model.set_tensor_datatype("outp", DataType.UINT32)
+    odt = DataType.get_smallest_possible(labels - 1)
+    model.set_tensor_datatype("outp", odt)
 
     return model
 
@@ -79,19 +81,18 @@ def prepare_inputs(input_tensor, idt):
     return {"inp": input_tensor}
 
 
-# TODO: folded inputs fail, likely problem in hlslib
-# input datatype -- checked by assertion in HLSCustomOp
-@pytest.mark.parametrize("idt", [DataType.UINT8, DataType.UINT16])
+@pytest.mark.parametrize("idt", [DataType.UINT8, DataType.UINT16, DataType.INT16])
 # labels
-@pytest.mark.parametrize("labels", [10, 1000])
+@pytest.mark.parametrize("labels", [10, 100])
 # folding
-@pytest.mark.parametrize("fold", [-1])
+@pytest.mark.parametrize("fold", [-1, 2, 10])
 # number of top labels to select
 @pytest.mark.parametrize("k", [1, 5])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.vivado
 def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode):
+    np.random.seed(0)
     if fold == -1:
         pe = 1
     else:
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 38f792ed3cdd52044b28b4c19ac0603da4e502e6..398a17132a2ef6c92e600102ff5c0b71a1f65aaa 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -92,7 +92,7 @@ def test_res_estimate():
     model = model.transform(GiveUniqueNodeNames())
     prod_resource_estimation = model.analysis(res_estimation)
     expect_resource_estimation = {
-        "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, "LUT": 304.4}
+        "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, 'BRAM_efficiency': 0.001736111111111111, "LUT": 304.4}
     }
 
     assert check_two_dict_for_equality(
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 50b990f13494f22e985406791445b406e9946147..1715bcad0dd29799cdc99497179ce8635058f3be 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -47,6 +47,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_single_thresholding_modelwrapper(T, pe, idt, odt):
@@ -152,3 +154,11 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     if exec_mode == "rtlsim":
         hls_synt_res_est = model.analysis(hls_synth_res_estimation)
         assert "Thresholding_Batch_0" in hls_synt_res_est
+
+        node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index bda66bebbd93d346eb0026b17cbaff9a7ca5df5e..d61edc86dd6b5669c334e6b7f78ea9a8550cae93 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -41,6 +41,9 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.custom_op.registry import getCustomOp
+import numpy as np
 
 
 def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
@@ -154,3 +157,12 @@ def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
     assert (y_produced == y_expected).all()
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
+        assert exp_cycles != 0
diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py
index 66a93a190061e0142637be19bb2ea841d192745a..3b6ea86741b8adefce4faaa65b791f1d213cf3ae 100644
--- a/tests/pynq/test_pynq_performance_end2end.py
+++ b/tests/pynq/test_pynq_performance_end2end.py
@@ -10,7 +10,7 @@ from finn.core.throughput_test import throughput_test
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 
 
-@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"])
+@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1", "cnv_w2a2"])
 @pytest.mark.slow
 def test_pynq_performance_end2end(end2end_example):
     model = load_test_checkpoint_or_skip(
diff --git a/tests/pynq/test_pynq_performance_fifo.py b/tests/pynq/test_pynq_performance_fifo.py
index 1d4542473c4b58d3baa62f4123fd0f2f76954d95..1a438f79e09925cab57866c83a3cc9c8a1896351 100644
--- a/tests/pynq/test_pynq_performance_fifo.py
+++ b/tests/pynq/test_pynq_performance_fifo.py
@@ -81,7 +81,7 @@ def test_pynq_performance_fifo():
         model = model.transform(CreateStitchedIP(fpga_part, clk_ns))
         model = model.transform(MakePYNQProject(board))
         model = model.transform(SynthPYNQProject())
-        model = model.transform(MakePYNQDriver())
+        model = model.transform(MakePYNQDriver(platform="zynq"))
         model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
 
         ret = dict()
diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py
index 56dcd26076ec0a5fba6e9be6acac7f5e13572c3d..bcb66a2c22eb4d6a998580129881793bbc86b250 100644
--- a/tests/transformation/streamline/test_streamline_cnv.py
+++ b/tests/transformation/streamline/test_streamline_cnv.py
@@ -34,7 +34,12 @@ import pkg_resources as pk
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.util.test import get_test_model_trained
@@ -44,9 +49,9 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat
 export_onnx_path = make_build_dir("test_streamline_cnv_")
 
 # act bits
-@pytest.mark.parametrize("abits", [1])
+@pytest.mark.parametrize("abits", [1, 2])
 # weight bits
-@pytest.mark.parametrize("wbits", [1])
+@pytest.mark.parametrize("wbits", [1, 2])
 # network topology / size
 @pytest.mark.parametrize("size", ["CNV"])
 def test_streamline_cnv(size, wbits, abits):
@@ -62,6 +67,7 @@ def test_streamline_cnv(size, wbits, abits):
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
+    model = model.transform(RemoveStaticGraphInputs())
     # load one of the test vectors
     fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
     input_tensor = np.load(fn)["arr_0"].astype(np.float32)
@@ -73,7 +79,11 @@ def test_streamline_cnv(size, wbits, abits):
     expected = expected_ctx[model.graph.output[0].name]
     # model.save("orig_cnv.onnx")
     model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
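+    # pin the streamlined graph size to catch regressions in the transformation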
+    assert len(model.graph.initializer) == 21
+    assert len(model.graph.value_info) == 43
     # model.save("streamlined_cnv.onnx")
+    assert len(model.graph.node) == 23
     produced_ctx = oxe.execute_onnx(model, input_dict, True)
     produced = produced_ctx[model.graph.output[0].name]
     assert np.isclose(expected, produced, atol=1e-3).all()
diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py
index c68561239b7c30973856fa282d20cd2afaa168ae..dd7e756b4021af26c228804d4b509ecff032347e 100644
--- a/tests/transformation/streamline/test_streamline_fc.py
+++ b/tests/transformation/streamline/test_streamline_fc.py
@@ -37,7 +37,12 @@ import pytest
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.util.test import get_test_model_trained
@@ -65,6 +70,7 @@ def test_streamline_fc(size, wbits, abits):
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
+    model = model.transform(RemoveStaticGraphInputs())
     # load one of the test vectors
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
@@ -73,6 +79,10 @@ def test_streamline_fc(size, wbits, abits):
     expected_ctx = oxe.execute_onnx(model, input_dict, True)
     expected = expected_ctx[model.graph.output[0].name]
     model = model.transform(Streamline())
+    model = model.transform(RemoveUnusedTensors())
+    assert len(model.graph.initializer) == 11
+    assert len(model.graph.value_info) == 21
+    assert len(model.graph.quantization_annotation) == 18
     produced_ctx = oxe.execute_onnx(model, input_dict, True)
     produced = produced_ctx[model.graph.output[0].name]
     assert np.isclose(expected, produced, atol=1e-3).all()
diff --git a/tests/transformation/test_absorb_mul_into_topk.py b/tests/transformation/test_absorb_mul_into_topk.py
new file mode 100644
index 0000000000000000000000000000000000000000..1394220f7c336ccea8fe9c494734c4175bf2e847
--- /dev/null
+++ b/tests/transformation/test_absorb_mul_into_topk.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.insert_topk import InsertTopK
+from finn.transformation.streamline.absorb import AbsorbScalarMulIntoTopK
+import finn.core.onnx_exec as oxe
+
+# parameter to indicate whether the mul parameter is positive or negative
+@pytest.mark.parametrize("mul_positive", [True, False])
+# parameter to indicate whether the mul parameter is a scalar
+@pytest.mark.parametrize("scalar", [True, False])
+def test_absorb_mul_into_topk(mul_positive, scalar):
+    if scalar is True:
+        shape = [1]
+    else:
+        shape = [1, 1, 1, 1000]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 1, 1, 1000])
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, 1000])
+
+    mul_node = helper.make_node("Mul", ["inp", "a0"], ["outp"])
+    mul_graph = helper.make_graph(
+        nodes=[mul_node],
+        name="mul-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0],
+    )
+
+    model = helper.make_model(mul_graph, producer_name="mul_model")
+    model = ModelWrapper(model)
+    # initialize values
+    if mul_positive is True:
+        a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype(
+            np.float32
+        )
+    else:
+        a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(
+            np.float32
+        )
+    model.set_initializer("a0", a0_values)
+    model = model.transform(InsertTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model_transformed = model.transform(AbsorbScalarMulIntoTopK())
+
+    # compare execution results
+    inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype(
+        np.float32
+    )
+    idict = {"global_in": inp_values}
+    odict = oxe.execute_onnx(model, idict, True)
+    y_indices = odict["global_out"]
+    y_values = odict["TopK_0_out0"]
+    odict = oxe.execute_onnx(model_transformed, idict, True)
+    y_tr_indices = odict["global_out"]
+    y_tr_values = odict["TopK_0_out0"]
+
+    # the indices stay the same whether or not the model is transformed
+    assert (y_indices == y_tr_indices).all()
+
+    if scalar is True and mul_positive is True:
+        # the values change if the model was transformed
+        assert (y_values != y_tr_values).all()
+
+        # check for new order
+        assert model.graph != model_transformed.graph
+        assert len(model.graph.node) - 1 == len(model_transformed.graph.node)
+        assert model_transformed.graph.node[0].op_type == "TopK"
+
+    else:
+        assert (y_values == y_tr_values).all()
+        assert model.graph == model_transformed.graph
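
For intuition on why the rewrite is restricted to a positive scalar: multiplying by a positive constant is order-preserving, so the top-k indices are unchanged and the Mul can be absorbed. A minimal numpy sketch of this invariant, independent of the FINN API:

import numpy as np

np.random.seed(0)
x = np.random.randn(1000).astype(np.float32)
a = 0.37  # positive scalar, as in the mul_positive=True case
k = 5

idx_mul_first = np.argsort(x * a)[-k:][::-1]  # TopK(Mul(x, a))
idx_topk_only = np.argsort(x)[-k:][::-1]      # TopK(x), Mul absorbed

# a > 0 preserves the ordering, so the selected indices agree;
# a negative scalar would reverse the ordering and break the rewrite
assert (idx_mul_first == idx_topk_only).all()
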
diff --git a/tests/transformation/test_absorb_opposite_transposes.py b/tests/transformation/test_absorb_opposite_transposes.py
new file mode 100644
index 0000000000000000000000000000000000000000..859e691277a261f01b559e2e166763e402c5d689
--- /dev/null
+++ b/tests/transformation/test_absorb_opposite_transposes.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import onnx.helper as oh
+from onnx import TensorProto
+
+import finn.core.onnx_exec as ox
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.absorb import AbsorbConsecutiveTransposes
+
+
+def test_absorb_opposite_transposes():
+    np.random.seed(0)
+    input_shape = [1, 3, 4, 2]
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
+    value_info = [oh.make_tensor_value_info("add_param_0", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("mul_param_0", TensorProto.FLOAT, [1])]
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                oh.make_node("Add", ["top_in", "add_param_0"], ["t0"]),
+                oh.make_node("Transpose", ["t0"], ["t1"], perm=[0, 2, 3, 1]),
+                oh.make_node("Transpose", ["t1"], ["t2"], perm=[0, 3, 1, 2]),
+                oh.make_node("Add", ["t2", "add_param_1"], ["t3"]),
+                oh.make_node("Transpose", ["t3"], ["t4"], perm=[0, 2, 3, 1]),
+                oh.make_node("Transpose", ["t4"], ["t5"], perm=[0, 3, 1, 2]),
+                oh.make_node("Add", ["t5", "t2"], ["t6"]),
+                oh.make_node("Mul", ["t6", "mul_param_0"], ["top_out"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+    model.set_initializer("add_param_0", np.asarray([1], dtype=np.float32))
+    model.set_initializer("add_param_1", np.asarray([3], dtype=np.float32))
+    model.set_initializer("mul_param_0", np.asarray([2], dtype=np.float32))
+    new_model = model.transform(AbsorbConsecutiveTransposes())
+    new_model = new_model.transform(InferShapes())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+    assert ox.compare_execution(model, new_model, inp_dict)
+    assert len(new_model.graph.node) == 4
+    for n in new_model.graph.node:
+        assert n.op_type != "Transpose"
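
The rewrite relies on the two permutations being inverses of each other; composing them yields the identity, so both Transpose nodes can be removed. A small standalone numpy check:

import numpy as np

p1 = [0, 2, 3, 1]  # NCHW -> NHWC
p2 = [0, 3, 1, 2]  # NHWC -> NCHW

# composing the two permutations maps every axis back onto itself
assert [p1[i] for i in p2] == [0, 1, 2, 3]

x = np.random.rand(1, 3, 4, 2).astype(np.float32)
assert (x.transpose(p1).transpose(p2) == x).all()
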
diff --git a/tests/transformation/test_absorb_transp_into_flatten.py b/tests/transformation/test_absorb_transp_into_flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbbb33b4606acf55ace662da0986105f8c456b39
--- /dev/null
+++ b/tests/transformation/test_absorb_transp_into_flatten.py
@@ -0,0 +1,99 @@
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+import finn.core.data_layout as DataLayout
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten
+import finn.core.onnx_exec as oxe
+
+# permutation of transpose node
+@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]])
+# reshape or flatten
+@pytest.mark.parametrize("shape", [None, [1, -1], [-1, 1]])
+# input shape
+@pytest.mark.parametrize("ishape", [[1, 1, 1, 4], [2, 4, 1, 1], [1, 2, 2, 4]])
+# datalayout
+@pytest.mark.parametrize("data_layout", ["NCHW", "NHWC"])
+def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout):
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    transp_node = helper.make_node("Transpose", ["inp"], ["transp_out"], perm=perm)
+    dummy_in = np.random.uniform(low=0, high=1, size=tuple(ishape)).astype(np.float32)
+    if shape is None:
+        shape_node = helper.make_node("Flatten", ["transp_out"], ["outp"])
+        dummy_in = dummy_in.transpose(tuple(perm))
+        oshape = dummy_in.reshape(dummy_in.shape[0], -1).shape
+        outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+        shape0 = None
+    else:
+        shape0 = helper.make_tensor_value_info("shape0", TensorProto.INT64, shape)
+        shape_node = helper.make_node("Reshape", ["transp_out", "shape0"], ["outp"])
+        oshape = dummy_in.transpose(tuple(perm)).reshape(tuple(shape)).shape
+        outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    graph = helper.make_graph(
+        nodes=[transp_node, shape_node],
+        name="absorb-transpose-graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="absorb_transpose_model")
+    model = ModelWrapper(model)
+    if shape is not None:
+        model.graph.value_info.append(shape0)
+        model.set_initializer("shape0", np.asarray(shape))
+    if data_layout == "NCHW":
+        model.set_tensor_layout("inp", DataLayout.NCHW)
+    else:
+        model.set_tensor_layout("inp", DataLayout.NHWC)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    # model.save("test.onnx")
+    model_transformed = model.transform(AbsorbTransposeIntoFlatten())
+    # model_transformed.save("test2.onnx")
+
+    # verify transformation
+    inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype(
+        np.float32
+    )
+    idict = {model.graph.input[0].name: inp_values}
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # only some of the parameter combinations lead to a graph that will be changed when
+    # AbsorbTransposeIntoFlatten is applied
+
+    if shape == [-1, 1]:  # not a flatten operation, so the graph will not be changed
+        assert model.graph == model_transformed.graph
+
+    elif perm == [
+        3,
+        2,
+        0,
+        1,
+    ]:  # the first dimension is also part of the transpose operation
+        # so the graph will not be changed
+        assert model.graph == model_transformed.graph
+
+    # the following cases are the ones in which the model is transformed;
+    # shape and perm were already checked above, so only ishape and data_layout
+    # remain (the transformed model should only contain a "Flatten" node)
+    elif ishape == [1, 1, 1, 4] and data_layout == "NHWC":
+        assert model_transformed.graph.node[0].op_type == "Flatten"
+
+    elif ishape == [2, 4, 1, 1] and data_layout == "NCHW" and shape is None:
+        # If the first dimension of the input tensor is not 1, Flatten and
+        # Reshape (with shape = [1, -1]) would lead to different results
+        assert model_transformed.graph.node[0].op_type == "Flatten"
+
+    # all other cases lead to an unchanged model
+    else:
+        assert model.graph == model_transformed.graph
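
The two parameter combinations that do get rewritten share a property: the permutation keeps the batch axis in front and the tensor has at most one non-unit axis besides the batch axis, so the Transpose cannot actually reorder the flattened elements. A numpy illustration of that invariant:

import numpy as np

x = np.random.rand(2, 4, 1, 1).astype(np.float32)  # one non-unit axis besides batch
perm = (0, 2, 3, 1)                                 # batch axis stays in place

flat_direct = x.reshape(x.shape[0], -1)
flat_after_transpose = x.transpose(perm).reshape(x.shape[0], -1)

# with a single non-unit non-batch axis, transposing cannot change the
# element order, so the Transpose can be absorbed into the Flatten
assert (flat_direct == flat_after_transpose).all()
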
diff --git a/tests/transformation/test_change_datalayout.py b/tests/transformation/test_change_datalayout.py
new file mode 100644
index 0000000000000000000000000000000000000000..66459d574957575e61ec1bec631fb7030a27cca1
--- /dev/null
+++ b/tests/transformation/test_change_datalayout.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+from onnx import helper, TensorProto
+
+from finn.custom_op.maxpoolnhwc import compute_pool_output_dim
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.util.basic import gen_finn_dt_tensor
+from finn.util.basic import get_by_name
+import finn.core.onnx_exec as oxe
+
+# stride
+@pytest.mark.parametrize("s", [1, 2])
+# kernel
+@pytest.mark.parametrize("k", [3, 4])
+# ibits
+@pytest.mark.parametrize("ibits", [4, 8])
+# obits
+@pytest.mark.parametrize("obits", [2, 4])
+# signed
+@pytest.mark.parametrize("signed", [False, True])
+# channels
+@pytest.mark.parametrize("c", [2, 3])
+# input dimension
+@pytest.mark.parametrize("idim", [6, 7])
+def test_change_datalayout_quantavgpool(s, k, ibits, obits, signed, c, idim):
+    n = 1
+    odim = compute_pool_output_dim(idim, k, s)
+    # determine input FINN datatype
+    if signed is True:
+        prefix = "INT"
+    else:
+        prefix = "UINT"
+    dt_name = prefix + str(ibits)
+    dtype = DataType[dt_name]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [n, c, idim, idim])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [n, c, odim, odim])
+
+    node = helper.make_node(
+        "QuantAvgPool2d",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        stride=s,
+        kernel=k,
+        ibits=ibits,
+        obits=obits,
+        signed=signed,
+        data_layout="NCHW",
+    )
+    graph = helper.make_graph(
+        nodes=[node], name="single-quantavgpool", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph)
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model_transformed = model.transform(ChangeDataLayoutQuantAvgPool2d())
+    model_transformed = model_transformed.transform(InferShapes())
+    model_transformed = model_transformed.transform(InferDataTypes())
+    model_transformed = model_transformed.transform(InferDataLayouts())
+    model_transformed = model_transformed.transform(GiveUniqueNodeNames())
+    model_transformed = model_transformed.transform(GiveReadableTensorNames())
+    inp_values = gen_finn_dt_tensor(dtype, [n, c, idim, idim])
+    idict = {"inp": inp_values}
+    assert oxe.compare_execution(model, model_transformed, idict)
+    assert len(model.graph.node) + 2 == len(model_transformed.graph.node)
+    assert model_transformed.graph.node[-1].op_type == "Transpose"
+    assert model_transformed.graph.node[0].op_type == "Transpose"
+    # check if QuantAvgPool2d node has datalayout set correctly
+    node = model_transformed.graph.node[1]
+    d_layout = get_by_name(node.attribute, "data_layout").s.decode("UTF-8")
+    assert d_layout == "NHWC"
+    assert model_transformed.get_tensor_layout(node.input[0]) == DataLayout.NHWC
+    assert model_transformed.get_tensor_layout(node.output[0]) == DataLayout.NHWC
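
For reference, the expected output dimension used above follows the standard no-padding pooling formula; this sketch assumes compute_pool_output_dim implements it for the unpadded case:

def pool_out_dim(idim, k, s):
    # standard output size of an unpadded pooling window
    return (idim - k) // s + 1

assert pool_out_dim(6, 3, 1) == 4  # idim=6, k=3, s=1
assert pool_out_dim(7, 4, 2) == 2  # idim=7, k=4, s=2
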
diff --git a/tests/transformation/test_conv_lowering.py b/tests/transformation/test_conv_lowering.py
index 2cbc8e558940517168678b05c3bb46af8170abce..ab545d483321f8c52625b5401828277987bba3a9 100644
--- a/tests/transformation/test_conv_lowering.py
+++ b/tests/transformation/test_conv_lowering.py
@@ -26,21 +26,27 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pytest
+import onnx.helper as oh
+from onnx import TensorProto
 import os
 import pkg_resources as pk
 import brevitas.onnx as bo
 import numpy as np
 
-
 from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.infer_shapes import InferShapes
 from finn.util.test import get_test_model_trained
 from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.double_to_single_float import DoubleToSingleFloat
 import finn.core.onnx_exec as oxe
+from finn.custom_op.im2col import compute_conv_output_dim
+from finn.util.basic import gen_finn_dt_tensor
+from finn.custom_op.registry import getCustomOp
 
-export_onnx_path = "test_output_cnv.onnx"
+export_onnx_path = "test_conv_lowering.onnx"
 
 
 def test_conv_lowering_cnv_w1a1():
@@ -65,3 +71,121 @@ def test_conv_lowering_cnv_w1a1():
     assert np.isclose(produced, expected).all()
     assert np.argmax(produced) == 3
     os.remove(export_onnx_path)
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT2, DataType.INT4])
+# kernel size
+@pytest.mark.parametrize("k", [2, 4])
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [4, 6])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [2, 3])
+# stride
+@pytest.mark.parametrize("stride", [1, 2])
+# padding
+@pytest.mark.parametrize("padding", [[0, 0, 0, 0], [1, 1, 1, 1]])
+def test_depthwise_conv_lowering(idt, k, ifm_dim, ifm_ch, stride, padding):
+    wdt = idt
+    odt = DataType.INT32
+    ofm_ch = ifm_ch
+    ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad=padding[0])
+
+    # set up onnx model
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]
+    )
+
+    W = oh.make_tensor_value_info("W", TensorProto.FLOAT, [ofm_ch, 1, k, k])
+
+    dw_cnv = oh.make_node(
+        "Conv",
+        inputs=["inp", "W"],
+        outputs=["outp"],
+        kernel_shape=[k, k],
+        pads=padding,
+        strides=[stride, stride],
+        group=ifm_ch,
+    )
+    graph = oh.make_graph(
+        nodes=[dw_cnv],
+        name="dw_cnv_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[W],
+    )
+
+    model = oh.make_model(graph, producer_name="dws_cnv-model")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+    model.set_tensor_datatype("W", wdt)
+    w_tensor = gen_finn_dt_tensor(wdt, [ofm_ch, 1, k, k])
+    model.set_initializer("W", w_tensor)
+    model = model.transform(InferShapes())
+
+    input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim, ifm_dim])
+    input_dict = {"inp": input_tensor}
+    output_dict = oxe.execute_onnx(model, input_dict)
+    expected = output_dict["outp"]
+
+    model = model.transform(LowerConvsToMatMul())
+    output_dict = oxe.execute_onnx(model, input_dict)
+    produced = output_dict["outp"]
+    assert (produced == expected).all()
+
+    # check if created nodes have attributes that indicate depthwise conv
+    assert model.get_tensor_sparsity("W") is not None
+    im2col_node = getCustomOp(model.graph.node[1])
+    assert im2col_node.get_nodeattr("depthwise") == 1
+
+
+def test_conv_lowering_conv_1x1():
+    np.random.seed(0)
+
+    in_feature_dim = 7
+    in_chn = 3
+    kernel_size = 1
+    out_feature_dim = in_feature_dim
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)]
+
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[oh.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+    model.set_initializer("p1", np.random.rand(*conv_param_shape).astype(np.float32))
+
+    new_model = model.transform(LowerConvsToMatMul())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "Transpose"
+    assert new_model.graph.node[1].op_type == "MatMul"
+    assert new_model.graph.node[2].op_type == "Transpose"
+    assert len(new_model.graph.node) == 3
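
The three-node pattern asserted above (Transpose, MatMul, Transpose) reflects that a 1x1 convolution is exactly a per-pixel channel mix, i.e. a MatMul in NHWC layout. A self-contained numpy sketch of that equivalence (not the FINN lowering code itself):

import numpy as np

np.random.seed(0)
n, c, h, w = 1, 3, 7, 7
x = np.random.rand(n, c, h, w).astype(np.float32)
W = np.random.rand(c, c, 1, 1).astype(np.float32)  # OIHW, 1x1 kernel, group=1

# direct 1x1 convolution: every pixel is an independent mix of channels
conv_out = np.einsum("nchw,kc->nkhw", x, W[:, :, 0, 0])

# lowered form: NCHW -> NHWC, MatMul over the channel axis, NHWC -> NCHW
lowered = (x.transpose(0, 2, 3, 1) @ W[:, :, 0, 0].T).transpose(0, 3, 1, 2)

assert np.allclose(conv_out, lowered)
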
diff --git a/tests/transformation/test_fold_constants.py b/tests/transformation/test_fold_constants.py
index 685c14a98b9031096aaf5b244c4f484d4f308bca..a976ffd62bce744a474a6fac2a61a6478526777f 100644
--- a/tests/transformation/test_fold_constants.py
+++ b/tests/transformation/test_fold_constants.py
@@ -40,7 +40,7 @@ from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.infer_shapes import InferShapes
 from finn.util.test import get_test_model_untrained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_fold_constants.onnx"
 
 
 def test_const_folding():
diff --git a/tests/transformation/test_infer_data_layouts.py b/tests/transformation/test_infer_data_layouts.py
index fccc7813da6f98c8af4ade7ae562c99b32247a8b..d6d9920043114c78e970842aee5955e3150cf526 100644
--- a/tests/transformation/test_infer_data_layouts.py
+++ b/tests/transformation/test_infer_data_layouts.py
@@ -44,7 +44,7 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.transformation.infer_data_layouts import InferDataLayouts
 import finn.core.data_layout as DataLayout
 
-export_onnx_path_cnv = "test_output_cnv.onnx"
+export_onnx_path_cnv = "test_infer_data_layouts.onnx"
 
 
 def test_infer_data_layouts():
diff --git a/tests/transformation/test_infer_datatypes.py b/tests/transformation/test_infer_datatypes.py
index e3db40289c4318894cf5ad41c2f67b3bff501db9..097ae03f6153843fbb7956a72b38431559d5d0f1 100644
--- a/tests/transformation/test_infer_datatypes.py
+++ b/tests/transformation/test_infer_datatypes.py
@@ -38,7 +38,7 @@ from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_infer_datatypes.onnx"
 
 
 def test_infer_datatypes():
diff --git a/tests/transformation/test_linear_past_eltwise.py b/tests/transformation/test_linear_past_eltwise.py
index b77f59779a5e8559f80e017d13b66bcb67249830..4cff5e5e1d40986a006cc02186fce21a907c2ef1 100644
--- a/tests/transformation/test_linear_past_eltwise.py
+++ b/tests/transformation/test_linear_past_eltwise.py
@@ -41,7 +41,7 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat
 
 import pytest
 
-export_onnx_path = "test_scalar_past_eltwise.onnx"
+export_onnx_path = "test_linear_past_eltwise.onnx"
 
 # construct a synthetic graph to test:
 # topk insertion, topk conversion to hls, add conversion to hls
diff --git a/tests/transformation/test_merge_onnx_models.py b/tests/transformation/test_merge_onnx_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..db7c990baddfb50a39603937a9c5b73f512a0e59
--- /dev/null
+++ b/tests/transformation/test_merge_onnx_models.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from pkgutil import get_data
+
+import numpy as np
+import onnx
+import onnx.numpy_helper as np_helper
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.merge_onnx_models import MergeONNXModels
+import finn.core.onnx_exec as oxe
+
+
+def test_merge_onnx_models():
+    # load pre model
+    raw_m = get_data("finn", "data/onnx/mnist-conv/model.onnx")
+    model1 = ModelWrapper(raw_m)
+    # the input for model1 comes from a uint8 vector, so we set the FINN datatype
+    # of the input tensor to DataType.UINT8; this lets us verify that datatypes
+    # are correctly preserved in the merged model
+    model1.set_tensor_datatype(model1.graph.input[0].name, DataType.UINT8)
+    model1 = model1.transform(InferShapes())
+    model1 = model1.transform(GiveUniqueNodeNames())
+    model1 = model1.transform(GiveReadableTensorNames())
+
+    # set up post model
+    shape = [1, 10]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, [])
+    a1 = helper.make_tensor_value_info("a1", TensorProto.FLOAT, [])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
+
+    mul_node = helper.make_node("Mul", ["inp", "a0"], ["mul_out"])
+    div_node = helper.make_node("Div", ["mul_out", "a1"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[mul_node, div_node],
+        name="model2-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0, a1],
+    )
+
+    model2 = helper.make_model(graph, producer_name="model2")
+    model2 = ModelWrapper(model2)
+    # initialize model2
+    a0_value = np.random.uniform(low=0, high=1, size=(1)).astype(np.float32)
+    model2.set_initializer("a0", a0_value)
+    a1_value = np.random.uniform(low=0.1, high=1, size=(1)).astype(np.float32)
+    model2.set_initializer("a1", a1_value)
+    # set a dummy sparsity annotation to check if it gets correctly transferred
+    # to the merged model
+    sparsity = {"dw": {"kernel_shape": 0}}
+    model2.set_tensor_sparsity("a1", sparsity)
+    model2 = model2.transform(InferShapes())
+    model2 = model2.transform(InferDataTypes())
+    model2 = model2.transform(InferDataLayouts())
+    model2 = model2.transform(GiveUniqueNodeNames())
+    model2 = model2.transform(GiveReadableTensorNames())
+
+    # simulate the models before the merging and pass the output of model1 to model2
+    # load one of the test vectors
+    raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
+    inp_values = onnx.load_tensor_from_string(raw_i)
+    inp_values = np_helper.to_array(inp_values)
+    idict = {model1.graph.input[0].name: inp_values}
+    odict = oxe.execute_onnx(model1, idict)
+    temp = odict[model1.graph.output[0].name]
+
+    idict = {model2.graph.input[0].name: temp}
+    odict = oxe.execute_onnx(model2, idict)
+    outp = odict[model2.graph.output[0].name]
+    # merge models
+    model_transformed = model2.transform(MergeONNXModels(model1))
+
+    idict = {model_transformed.graph.input[0].name: inp_values}
+    odict = oxe.execute_onnx(model_transformed, idict)
+    outp_transformed = odict[model_transformed.graph.output[0].name]
+
+    assert (outp == outp_transformed).all()
+    assert len(model_transformed.graph.node) == len(model1.graph.node) + len(
+        model2.graph.node
+    )
+    # the sparsity annotation of input[1] of the division node was set to a
+    # dummy value above; we can now look for the division node and check that
+    # the annotation is still the same
+    for n in model_transformed.graph.node:
+        if n.op_type == "Div":
+            tensor_name = n.input[1]
+            set_sparsity = model_transformed.get_tensor_sparsity(tensor_name)
+            assert sparsity == set_sparsity
+
+    # check if finn datatype of graph.input[0] is still set to UINT8
+    assert model_transformed.get_tensor_datatype("global_in") == DataType.UINT8
diff --git a/tests/transformation/test_move_chw_add_past_conv.py b/tests/transformation/test_move_chw_add_past_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..b626f7e5b8564739ec383aaddfc262d642bf47cc
--- /dev/null
+++ b/tests/transformation/test_move_chw_add_past_conv.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import helper, TensorProto
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.reorder import MoveAddPastConv
+from finn.custom_op.im2col import compute_conv_output_dim
+import finn.core.onnx_exec as oxe
+
+
+# input dimension
+@pytest.mark.parametrize("idim", [4, 7])
+# kernel size
+@pytest.mark.parametrize("k", [2, 3])
+# stride
+@pytest.mark.parametrize("s", [1, 2])
+# input channels
+@pytest.mark.parametrize("ich", [2, 4])
+# output channels
+@pytest.mark.parametrize("och", [2, 3])
+def test_move_chw_add_past_conv(idim, k, s, ich, och):
+    odim = compute_conv_output_dim(idim, k, s)
+
+    ishape = [1, ich, idim, idim]
+    oshape = [1, och, odim, odim]
+    add_param_shape = [1, ich, 1, 1]
+    conv_param_shape = [och, ich, k, k]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, add_param_shape)
+    a1 = helper.make_tensor_value_info("a1", TensorProto.FLOAT, conv_param_shape)
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [k, k]
+    conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [s, s]
+
+    add_node = helper.make_node("Add", ["inp", "a0"], ["add_out"])
+    conv_node = helper.make_node("Conv", ["add_out", "a1"], ["outp"], **conv_config)
+
+    model = helper.make_model(
+        helper.make_graph(
+            nodes=[add_node, conv_node],
+            name="move-add-graph",
+            inputs=[inp],
+            outputs=[outp],
+            value_info=[a0, a1],
+        )
+    )
+
+    model = ModelWrapper(model)
+    # initialize model
+    a0_values = np.random.uniform(low=0, high=1, size=tuple(add_param_shape)).astype(
+        np.float32
+    )
+    model.set_initializer("a0", a0_values)
+    a1_values = np.random.uniform(low=0, high=1, size=tuple(conv_param_shape)).astype(
+        np.float32
+    )
+    model.set_initializer("a1", a1_values)
+
+    model = model.transform(InferShapes())
+
+    # execution before transformation
+    inp_values = np.random.uniform(low=0, high=1, size=tuple(ishape)).astype(np.float32)
+    idict = {model.graph.input[0].name: inp_values}
+    odict = oxe.execute_onnx(model, idict)
+    y_before = odict[model.graph.output[0].name]
+
+    model = model.transform(MoveAddPastConv())
+    odict = oxe.execute_onnx(model, idict)
+    y_after = odict[model.graph.output[0].name]
+
+    assert np.isclose(y_before, y_after).all()
+    assert model.graph.node[0].op_type == "Conv"
+    assert model.graph.node[1].op_type == "Add"
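
The test holds because convolution (without bias) is linear, so Conv(x + a) = Conv(x) + Conv(a); for a channelwise constant a and no padding, Conv(a) collapses to one constant per output channel. A small numpy verification under those assumptions:

import numpy as np

def conv2d(x, W, stride=1):
    # minimal valid (no-padding) convolution, NCHW input / OIHW weights
    n, ic, ih, iw = x.shape
    oc, _, k, _ = W.shape
    oh = (ih - k) // stride + 1
    ow = (iw - k) // stride + 1
    out = np.zeros((n, oc, oh, ow), dtype=x.dtype)
    for i in range(oh):
        for j in range(ow):
            patch = x[:, :, i * stride:i * stride + k, j * stride:j * stride + k]
            out[:, :, i, j] = np.einsum("ncij,ocij->no", patch, W)
    return out

np.random.seed(0)
x = np.random.rand(1, 2, 4, 4).astype(np.float32)
a = np.random.rand(1, 2, 1, 1).astype(np.float32)  # channelwise addend
W = np.random.rand(3, 2, 2, 2).astype(np.float32)

# the channelwise Add moved past the Conv becomes a per-output-channel Add
a_new = np.einsum("oikl,i->o", W, a.flatten()).reshape(1, 3, 1, 1)
assert np.allclose(conv2d(x + a, W), conv2d(x, W) + a_new, atol=1e-6)
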
diff --git a/tests/transformation/test_move_flatten_past_affine.py b/tests/transformation/test_move_flatten_past_affine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d5e51613d41f3f2db3dabcef7b982ec2816b19
--- /dev/null
+++ b/tests/transformation/test_move_flatten_past_affine.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveFlattenPastAffine
+import finn.core.onnx_exec as oxe
+
+# data layout
+@pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
+# batch size
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_move_flatten_past_affine(data_layout, batch_size):
+    if data_layout == DataLayout.NHWC:
+        ishape = [batch_size, 1, 1, 1024]
+        oshape = [batch_size, 1000]
+    else:
+        ishape = [batch_size, 1024, 1, 1]
+        oshape = [batch_size, 1000]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    a0 = helper.make_tensor_value_info("a1", TensorProto.FLOAT, [1024, 1000])
+    a1 = helper.make_tensor_value_info("a2", TensorProto.FLOAT, [])
+    a2 = helper.make_tensor_value_info("a3", TensorProto.FLOAT, [1000])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    flatten_node = helper.make_node("Flatten", ["inp"], ["flatten_out"])
+    matmul_node = helper.make_node("MatMul", ["flatten_out", "a0"], ["matmul_out"])
+    mul_node = helper.make_node("Mul", ["matmul_out", "a1"], ["mul_out"])
+    add_node = helper.make_node("Add", ["mul_out", "a2"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[flatten_node, matmul_node, mul_node, add_node],
+        name="move-reshape-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0, a1, a2],
+    )
+
+    model = helper.make_model(graph, producer_name="move_reshape_model")
+    model = ModelWrapper(model)
+
+    # initialize values
+    a0_values = gen_finn_dt_tensor(DataType.TERNARY, [1024, 1000])
+    model.set_initializer("a0", a0_values)
+    a1_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
+    model.set_initializer("a1", a1_values)
+    a2_values = np.random.uniform(low=-1, high=1, size=(1000)).astype(np.float32)
+    model.set_initializer("a2", a2_values)
+
+    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_layout("inp", data_layout)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values = gen_finn_dt_tensor(DataType.INT2, ishape)
+    idict = {model.graph.input[0].name: inp_values}
+    model_transformed = model.transform(MoveFlattenPastAffine())
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # depending on data layout check if graph is transformed or not
+    if data_layout == DataLayout.NHWC:
+        # check if nodes have new order in transformed graph
+        assert model.graph != model_transformed.graph
+        assert model_transformed.graph.node[-1].op_type == "Flatten"
+    else:
+        assert model.graph == model_transformed.graph
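
The NHWC case is rewritten because, with 1x1 spatial dimensions, flattening commutes with the matrix multiply: Flatten then MatMul equals MatMul then Flatten. A minimal numpy sketch, assuming the [N, 1, 1, C] shape used in the test:

import numpy as np

np.random.seed(0)
x = np.random.rand(2, 1, 1, 1024).astype(np.float32)  # NHWC, 1x1 spatial dims
W = np.random.rand(1024, 1000).astype(np.float32)

out_flatten_first = x.reshape(2, -1) @ W   # Flatten -> MatMul
out_matmul_first = (x @ W).reshape(2, -1)  # MatMul -> Flatten

assert np.allclose(out_flatten_first, out_matmul_first)
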
diff --git a/tests/transformation/test_move_flatten_past_topk.py b/tests/transformation/test_move_flatten_past_topk.py
new file mode 100644
index 0000000000000000000000000000000000000000..65da92c22dbe9f6b1c5a49172ffae59fa6e98607
--- /dev/null
+++ b/tests/transformation/test_move_flatten_past_topk.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.insert_topk import InsertTopK
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveFlattenPastTopK
+import finn.core.onnx_exec as oxe
+
+# data layout
+@pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
+# batch size
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_move_flatten_past_topk(data_layout, batch_size):
+    if data_layout == DataLayout.NHWC:
+        ishape = [batch_size, 1, 1, 1024]
+        oshape = [batch_size, 1024]
+    else:
+        ishape = [batch_size, 1024, 1, 1]
+        oshape = [batch_size, 1024]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    flatten_node = helper.make_node("Flatten", ["inp"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[flatten_node], name="move-flatten-graph", inputs=[inp], outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="move_flatten_model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_layout("inp", data_layout)
+    model = model.transform(InsertTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values = gen_finn_dt_tensor(DataType.INT2, ishape)
+    idict = {model.graph.input[0].name: inp_values}
+    model_transformed = model.transform(MoveFlattenPastTopK())
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # depending on data layout check if graph is transformed or not
+    if data_layout == DataLayout.NHWC:
+        # check if nodes have new order in transformed graph
+        assert model.graph != model_transformed.graph
+        assert model_transformed.graph.node[-1].op_type == "Flatten"
+    else:
+        assert model.graph == model_transformed.graph
diff --git a/tests/transformation/test_move_maxpool_past_multithreshold.py b/tests/transformation/test_move_maxpool_past_multithreshold.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc19debf8d6fc89d15e3d731f1e54daa491c321
--- /dev/null
+++ b/tests/transformation/test_move_maxpool_past_multithreshold.py
@@ -0,0 +1,100 @@
+from onnx import TensorProto, helper
+import numpy as np
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+
+
+def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
+    if seed is not None:
+        np.random.seed(seed)
+    steps = np.random.rand(channels, 1) * 2
+    bias = np.random.rand(channels, 1) * 10
+    thres = [np.arange(num_of_thres) for chn in range(channels)]
+    thres = ((thres - bias) * steps).astype(np.float32)
+    return thres
+
+
+def test_move_maxpool_past_multithreshold():
+    # generate test vectors of correct shape
+    ch = 64
+    ifmdim = 16
+    ofmdim = 16 // 4
+    input_shape = (1, ch, ifmdim, ifmdim)
+    output_shape = (1, ch, ofmdim, ofmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    maxpool_config = {}
+    maxpool_config["pads"] = [1, 1, 1, 1]
+    maxpool_config["kernel_shape"] = [3, 3]
+    maxpool_config["strides"] = [2, 2]
+
+    value_info = []
+    thres1_shape = [1, 1]
+    value_info += [
+        helper.make_tensor_value_info("thres1", TensorProto.FLOAT, thres1_shape)
+    ]
+
+    thres2_shape = [ch, 14]
+    value_info += [
+        helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape)
+    ]
+
+    nodes = []
+    nodes += [helper.make_node("MaxPool", ["top_in"], ["t1"], **maxpool_config)]
+    nodes += [
+        helper.make_node(
+            "MultiThreshold",
+            ["t1", "thres1"],
+            ["t2"],
+            domain="finn",
+            out_dtype="BIPOLAR",
+            out_bias=-1.0,
+            out_scale=1.0,
+        )
+    ]
+    nodes += [helper.make_node("MaxPool", ["t2"], ["t3"], **maxpool_config)]
+    nodes += [
+        helper.make_node(
+            "MultiThreshold",
+            ["t3", "thres2"],
+            ["top_out"],
+            domain="finn",
+            out_dtype="UINT4",
+        )
+    ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=nodes,
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    model.set_initializer("thres1", np.array([[0]]))
+    model.set_initializer(
+        "thres2", get_multithreshold_rand_params(*thres2_shape, seed=0)
+    )
+
+    # Transform
+    new_model = model.transform(MoveMaxPoolPastMultiThreshold())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    # Test
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "MaxPool"
+    assert new_model.graph.node[1].op_type == "MultiThreshold"
+    assert new_model.graph.node[2].op_type == "MultiThreshold"
+    assert new_model.graph.node[3].op_type == "MaxPool"
+    assert len(new_model.graph.node) == 4
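
The reordering is sound because MultiThreshold is an elementwise, monotonically non-decreasing map (here with thresholds shared across the pooling window), and taking a maximum commutes with any such map: max(f(x)) == f(max(x)). A numpy stand-in for the thresholding, not the FINN implementation:

import numpy as np

def quantize(x, thresholds):
    # elementwise, monotonically non-decreasing step function; a stand-in
    # for MultiThreshold with thresholds shared across a pooling window
    return np.sum([x >= t for t in thresholds], axis=0).astype(np.float32)

np.random.seed(0)
window = np.random.rand(9)  # values seen by one 3x3 MaxPool window
thresholds = [0.25, 0.5, 0.75]

# max commutes with any monotonically non-decreasing elementwise map,
# so MaxPool -> MultiThreshold == MultiThreshold -> MaxPool
assert quantize(window, thresholds).max() == quantize(window.max(), thresholds)
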
diff --git a/tests/transformation/test_move_mul_past_dw_conv.py b/tests/transformation/test_move_mul_past_dw_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae8fbfe89986d58d3d71f5f8735a98469d9d1e3
--- /dev/null
+++ b/tests/transformation/test_move_mul_past_dw_conv.py
@@ -0,0 +1,93 @@
+import pytest
+
+from onnx import helper, TensorProto
+from finn.custom_op.im2col import compute_conv_output_dim
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.streamline.reorder import MoveMulPastDWConv
+
+
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [4, 7])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [2, 3])
+# kernel size
+@pytest.mark.parametrize("k", [2, 3])
+# stride
+@pytest.mark.parametrize("stride", [1, 2])
+# padding
+@pytest.mark.parametrize("pad_amt", [0, 1])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw):
+    if dw == 1:
+        ofm_ch = ifm_ch
+        groups = ifm_ch
+        W_shape = [ofm_ch, 1, k, k]
+    else:
+        ofm_ch = ifm_ch + 2
+        groups = 1
+        W_shape = [ofm_ch, ifm_ch, k, k]
+
+    ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad_amt)
+
+    # set up onnx model
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [1, ifm_ch, 1, 1])
+    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, W_shape)
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]
+    )
+
+    Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"])
+
+    Conv_node = helper.make_node(
+        "Conv",
+        ["mul_out", "W"],
+        ["outp"],
+        group=groups,
+        kernel_shape=[k, k],
+        pads=[pad_amt, pad_amt, pad_amt, pad_amt],
+        strides=[stride, stride],
+    )
+
+    graph = helper.make_graph(
+        nodes=[Mul_node, Conv_node],
+        name="mulpastconv_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[mul, W],
+    )
+
+    model = helper.make_model(graph, producer_name="mulpastconv-model")
+    model = ModelWrapper(model)
+    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, ifm_dim, ifm_dim])
+    mul_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, 1, 1])
+    W_values = gen_finn_dt_tensor(DataType.INT2, W_shape)
+    model.set_initializer("W", W_values)
+    model.set_initializer("mul", mul_values)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    idict = {"inp": inp_values}
+    odict = oxe.execute_onnx(model, idict, True)
+    out_before = odict["outp"]
+
+    # move channelwise multiplication past depthwise conv
+    model_transformed = model.transform(MoveMulPastDWConv())
+    odict = oxe.execute_onnx(model_transformed, idict, True)
+    out_after = odict["outp"]
+
+    assert (out_before == out_after).all()
+
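+    # a channelwise Mul commutes with a depthwise Conv (each channel is scaled
+    # independently), so the nodes should only be reordered for dw == 1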
+    if dw == 0:
+        assert model.graph.node[0].op_type == model_transformed.graph.node[0].op_type
+        assert model.graph.node[1].op_type == model_transformed.graph.node[1].op_type
+    else:
+        assert model.graph.node[0].op_type == model_transformed.graph.node[1].op_type
+        assert model.graph.node[1].op_type == model_transformed.graph.node[0].op_type
diff --git a/tests/transformation/test_move_scalar_past_conv.py b/tests/transformation/test_move_scalar_past_conv.py
index 0f50642d2b9d1583030630cb4927c2b86667e71a..94fee7907d1ed1cccbf95520e903c7d9b43d8f7d 100644
--- a/tests/transformation/test_move_scalar_past_conv.py
+++ b/tests/transformation/test_move_scalar_past_conv.py
@@ -7,14 +7,14 @@ import finn.core.onnx_exec as ox
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import (
-    MoveScalarAddPastConv,
+    MoveAddPastConv,
     MoveScalarMulPastConv,
 )
 
 
 @pytest.mark.parametrize("padding", [False, True])
 @pytest.mark.parametrize(
-    "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+    "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
 )
 def test_move_scalar_past_conv(test_args, padding):
     scalar_op = test_args[0]
@@ -83,8 +83,8 @@ def test_move_scalar_past_conv(test_args, padding):
             assert new_model.graph.node[2].op_type == "Conv"
         else:
             assert new_model.graph.node[0].op_type == "Conv"
-            assert new_model.graph.node[1].op_type == scalar_op
-            assert new_model.graph.node[2].op_type == "Conv"
+            assert new_model.graph.node[1].op_type == "Conv"
+            assert new_model.graph.node[2].op_type == scalar_op
     else:
         assert new_model.graph.node[0].op_type == "Conv"
         assert new_model.graph.node[1].op_type == "Conv"
@@ -92,7 +92,7 @@ def test_move_scalar_past_conv(test_args, padding):
 
 
 @pytest.mark.parametrize(
-    "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+    "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
 )
 def test_move_scalar_past_conv_only_if_linear(test_args):
     scalar_op = test_args[0]
diff --git a/tests/transformation/test_move_transpose_past_scalar_mul.py b/tests/transformation/test_move_transpose_past_scalar_mul.py
new file mode 100644
index 0000000000000000000000000000000000000000..e434fc7d4f683120176e18a2bfa9da99d9ee0b0e
--- /dev/null
+++ b/tests/transformation/test_move_transpose_past_scalar_mul.py
@@ -0,0 +1,85 @@
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+import finn.core.data_layout as DataLayout
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveTransposePastScalarMul
+import finn.core.onnx_exec as oxe
+
+
+# permutation of transpose node
+@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]])
+# scalar mul
+@pytest.mark.parametrize("scalar", [True, False])
+# data layout
+@pytest.mark.parametrize("data_layout", [None, DataLayout.NHWC, DataLayout.NCHW])
+def test_move_transpose_past_scalar_mul(perm, scalar, data_layout):
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 2, 3, 4])
+    # to determine out_size we need to calculate with "perm" for this test case
+    dummy_in = np.random.uniform(low=0, high=1, size=(1, 2, 3, 4)).astype(np.float32)
+    out_size = dummy_in.transpose(tuple(perm)).shape
+
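+    # an empty shape makes a0 a scalar initializer; otherwise it matches the output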
+    if scalar:
+        a0_size = []
+    else:
+        a0_size = out_size
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, a0_size)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_size)
+    transp_node = helper.make_node("Transpose", ["inp"], ["transp_out"], perm=perm)
+    mul_node = helper.make_node("Mul", ["transp_out", "a0"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[transp_node, mul_node],
+        name="mv-transpose-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0],
+    )
+
+    model = helper.make_model(graph, producer_name="mv_transpose_model")
+    model = ModelWrapper(model)
+
+    # initialize values
+    a0_values = np.random.uniform(low=0, high=1, size=tuple(a0_size)).astype(np.float32)
+    model.set_initializer("a0", a0_values)
+    if data_layout is not None:
+        model.set_tensor_layout("inp", data_layout)
+        model = model.transform(InferDataLayouts())
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values = np.random.uniform(low=0, high=1, size=(1, 2, 3, 4)).astype(np.float32)
+    idict = {model.graph.input[0].name: inp_values}
+    model_transformed = model.transform(MoveTransposePastScalarMul())
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # check if order changed
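+    # the transform should only fire for a scalar Mul on a tensor with a known layout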
+    if scalar and data_layout is not None:
+        assert model_transformed.graph.node[0] != model.graph.node[0]
+        assert model_transformed.graph.node[1] != model.graph.node[1]
+        assert model_transformed.graph.node[0].op_type == "Mul"
+        assert model_transformed.graph.node[1].op_type == "Transpose"
+        mul_input = model_transformed.graph.node[0].input[0]
+        mul_output = model_transformed.graph.node[0].output[0]
+        assert model_transformed.get_tensor_layout(mul_input) == data_layout
+        assert model_transformed.get_tensor_layout(mul_output) == data_layout
+    else:
+        assert model_transformed.graph.node[0] == model.graph.node[0]
+        assert model_transformed.graph.node[1] == model.graph.node[1]
+        if data_layout is not None:
+            mul_input = model_transformed.graph.node[1].input[0]
+            mul_output = model_transformed.graph.node[1].output[0]
+            assert model_transformed.get_tensor_layout(mul_input) != data_layout
+            assert model_transformed.get_tensor_layout(mul_output) != data_layout
diff --git a/tests/transformation/test_remove_identity_ops.py b/tests/transformation/test_remove_identity_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..536c1ab0b48fa44388da23f45b528da3c5f3b2f2
--- /dev/null
+++ b/tests/transformation/test_remove_identity_ops.py
@@ -0,0 +1,85 @@
+import pytest
+
+import numpy as np
+from onnx import helper, TensorProto
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.remove import RemoveIdentityOps
+from finn.util.basic import gen_finn_dt_tensor
+
+
+def insert_identity_op(model, op):
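+    # insert an op that computes the identity (x + 0, x - 0, x * 1 or x / 1)
+    # between the Div and MatMul nodes of the test graph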
+    if op in ["Add", "Sub"]:
+        val = np.asarray([0.0], dtype=np.float32)
+    elif op in ["Mul", "Div"]:
+        val = np.asarray([1.0], dtype=np.float32)
+    else:
+        return model
+
+    identity_node = helper.make_node(op, ["div_out", "value"], ["ident_out"])
+    graph = model.graph
+    graph.node.insert(3, identity_node)
+    graph.node[-1].input[0] = "ident_out"
+    model.set_initializer("value", val)
+
+    return model
+
+
+# identity operations to be inserted
+@pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"])
+def test_remove_identity_ops(op):
+
+    # set up onnx model
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 1, 1])
+    mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [])
+    shape = helper.make_tensor_value_info("shape", TensorProto.FLOAT, [2])
+    div = helper.make_tensor_value_info("div", TensorProto.FLOAT, [])
+    matmul = helper.make_tensor_value_info("matmul", TensorProto.FLOAT, [4, 2])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 2])
+
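+    # graph: Mul -> Reshape -> Div -> MatMul; the identity op lands after the Div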
+    mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"])
+    reshape_node = helper.make_node("Reshape", ["mul_out", "shape"], ["reshape_out"])
+    div_node = helper.make_node("Div", ["reshape_out", "div"], ["div_out"])
+    matmul_node = helper.make_node("MatMul", ["div_out", "matmul"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[mul_node, reshape_node, div_node, matmul_node],
+        name="identity-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[mul, shape, div, matmul],
+    )
+
+    model = helper.make_model(graph, producer_name="mulpastconv-model")
+    model = ModelWrapper(model)
+    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, 4, 1, 1])
+    mul_values = np.random.uniform(low=0.1, high=0.99, size=(1,)).astype(np.float32)
+    shape_values = np.asarray([1, -1], dtype=np.int64)
+    div_values = np.random.uniform(low=0.1, high=0.99, size=(1,)).astype(np.float32)
+    matmul_values = gen_finn_dt_tensor(DataType.INT2, [4, 2])
+    model.set_initializer("mul", mul_values)
+    model.set_initializer("shape", shape_values)
+    model.set_initializer("div", div_values)
+    model.set_initializer("matmul", matmul_values)
+    model = insert_identity_op(model, op)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    idict = {"inp": inp_values}
+    odict = oxe.execute_onnx(model, idict)
+    out_before = odict["outp"]
+    num_of_nodes_before = len(model.graph.node)
+
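+    # the transform should remove exactly the one inserted identity node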
+    model = model.transform(RemoveIdentityOps())
+    num_of_nodes_after = len(model.graph.node)
+    assert num_of_nodes_before - 1 == num_of_nodes_after
+
+    odict = oxe.execute_onnx(model, idict)
+    out_after = odict["outp"]
+    assert (out_before == out_after).all()
diff --git a/tests/transformation/test_sign_to_thres.py b/tests/transformation/test_sign_to_thres.py
index b10840df37a695986e54c0bdaa68baa0538f90f2..a92f839e5f6ca8b45eadf939fa35973ac153e0b1 100644
--- a/tests/transformation/test_sign_to_thres.py
+++ b/tests/transformation/test_sign_to_thres.py
@@ -40,8 +40,7 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import ConvertSignToThres
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
-transformed_onnx_path = "test_output_lfc_transformed.onnx"
+export_onnx_path = "test_sign_to_thres.onnx"
 
 
 def test_sign_to_thres():
diff --git a/tests/transformation/test_topk_insert.py b/tests/transformation/test_topk_insert.py
index 1af0f255d8fb1af8a6e571518f18d831aa71298b..a18e63384150f140cb63ec7b438283eb4797266c 100644
--- a/tests/transformation/test_topk_insert.py
+++ b/tests/transformation/test_topk_insert.py
@@ -18,7 +18,7 @@ from pkgutil import get_data
 
 import pytest
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_topk_insert.onnx"
 
 
 @pytest.mark.parametrize("k", [1, 5, 10])
diff --git a/tests/util/test_create.py b/tests/util/test_create.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e236978592b02e1c18b03aba56ff8b2369311a6
--- /dev/null
+++ b/tests/util/test_create.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import finn.util.create as create
+from finn.core.datatype import DataType
+
+
+@pytest.mark.parametrize("bitwidth", [DataType.BIPOLAR, DataType.INT2, DataType.INT4])
+def test_hls_random_mlp_maker(bitwidth):
+    w = bitwidth
+    a = bitwidth
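+    # each dict describes one fully-connected layer: mw/mh = weight matrix
+    # width/height, simd/pe = input/output parallelism (fully unfolded here),
+    # idt/wdt/act = input/weight/activation datatypes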
+    layer_spec = [
+        {
+            "mw": 185,
+            "mh": 100,
+            "simd": 185,
+            "pe": 100,
+            "idt": DataType.BIPOLAR,
+            "wdt": w,
+            "act": a,
+        },
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {
+            "mw": 100,
+            "mh": 1,
+            "simd": 100,
+            "pe": 1,
+            "idt": a,
+            "wdt": w,
+            "act": DataType.BIPOLAR,
+        },
+    ]
+
+    ret = create.hls_random_mlp_maker(layer_spec)
+    assert len(ret.graph.node) == 5
+    # ret.save("mlp-%s.onnx" % str(bitwidth))