diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index 924fbd24a174df49af4b3e259ad57d0a7907d42b..0233a81ba06dc701a3a4579b9a5bd3ce17e47d04 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -5,7 +5,7 @@ on:
     branches: [ dev ]
   push:
     branches: [ dev ]
-  
+
 
 jobs:
 
@@ -18,6 +18,6 @@ jobs:
         uses: actions/checkout@v2
 
       - name: DockerRunQuicktest
-        env:
-          NUM_DEFAULT_WORKERS: 4
-        run: sh run-docker.sh quicktest
+        run: |
+          docker build -t finn_gha -f docker/Dockerfile.finn_ci --build-arg BUILD_PATH=/tmp/finn_gha .
+          docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index d06ff8521555ccd6d09383cab039850f1565fc61..7d5772d9f5118d1f1238dd14a6b57a1b4fd5004d 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -30,7 +30,6 @@ FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
 MAINTAINER Yaman Umuroglu <yamanu@xilinx.com>
 ARG PYTHON_VERSION=3.6
 ARG BUILD_PATH
-ARG FINN_CI_BRANCH
 
 WORKDIR /workspace
 
@@ -55,10 +54,9 @@ RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-He
 # oh-my-xilinx
 RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
 
-# checkout desired FINN branch for testing
-RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace/finn
-
-RUN pip install -r /workspace/finn/requirements.txt
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+RUN rm requirements.txt
 RUN apt update; apt install nano
 RUN pip install pytest-dependency
 RUN pip install pytest-xdist
@@ -78,8 +76,8 @@ RUN mkdir -p $VIVADO_IP_CACHE
 
 WORKDIR /workspace/finn
 
-COPY finn_entrypoint.sh /usr/local/bin/
-COPY quicktest.sh /usr/local/bin/
+COPY docker/finn_entrypoint.sh /usr/local/bin/
+COPY docker/quicktest.sh /usr/local/bin/
 RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
 RUN chmod 755 /usr/local/bin/quicktest.sh
 ENTRYPOINT ["finn_entrypoint.sh"]
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index f8919d7498e0e8ef08a52d1da0782988b56d6df4..8c1502eb4a1941061bd58e6f9a18106f98f259e2 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -50,7 +50,6 @@ COPY requirements.txt .
 RUN pip install -r requirements.txt
 RUN rm requirements.txt
 RUN pip install jupyter
-RUN pip install netron
 RUN pip install matplotlib
 RUN pip install pytest-dependency
 RUN pip install sphinx
@@ -81,13 +80,26 @@ RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
 # oh-my-xilinx
 RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
+# netron
+RUN git clone https://github.com/lutzroeder/netron.git /workspace/netron
+
+# build and install netron
+USER root
+RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
+RUN apt-get install -y nodejs
+WORKDIR /workspace/netron
+RUN git checkout 376e9d33733a3eacfe3c432808fd46e6cd1460cb
+RUN npm install
+RUN python setup.py build
+RUN pip install /workspace/netron
+USER $UNAME
 
 # for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host
 # at /workspace/finn -- see run-docker.sh for an example of how to do this.
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
-ENV PATH "${PATH}:/workspace/oh-my-xilinx"
+ENV PATH "${PATH}:/workspace/oh-my-xilinx:/home/$UNAME/.local/bin"
 ENV OHMYXILINX "/workspace/oh-my-xilinx"
 
 WORKDIR /home/$UNAME/finn
diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile
index 2215bc79cc7b2c20036d882fdc654fbe8721cab6..b2d3102bd4aa3c00620f41c102af5a8b385cede7 100644
--- a/docker/Jenkinsfile
+++ b/docker/Jenkinsfile
@@ -15,11 +15,13 @@ pipeline {
         string(name: 'DOCKER_CMD_RTLSIM', defaultValue: """python setup.py test --addopts "-k rtlsim --workers auto" """, description: 'rtlsim test command')
         // end2end tests: no parallel testing, use NUM_DEFAULT_WORKERS for parallel transformations
         string(name: 'DOCKER_CMD_END2END', defaultValue: """python setup.py test --addopts "-k end2end" """, description: 'end2end test command')
+        // allow specifying where to mount the cloned folder from, since Jenkins and FINN may be running in separate containers
+        string(name: 'WORKSPACE_MOUNT', defaultValue: '/var/jenkins_home/workspace/finn', description: 'Path to Jenkins workspace mount')
     }
     environment {
         DOCKER_TAG='finn_ci:$BUILD_ID'
-        DOCKER_INST_NAME='finn_ci_$BUILD_ID'
-        BUILD_PATH='/tmp/finn_ci_$BUILD_ID'
+        DOCKER_INST_NAME='finn_ci'
+        BUILD_PATH='/tmp/finn_ci'
     }
     stages {
         stage("Clone") {
@@ -32,17 +34,17 @@ pipeline {
                 sh """
                 docker build -t $DOCKER_TAG -f docker/Dockerfile.finn_ci \
                 --build-arg BUILD_PATH=$BUILD_PATH \
-                --build-arg FINN_CI_BRANCH=${params.FINN_CI_BRANCH} \
-                docker/
+                .
                 """
             }
         }
         stage('test-main') {
             steps {
-                catchError {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                 sh """
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=1 \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
@@ -58,10 +60,11 @@ pipeline {
         }
         stage('test-rtlsim') {
             steps {
-                catchError {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                 sh """
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=1 \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
@@ -77,10 +80,11 @@ pipeline {
         }
         stage('test-end2end') {
             steps {
-                catchError {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                 sh """
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
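Because Jenkins itself may run inside a container while the finn_ci containers are started as siblings through the host Docker daemon, the source side of the -v mount in WORKSPACE_MOUNT is resolved by the host, not by Jenkins. A sketch of a matching host setup (image and paths are assumptions, not part of this change):

    # run Jenkins with its home bind-mounted at the same path on host and container,
    # so WORKSPACE_MOUNT=/var/jenkins_home/workspace/finn means the same thing to
    # the Docker daemon that later launches the finn_ci containers
    docker run -d -p 8080:8080 \
      -v /var/jenkins_home:/var/jenkins_home \
      -v /var/run/docker.sock:/var/run/docker.sock \
      jenkins/jenkins:lts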
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index b312737c317517ca0ab19c74cf22284b5977b661..ee75089c657e4fad1e4a455ac7bd5fe4976e5d4c 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-export XILINX_VIVADO=$VIVADO_PATH
 export SHELL=/bin/bash
 export FINN_ROOT=/workspace/finn
 
@@ -15,7 +14,7 @@ gecho () {
 # the repos themselves are cloned in the Dockerfile
 BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=8f9f2018762f654f196b666838aeaf6fc730ad9a
+HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
 PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
@@ -48,7 +47,14 @@ gecho "oh-my-xilinx @ $OMX_COMMIT"
 git -C /workspace/oh-my-xilinx pull --quiet
 git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT --quiet
 
-# source Vivado env.vars
-source $VIVADO_PATH/settings64.sh
-
+if [ ! -z "$VIVADO_PATH" ];then
+  # source Vivado env.vars
+  export XILINX_VIVADO=$VIVADO_PATH
+  source $VIVADO_PATH/settings64.sh
+fi
+if [ ! -z "$VITIS_PATH" ];then
+  # source Vitis env.vars
+  export XILINX_VITIS=$VITIS_PATH
+  source $VITIS_PATH/settings64.sh
+fi
 exec "$@"
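Since the entrypoint now guards each toolchain behind its environment variable, a container can be started with any combination of Vivado and Vitis. A sketch of launching the dev container (installation paths and versions are assumptions):

    # Vivado only
    VIVADO_PATH=/opt/Xilinx/Vivado/2019.1 ./run-docker.sh
    # Vivado and Vitis; Vitis additionally needs PLATFORM_REPO_PATHS (see run-docker.sh)
    VIVADO_PATH=/opt/Xilinx/Vivado/2019.1 VITIS_PATH=/opt/Xilinx/Vitis/2019.2 \
      PLATFORM_REPO_PATHS=/opt/xilinx/platforms ./run-docker.sh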
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
index 49b7886836ac4e45dad856dfcd49223276bd831a..75d07d15338fd422bc6749b0a61b392616c61c5a 100755
--- a/docker/quicktest.sh
+++ b/docker/quicktest.sh
@@ -3,11 +3,10 @@
 : ${PYTEST_PARALLEL=auto}
 
 cd $FINN_ROOT
-
 # check if command line argument is empty or not present
 if [ -z $1 ]; then
   echo "Running quicktest: not (vivado or slow) with pytest-xdist"
-  python setup.py test --addopts "-m 'not (vivado or slow)' --dist=loadfile -n $PYTEST_PARALLEL"
+  python setup.py test --addopts "-m 'not (vivado or slow or vitis)' --dist=loadfile -n $PYTEST_PARALLEL"
 elif [ $1 = "main" ]; then
   echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist"
   python setup.py test --addopts "-k not (rtlsim or end2end) --dist=loadfile -n $PYTEST_PARALLEL"
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 8b20cebcfc49d14d0afbb26edd678d65425476d3..323692897800d45c6e6cf55b688a2c7b2b9a5277 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -13,7 +13,7 @@ The FINN compiler should not be thought of a single pushbutton tool that does ev
 Requirements
 ============
 
-* Ubuntu 18.04
+* Ubuntu 18.04 with `bash` installed
 * Docker
 * A working Vivado 2019.1 installation
 * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
@@ -26,9 +26,11 @@ We use Docker extensively for developing and deploying FINN. If you are not fami
 
 Getting an interactive shell for development or experimentation
 ***************************************************************
+.. note:: **run-docker.sh requires bash to execute correctly.**
+
 ::
 
-  sh run_docker.sh
+  ./run-docker.sh
 
 Simply running ./run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container, and give you a terminal that you can use for development and experimentation.
 If you want a new terminal on an already-running container, you can do this with `docker exec -it finn_dev_<username> bash`.
@@ -41,7 +43,7 @@ Running the Jupyter notebooks
 *****************************
 ::
 
-  sh run-docker.sh notebook
+  ./run-docker.sh notebook
 
 This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones.
 .. note:: The link will look something like this (the token you get will be different):
@@ -57,14 +59,14 @@ by:
 
 ::
 
-  sh run-docker.sh test
+  ./run-docker.sh test
 
 There is a quicker variant of the test suite that skips the tests marked as
 requiring Vivado or as slow-running tests:
 
 ::
 
-  sh run-docker.sh quicktest
+  ./run-docker.sh quicktest
 
 If you want to run individual tests, you can do this *inside the Docker container
 from the FINN root directory* as follows:
diff --git a/run-docker.sh b/run-docker.sh
index 00ca8f86985a78d8f2af099c51dcd4b80cd2e974..88956586c6a2ba9780d0597f8149038dad4aa6ab 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -50,6 +50,15 @@ if [ -z "$PYNQ_IP" ];then
         recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
 fi
 
+if [ -z "$VITIS_PATH" ];then
+        recho "Please set the VITIS_PATH that contains the path to your Vitis installation directory."
+        recho "FINN functionality depending on Vitis will not be available."
+else
+    if [ -z "$PLATFORM_REPO_PATHS" ];then
+            recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)."
+    fi
+fi
+
 DOCKER_GID=$(id -g)
 DOCKER_GNAME=$(id -gn)
 DOCKER_UNAME=$(id -un)
@@ -93,6 +102,7 @@ mkdir -p $FINN_SSH_KEY_DIR
 gecho "Instance is named as $DOCKER_INST_NAME"
 gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
 gecho "Mounting $VIVADO_PATH into $VIVADO_PATH"
+gecho "Mounting $VITIS_PATH into $VITIS_PATH"
 gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
@@ -128,24 +138,34 @@ docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \
 # Launch container with current directory mounted
 # important to pass the --init flag here for correct Vivado operation, see:
 # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
-docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \
---hostname $DOCKER_INST_NAME \
--e "XILINX_VIVADO=$VIVADO_PATH" \
--e "SHELL=/bin/bash" \
--v $SCRIPTPATH:/workspace/finn \
--v $BUILD_LOCAL:$BUILD_LOCAL \
--v $VIVADO_PATH:$VIVADO_PATH \
--v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh \
--e VIVADO_PATH=$VIVADO_PATH \
--e FINN_INST_NAME=$DOCKER_INST_NAME \
--e FINN_ROOT="/workspace/finn" \
--e VIVADO_IP_CACHE="$VIVADO_IP_CACHE" \
--e PYNQ_BOARD=$PYNQ_BOARD \
--e PYNQ_IP=$PYNQ_IP \
--e PYNQ_USERNAME=$PYNQ_USERNAME \
--e PYNQ_PASSWORD=$PYNQ_PASSWORD \
--e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR \
--e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS \
--p $JUPYTER_PORT:$JUPYTER_PORT \
--p $NETRON_PORT:$NETRON_PORT \
-$DOCKER_TAG $DOCKER_CMD
+DOCKER_EXEC="docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init "
+DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
+DOCKER_EXEC+="-e SHELL=/bin/bash "
+DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn "
+DOCKER_EXEC+="-v $BUILD_LOCAL:$BUILD_LOCAL "
+DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh "
+DOCKER_EXEC+="-e FINN_INST_NAME=$DOCKER_INST_NAME "
+DOCKER_EXEC+="-e FINN_ROOT="/workspace/finn" "
+DOCKER_EXEC+="-e VIVADO_IP_CACHE=$VIVADO_IP_CACHE "
+DOCKER_EXEC+="-e PYNQ_BOARD=$PYNQ_BOARD "
+DOCKER_EXEC+="-e PYNQ_IP=$PYNQ_IP "
+DOCKER_EXEC+="-e PYNQ_USERNAME=$PYNQ_USERNAME "
+DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD "
+DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR "
+DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS "
+DOCKER_EXEC+="-p $JUPYTER_PORT:$JUPYTER_PORT "
+DOCKER_EXEC+="-p $NETRON_PORT:$NETRON_PORT "
+if [ ! -z "$VIVADO_PATH" ];then
+  DOCKER_EXEC+="-e "XILINX_VIVADO=$VIVADO_PATH" "
+  DOCKER_EXEC+="-v $VIVADO_PATH:$VIVADO_PATH "
+  DOCKER_EXEC+="-e VIVADO_PATH=$VIVADO_PATH "
+fi
+if [ ! -z "$VITIS_PATH" ];then
+  DOCKER_EXEC+="-v $VITIS_PATH:$VITIS_PATH "
+  DOCKER_EXEC+="-v $PLATFORM_REPO_PATHS:/workspace/finn/vitis_platforms "
+  DOCKER_EXEC+="-e VITIS_PATH=$VITIS_PATH "
+  DOCKER_EXEC+="-e PLATFORM_REPO_PATHS=/workspace/finn/vitis_platforms "
+fi
+DOCKER_EXEC+="$DOCKER_TAG $DOCKER_CMD"
+
+$DOCKER_EXEC
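Accumulating the command in a flat string keeps the conditional mounts readable, but the final unquoted $DOCKER_EXEC relies on word splitting, so paths containing spaces would break. A minimal sketch of the same pattern with a bash array, which avoids that caveat (an alternative, not part of this change):

    DOCKER_ARGS=(docker run -t --rm --init --name "$DOCKER_INST_NAME")
    if [ ! -z "$VITIS_PATH" ];then
      DOCKER_ARGS+=(-v "$VITIS_PATH:$VITIS_PATH" -e "VITIS_PATH=$VITIS_PATH")
    fi
    "${DOCKER_ARGS[@]}" "$DOCKER_TAG" $DOCKER_CMD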
diff --git a/setup.cfg b/setup.cfg
index 1d7dcf247636b486e35d6320669eae706c2b7a72..7729d0949ee133e06242905afab31708e79ebf04 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -104,6 +104,7 @@ addopts =
 markers =
     slow: marks tests as slow (deselect with '-m "not slow"')
     vivado: mark tests that require Vivado or Vivado HLS
+    vitis: mark tests that require Vitis
 norecursedirs =
     dist
     build
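With the marker registered, Vitis-dependent tests can be tagged so that the quicktest expression -m 'not (vivado or slow or vitis)' above deselects them. A minimal sketch of a tagged test (hypothetical test, not from this change):

    import pytest

    @pytest.mark.vitis
    @pytest.mark.slow
    def test_vitis_build():
        # deselected by quicktest; selected with e.g. pytest -m vitis
        assert True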
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 1e1bee3aa7435d5cab6cbf5ea23dd37dcdfa4380..bb5b3075582b8e01e8eed95f709934302fcadb42 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -114,19 +114,19 @@ def rtlsim_exec(model, execution_context):
 def _reset_rtlsim(sim):
     """Sets reset input in pyverilator to zero, toggles the clock and set it
     back to one"""
-    sim.io.ap_rst_n_0 = 0
+    sim.io.ap_rst_n = 0
     _toggle_clk(sim)
     _toggle_clk(sim)
-    sim.io.ap_rst_n_0 = 1
+    sim.io.ap_rst_n = 1
     _toggle_clk(sim)
     _toggle_clk(sim)
 
 
 def _toggle_clk(sim):
     """Toggles the clock input in pyverilator once."""
-    sim.io.ap_clk_0 = 0
+    sim.io.ap_clk = 0
     sim.eval()
-    sim.io.ap_clk_0 = 1
+    sim.io.ap_clk = 1
     sim.eval()
 
 
@@ -140,7 +140,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
     from finn.util.fpgadataflow)"""
     inputs = inp
     outputs = []
-    sim.io.out_r_0_tready = 1
+    sim.io.m_axis_0_tready = 1
 
     # observe if output is completely calculated
     # observation_count will contain the number of cycles the calculation ran
@@ -159,12 +159,12 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
         _reset_rtlsim(sim)
 
     while not (output_observed):
-        sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0
-        sim.io.in0_V_V_0_tdata = inputs[0] if len(inputs) > 0 else 0
-        if sim.io.in0_V_V_0_tready == 1 and sim.io.in0_V_V_0_tvalid == 1:
+        sim.io.s_axis_0_tvalid = 1 if len(inputs) > 0 else 0
+        sim.io.s_axis_0_tdata = inputs[0] if len(inputs) > 0 else 0
+        if sim.io.s_axis_0_tready == 1 and sim.io.s_axis_0_tvalid == 1:
             inputs = inputs[1:]
-        if sim.io.out_r_0_tvalid == 1 and sim.io.out_r_0_tready == 1:
-            outputs = outputs + [sim.io.out_r_0_tdata]
+        if sim.io.m_axis_0_tvalid == 1 and sim.io.m_axis_0_tready == 1:
+            outputs = outputs + [sim.io.m_axis_0_tdata]
         _toggle_clk(sim)
 
         observation_count = observation_count + 1
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 71c731f96ca45519c443a5f932ead050770e17de..bc816f18c5f72338dc726e504182998f3f4430b7 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -102,6 +102,23 @@ class HLSCustomOp(CustomOp):
         prefixed_top_name = "%s_%s" % (node.name, node.name)
         return prefixed_top_name
 
+    def get_verilog_top_module_intf_names(self):
+        """Return a dict of names of input and output interfaces.
+        The keys reflect the protocols each interface implements:
+        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
+        Values are lists of names:
+        's_axis' names correspond to the list of node inputs in order,
+        'm_axis' names correspond to the list of node outputs in order.
+        Each block must have at most one aximm and one axilite."""
+        intf_names = {}
+        intf_names["clk"] = ["ap_clk"]
+        intf_names["rst"] = ["ap_rst_n"]
+        intf_names["s_axis"] = ["in0_V_V"]
+        intf_names["m_axis"] = ["out_V_V"]
+        intf_names["aximm"] = []
+        intf_names["axilite"] = []
+        return intf_names
+
     def get_verilog_top_filename(self):
         "Return the Verilog top module filename for this node."
 
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index d5f5c1194d36e86b895610c084222db5ab9eb2bf..d73f22672e7163eef0738d067f951e90fe80a89f 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -356,3 +356,8 @@ class AddStreams_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 54051af5e0387081a23e1f8fa77ec9e363098830..e4762509fb6246bafa7441e194312d69ad585d1b 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -32,7 +32,7 @@ import numpy as np
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from onnx import TensorProto, helper
+from onnx import helper, TensorProto
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -80,24 +80,33 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
-        oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
+
+        oshape = self.get_normal_output_shape()
+        values = np.zeros(oshape).astype(np.float32)
         split_input = np.concatenate((values, values), axis=0)
-        return helper.make_node(
+
+        split_in = helper.make_tensor_value_info(
+            model.make_new_valueinfo_name(), TensorProto.FLOAT, oshape
+        )
+
+        model.graph.value_info.append(split_in)  # requires cleanup later
+        model.set_initializer(split_in.name, split_input)
+
+        shape_comp_node = helper.make_node(
             "Split",
-            inputs=[split_input],
-            outputs=[self.onnx_node.output[0], self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor", data_type=TensorProto.FLOAT, axis=0
-            ),
+            inputs=[split_in.name],
+            outputs=[self.onnx_node.output[0], self.onnx_node.output[1]],
+            axis=0,
         )
 
+        return shape_comp_node
+
     def infer_node_datatype(self, model):
         odt = self.get_output_datatype()
         model.set_tensor_datatype(self.onnx_node.output[0], odt)
+        model.set_tensor_datatype(self.onnx_node.output[1], odt)
 
     def verify_node(self):
         info_messages = []
@@ -359,3 +368,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        intf_names["m_axis"] = ["out0_V_V", "out1_V_V"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 9b718ecbbc490610790b68871080de23a54f4891..05870b8d9d5d3a11bad7882c9a7d122f8cd34cf6 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -344,3 +344,15 @@ class IODMA(HLSCustomOp):
 
     def strm_decl(self):
         pass
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("direction") == "out":
+            intf_names["s_axis"] = ["in0_V_V"]
+            intf_names["m_axis"] = []
+        else:
+            intf_names["s_axis"] = []
+            intf_names["m_axis"] = ["out_V_V"]
+        intf_names["axilite"] = ["s_axi_control"]
+        intf_names["aximm"] = ["m_axi_gmem"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index c7edc24d0e24eef1154293caca2519ab3aa68358..801a634fdba1cd5e16c7c211175c1e7380bf0070 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -39,16 +39,18 @@ class Pool_Batch(HLSCustomOp):
     """Class that corresponds to finn-hlslib Pool_batch function.
     Requires ConvolutionInputGenerator(depthwise == 1) to format its input
 
-    TODO: explain input shape (to reuse im2col code)
     Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels)
     Output shape (BatchSize,OutImgDim,OutImgDim,Channels)
 
-    # note: the actual data layout produced by the hlslib kernels is different
-    # for depthwise ops.
-    # * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
+    Notes:
+    # The input shape was chosen to be compatible with im2col
+    (only true when there is no folding).
+
+    # The actual data layout produced by the hlslib kernels is different
+    for depthwise ops:
+     * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
 
     Channels can be folded using PE (SIMD from the input perspective)
-    TODO: doc
     """
 
     def get_nodeattr_types(self):
@@ -63,7 +65,10 @@ class Pool_Batch(HLSCustomOp):
             "Function": ("s", True, ""),
             "OutImgDim": ("i", True, 0),
             # FINN DataTypes for inputs/outputs
-            "dataType": ("s", True, ""),
+            "InputDataType": ("s", True, ""),
+            "OutputDataType": ("s", True, ""),
+            "AccumBits": ("i", False, 0),
+            "Size": ("i", False, 1),
             "BatchSize": ("i", False, 1),
         }
 
@@ -72,17 +77,28 @@ class Pool_Batch(HLSCustomOp):
 
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
-        return DataType[self.get_nodeattr("dataType")]
+        return DataType[self.get_nodeattr("InputDataType")]
 
     def get_output_datatype(self):
         """Returns FINN DataType of output."""
         fxn = self.get_nodeattr("Function")
+        odt = DataType[self.get_nodeattr("OutputDataType")]
+
         if fxn == "MaxPool":
             # Same as input
-            return DataType[self.get_nodeattr("dataType")]
+            idt = DataType[self.get_nodeattr("InputDataType")]
+            assert odt == idt, "Input and output datatype must match for MaxPool"
+        elif fxn == "QuantAvgPool":
+            idt = DataType[self.get_nodeattr("InputDataType")]
+            assert (
+                idt.signed() == odt.signed()
+            ), """QuantAvgPool: Can't mix signed
+            and unsigned datatypes"""
         else:
             raise Exception("Pool_Batch doesn't currently support " + fxn)
 
+        return odt
+
     def get_normal_input_shape(self):
         ifm_ch = self.get_nodeattr("Channels")
         odim = self.get_nodeattr("OutImgDim")
@@ -123,19 +139,14 @@ class Pool_Batch(HLSCustomOp):
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
-        # ofm_ch = self.get_nodeattr("Channels")
-        # k = self.get_nodeattr("KernelSize")
-        # assert ifm_ch % pe == 0, "PE must divide input channels"
-        # simd = int(ifm_ch/pe)
         in_width = int(dt_bits * pe)
         return in_width
 
     def get_outstream_width(self):
-        fxn = self.get_nodeattr("Function")
-        if fxn == "MaxPool":
-            return self.get_instream_width()
-        else:
-            raise Exception("Pool_Batch doesn't currently support " + fxn)
+        dt_bits = self.get_output_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        out_width = int(dt_bits * pe)
+        return out_width
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
@@ -187,7 +198,7 @@ class Pool_Batch(HLSCustomOp):
 
         # check supported function
         fnx = self.get_nodeattr("Function")
-        if fnx == "MaxPool":
+        if fnx in ["MaxPool", "QuantAvgPool"]:
             info_messages.append(
                 "Attribute Function contains a supported pool function"
             )
@@ -251,7 +262,8 @@ class Pool_Batch(HLSCustomOp):
         i_hls_dt = idt.get_hls_datatype_str()
         odt = self.get_output_datatype()
         o_hls_dt = odt.get_hls_datatype_str()
-
+        size = self.get_nodeattr("Size")
+        accum_bits = self.get_nodeattr("AccumBits")
         self.code_gen_dict["$DOCOMPUTE$"] = []
 
         fxn = self.get_nodeattr("Function")
@@ -259,6 +271,16 @@ class Pool_Batch(HLSCustomOp):
             self.code_gen_dict["$DOCOMPUTE$"] += [
                 "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt)
             ]
+        elif fxn == "QuantAvgPool":
+            if idt.signed():
+                act_hls_dt = "ap_int<{}>".format(accum_bits)
+            else:
+                act_hls_dt = "ap_uint<{}>".format(accum_bits)
+            self.code_gen_dict["$DOCOMPUTE$"] += [
+                "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format(
+                    act_hls_dt, o_hls_dt, size
+                )
+            ]
         else:
             raise Exception("Pool_Batch doesn't currently support " + fxn)
 
@@ -369,7 +391,7 @@ class Pool_Batch(HLSCustomOp):
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
             rtlsim_output = self.rtlsim(sim, rtlsim_inp)
-            odt = export_idt
+            odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
             out_npy_path = "{}/output.npy".format(code_gen_dir)
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index a7ebff68749120868cae9ce5ac18d2856fe2cb8a..9c3bd3ac87b94f3e0ff11a2937bf5083aae614f6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -87,7 +87,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "numInputVectors": ("ints", False, [1]),
             # memory mode for the FC weights
             # const -- embedded weights, default, long compile/synth times
-            # decoupled -- streaming weights
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
             "mem_mode": ("s", False, "const"),
             # FPGA resource type for memories in decoupled mode
             # auto -- let Vivado decide
@@ -105,14 +106,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         node = self.onnx_node
         # set top name depending on mem_mode
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
+        if mem_mode == "const" or mem_mode == "external":
             prefixed_top_name = "%s_%s" % (node.name, node.name)
         elif mem_mode == "decoupled":
             prefixed_top_name = "%s_memstream" % (node.name)
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         return prefixed_top_name
 
@@ -301,7 +302,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def get_weightstream_width(self):
         """Returns weight stream width. Used only in decoupled mode."""
-        if self.get_nodeattr("mem_mode") == "decoupled":
+        if (
+            self.get_nodeattr("mem_mode") == "decoupled"
+            or self.get_nodeattr("mem_mode") == "external"
+        ):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
@@ -484,7 +488,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def generate_params(self, model, path):
         mem_mode = self.get_nodeattr("mem_mode")
-        # weights
+        code_gen_dir = path
+        # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
@@ -493,7 +498,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # so use it as such for weight generation
         if self.get_weight_datatype() == DataType.BIPOLAR:
             export_wdt = DataType.BINARY
-        code_gen_dir = path
 
         if mem_mode == "const":
             """Saves weights into params.h"""
@@ -523,7 +527,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             f_weights.write(weight_hls_code)
             f_weights.close()
 
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             """Saves weights in corresponding file format for cppsim or rtlsim"""
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
@@ -552,37 +556,37 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped
             )
 
-            """Saves weights into .dat file"""
-            # convert weight values into hexstring
-            weight_width = self.get_weightstream_width()
-            # pad to nearest 4 bits to get hex strings
-            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
-            )
-            weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
-            factor = math.ceil(weight_stream_len / 1024)
-            # add zeroes to pad out file to 1024 entries
-            weight_stream = weight_tensor_pe_flipped.flatten()
-            pad_amt = (factor * 1024) - weight_stream_len
-            weight_stream = np.pad(
-                weight_stream, (0, pad_amt), mode="constant", constant_values="0"
-            )
-            weight_stream = weight_stream.copy()
-            i = 0
-            j = 0
-            for val in weight_stream:
-                if i == 1024:
-                    i = 0
-                    j += 1
-                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
-                    f.write(val + "\n")
-                i += 1
-
+            if mem_mode == "decoupled":
+                """Saves weights into .dat file"""
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
+                factor = math.ceil(weight_stream_len / 1024)
+                # add zeroes to pad out file to 1024 entries
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                pad_amt = (factor * 1024) - weight_stream_len
+                weight_stream = np.pad(
+                    weight_stream, (0, pad_amt), mode="constant", constant_values="0"
+                )
+                weight_stream = weight_stream.copy()
+                i = 0
+                j = 0
+                for val in weight_stream:
+                    if i == 1024:
+                        i = 0
+                        j += 1
+                    with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
+                        f.write(val + "\n")
+                    i += 1
         else:
             raise Exception(
-                """Please set mem_mode to "const"i or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
         # save thresholds in thresh.h
@@ -630,6 +634,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
         # TODO ensure codegen dir exists
@@ -698,7 +703,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+            if mem_mode == "external":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType.BIPOLAR:
+                    export_wdt = DataType.BINARY
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
@@ -729,12 +751,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         if mem_mode == "const":
             # self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
             pass
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
@@ -757,7 +779,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 numReps,
             )
         ]
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             self.code_gen_dict["$DEFINES$"].append(
                 "#define WP1 {}\n".format(wdt.bitwidth())
@@ -783,7 +805,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         )
 
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             elem_bits = wdt.bitwidth()
             packed_bits = self.get_weightstream_width()
@@ -807,7 +829,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$STREAMDECLARATIONS$"].append(
                 'hls::stream<ap_uint<{}>> weights ("weights");'.format(
                     self.get_weightstream_width()
@@ -835,7 +857,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_nodeattr("resType"),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             if wdt == DataType.BIPOLAR:
                 export_wdt = DataType.BINARY
@@ -856,8 +878,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
     def dataoutstrm(self):
@@ -903,7 +925,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_outstream_width(),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 """void {}(
                     hls::stream<ap_uint<{}>> &in0,
@@ -952,7 +974,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     "complete dim=1"
                 )
             )
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$PRAGMAS$"].append(
                 "#pragma HLS INTERFACE axis port=weights"
             )
@@ -962,8 +984,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or external,
+                currently no other parameter value is supported!"""
             )
 
         # the threshold tensor is acc_type [PE][TMEM][N_THRES]
@@ -1092,3 +1114,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             self.set_nodeattr("ip_vlnv", vlnv)
             self.code_gen_dict.clear()
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "external":
+            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+        return intf_names
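In "external" mode the weight stream is exposed as a second s_axis input, so rtlsim drives two input streams through rtlsim_multi_io as shown in execute_node above. A short sketch of switching an existing node over (model lookup is an assumption):

    from finn.custom_op.registry import getCustomOp

    fc_node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
    fc_inst = getCustomOp(fc_node)
    fc_inst.set_nodeattr("mem_mode", "external")
    # the weight stream now appears in the top-level interface names:
    # {'s_axis': ['in0_V_V', 'weights_V_V'], 'm_axis': ['out_V_V'], ...}
    print(fc_inst.get_verilog_top_module_intf_names())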
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 17ba44b959577faf573d77ae222f7b2a3be6669d..38a139c279701ae7892f41b63c3c717a3e736691 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -33,8 +33,9 @@ class TLastMarker(HLSCustomOp):
     """Node that adds/removes AXI stream TLAST signals where needed. Its behavior
     is transparent in node-by-node execution, only visible in IP-stitched rtlsim or
     actual hardware.
-    This node  may be needed at the end of the network to signal a DMA write (needed by the
-    FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read."""
+    This node may be needed at the end of the network to signal a DMA write
+    (needed by the FINN PYNQ shell) or at the beginning to remove the end-of-burst
+    from DMA read."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
@@ -239,3 +240,15 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<OutDType> out ("out");'
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("Direction") == "in":
+            intf_names["s_axis"] = ["in0"]
+            intf_names["m_axis"] = ["out_V_V"]
+        else:
+            intf_names["s_axis"] = ["in0_V_V"]
+            intf_names["m_axis"] = ["out_r"]
+        if self.get_nodeattr("DynIters") == 1:
+            intf_names["axilite"] = ["s_axi_control"]
+        return intf_names
diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py
index fb5c78bc0c8419ba519c5c3113d9b0c7ae2dd3b7..28d01069264d883f3afc400808470f5f303be799 100644
--- a/src/finn/custom_op/quantavgpool2d.py
+++ b/src/finn/custom_op/quantavgpool2d.py
@@ -75,6 +75,19 @@ class QuantAvgPool2d(CustomOp):
             raise Exception("Unsupported output datatype for QuantAvgPool2d")
         model.set_tensor_datatype(node.output[0], dtype)
 
+    def get_accum_size(self):
+        ibits = self.get_nodeattr("ibits")
+        k = self.get_nodeattr("kernel")
+        max_value = 2 ** ibits - 1
+        max_value = max_value * k * k
+        max_bit_width = int(max_value).bit_length()
+        return max_bit_width
+
+    def get_shifts(self):
+        shift_bits = self.get_accum_size() - self.get_nodeattr("obits")
+        shift_bits = shift_bits if shift_bits >= 0 else 0
+        return shift_bits
+
     def execute_node(self, context, graph):
         # create a standard average pooling node to help calculate the result
         node = self.onnx_node
@@ -107,12 +120,7 @@ class QuantAvgPool2d(CustomOp):
         result_temp = sess.run(None, idict)
         # remove scaling introduced by average
         result_temp = result_temp[0] * (k * k)
-        ibits = self.get_nodeattr("ibits")
-        max_value = 2 ** ibits - 1
-        max_value = max_value * k * k
-        max_bit_width = int(max_value).bit_length()
-        shift_bits = max_bit_width - self.get_nodeattr("obits")
-        result = np.right_shift(result_temp.astype(int), shift_bits)
+        result = np.right_shift(result_temp.astype(int), self.get_shifts())
         if self.get_nodeattr("data_layout") == "NHWC":
             result = result.transpose(0, 2, 3, 1)
         context[node.output[0]] = result.astype(np.float32)
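To make the accumulator arithmetic concrete: with ibits=4 and kernel=2, the largest possible sum is (2^4 - 1) * 2 * 2 = 60, so get_accum_size() returns 6 (60 needs 6 bits), and with obits=4, get_shifts() returns 6 - 4 = 2, i.e. the quantized average right-shifts the sum by 2. A quick standalone check of the same math:

    ibits, k, obits = 4, 2, 4
    accum_bits = int((2 ** ibits - 1) * k * k).bit_length()  # 6
    shifts = max(accum_bits - obits, 0)                      # 2
    assert (accum_bits, shifts) == (6, 2)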
diff --git a/src/finn/custom_op/streamingdataflowpartition.py b/src/finn/custom_op/streamingdataflowpartition.py
index b63326d676f4ded5ec1dd62f5cc7f02d7acb82ad..bce4dde426b8838d6c86638a3641d51ab259a6db 100644
--- a/src/finn/custom_op/streamingdataflowpartition.py
+++ b/src/finn/custom_op/streamingdataflowpartition.py
@@ -83,7 +83,7 @@ class StreamingDataflowPartition(CustomOp):
             )
 
         # verify the number of inputs
-        if len(self.onnx_node.input) == 1:
+        if len(self.onnx_node.input) >= 1:
             info_messages.append("The number of inputs is correct")
         else:
             info_messages.append("StreamingDataflowPartition needs 1 data input")
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 34a697a43426aae0f984770689552063aa35b9e8..4cdf138130f37809357b281155d260fdbd789e12 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 from onnx import helper, TensorProto
 import numpy as np
 
@@ -34,11 +35,11 @@ from finn.transformation import Transformation
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import SortGraph
 import finn.core.data_layout as DataLayout
 from finn.util.onnx import nchw_to_nhwc
 import warnings
 from finn.util.basic import get_by_name
-import warnings
 
 
 class InferConvInpGen(Transformation):
@@ -107,6 +108,7 @@ class InferConvInpGen(Transformation):
                         Padding=2 * pad,
                         NumChannels=ifm_ch,
                         inputDataType=dt.name,
+                        SIMD=ifm_ch,
                     )
                     graph.node.insert(node_ind, padding_node)
 
@@ -210,13 +212,16 @@ class InferPool_Batch(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type in ["MaxPool"]:
+            if n.op_type in ["MaxPool", "QuantAvgPool2d"]:
                 # extract pool parameters
-                k = get_by_name(n.attribute, "kernel_shape").ints[-1]
-                stride = get_by_name(n.attribute, "strides").ints[-1]
 
-                if k <= stride:
-                    continue
+                if n.op_type == "MaxPool":
+                    k = get_by_name(n.attribute, "kernel_shape").ints[-1]
+                    stride = get_by_name(n.attribute, "strides").ints[-1]
+                elif n.op_type == "QuantAvgPool2d":
+                    inst = getCustomOp(n)
+                    k = inst.get_nodeattr("kernel")
+                    stride = inst.get_nodeattr("stride")
 
                 try:
                     pad = get_by_name(n.attribute, "pads").ints[-1]
@@ -226,10 +231,21 @@ class InferPool_Batch(Transformation):
                 node_input = n.input[0]
                 node_output = n.output[0]
                 idt = model.get_tensor_datatype(node_input)
+
                 if not idt.is_integer():
                     continue
 
-                # odt = model.get_tensor_datatype(node_output)
+                if k < stride:
+                    continue
+                elif k == stride:
+                    warnings.warn(
+                        """Inferring Pool_Batch node for k == stride.
+                        This case can be optimized.
+                        For example, for MaxPool run InferStreamingMaxPool before
+                        InferPool_Batch """
+                    )
+
+                odt = model.get_tensor_datatype(node_output)
 
                 ifm_ch = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
                 ofm_ch = ifm_ch
@@ -269,9 +285,22 @@ class InferPool_Batch(Transformation):
                     "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
                 )
 
+                accum_bits = 0
+                pool_size_param = k
+                pad_value = 0
                 if n.op_type == "MaxPool":
                     pool_fxn = "MaxPool"
+                    odt = idt
                     pad_value = idt.min()
+                elif n.op_type == "QuantAvgPool2d":
+                    assert odt.is_integer(), """Output data type for QuantAvgPool2d
+                    needs to be integer"""
+                    assert pad == 0, "Padding is not supported for QuantAvgPool2d"
+                    inst = getCustomOp(n)
+                    pool_fxn = "QuantAvgPool"
+                    pool_size_param = inst.get_shifts()
+                    accum_bits = inst.get_accum_size()
+
                 else:
                     raise Exception(
                         "pad_value and pool_fxn not configured for {}".format(n.op_type)
@@ -301,12 +330,15 @@ class InferPool_Batch(Transformation):
                     [pool_output],
                     domain="finn",
                     backend="fpgadataflow",
-                    dataType=idt.name,
+                    InputDataType=idt.name,
+                    OutputDataType=odt.name,
                     Channels=ifm_ch,
                     PE=ifm_ch,
                     KernelSize=k,
                     Function=pool_fxn,
                     OutImgDim=ofm_dim,
+                    AccumBits=accum_bits,
+                    Size=pool_size_param,
                     BatchSize=1,
                 )
 
@@ -616,10 +648,21 @@ class InferThresholdingLayer(Transformation):
                 if not idt.is_integer():
                     continue
 
-                # skip conversion if input is not NHWC or NC
+                # check layout of inputs/outputs,
+                # and convert to NHWC if necessary
                 thl_in_layout = model.get_tensor_layout(thl_input)
-                if thl_in_layout != DataLayout.NHWC and thl_in_layout != DataLayout.NC:
-                    continue
+                if thl_in_layout == DataLayout.NCHW:
+                    thl_input = nchw_to_nhwc(thl_input, model, node_ind)
+                    node_ind += 1
+                    thl_in_shape = model.get_tensor_shape(thl_input)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+                thl_output_layout = model.get_tensor_layout(thl_output)
+                if thl_output_layout == DataLayout.NCHW:
+                    thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True)
+                    node_ind += 1
 
                 # now safe to assume number of channels is in last dimension
                 ifc = int(thl_in_shape[-1])
@@ -641,7 +684,7 @@ class InferThresholdingLayer(Transformation):
                     outputDataType=odt.name,
                     numInputVectors=list(thl_in_shape[:-1]),
                 )
-                graph.node.insert(node_ind, new_node)
+                graph.node.insert(insert_point, new_node)
                 # remove old node
                 graph.node.remove(node)
                 graph_modified = True
@@ -652,6 +695,166 @@ class InferThresholdingLayer(Transformation):
         return (model, graph_modified)
 
 
+class InferAddStreamsLayer(Transformation):
+    """Convert any Add into a AddStreams HLS layer."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Add":
+                in0 = node.input[0]
+                in1 = node.input[1]
+                result = node.output[0]
+                in0_shape = model.get_tensor_shape(in0)
+                in1_shape = model.get_tensor_shape(in1)
+
+                # skip if different shapes on inputs
+                if in0_shape != in1_shape:
+                    continue
+
+                idt0 = model.get_tensor_datatype(in0)
+                idt1 = model.get_tensor_datatype(in1)
+
+                # skip if different data types on inputs
+                if idt0 != idt1:
+                    continue
+
+                idt = idt0
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # check layout and convert if necessary
+                in0_layout = model.get_tensor_layout(in0)
+                in1_layout = model.get_tensor_layout(in1)
+                result_layout = model.get_tensor_layout(result)
+
+                if in0_layout == DataLayout.NCHW:
+                    in0 = nchw_to_nhwc(in0, model, node_ind)
+                    node_ind += 1
+                    in0_shape = model.get_tensor_shape(in0)
+
+                if in1_layout == DataLayout.NCHW:
+                    in1 = nchw_to_nhwc(in1, model, node_ind)
+                    node_ind += 1
+                    in1_shape = model.get_tensor_shape(in1)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                if result_layout == DataLayout.NCHW:
+                    result = nchw_to_nhwc(result, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # now safe to assume num_channels is size of last dimension
+                num_channels = int(in0_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_channels % pe == 0
+                ), "Requirement Channels divisable by PE is violated."
+
+                # create and insert new AddStreams_Batch node
+                new_node = helper.make_node(
+                    "AddStreams_Batch",
+                    [in0, in1],
+                    [result],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=num_channels,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    numInputVectors=in0_shape[:-1],
+                )
+                graph.node.insert(insert_point, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferDuplicateStreamsLayer(Transformation):
+    """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2 """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            successors = model.find_consumers(node.output[0])
+            if successors is not None and len(successors) == 2:
+                output_tensor = node.output[0]
+
+                dt = model.get_tensor_datatype(output_tensor)
+
+                # skip conversion for layers with float input
+                if not dt.is_integer():
+                    continue
+
+                # create clone tensors
+                out_shape = model.get_tensor_shape(output_tensor)
+                out_tensor_clones = []
+                for i in range(2):
+                    clone = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                    )
+                    model.graph.value_info.append(clone)
+                    out_tensor_clones += [clone.name]
+
+                num_ch = int(out_shape[-1])
+                vecs = out_shape[:-1]
+
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_ch % pe == 0
+                ), "Requirement channels divisable by PE is violated."
+
+                dup_node = helper.make_node(
+                    "DuplicateStreams_Batch",
+                    [output_tensor],
+                    out_tensor_clones,
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=num_ch,
+                    PE=pe,
+                    inputDataType=dt.name,
+                    numInputVectors=vecs,
+                )
+
+                graph.node.insert(node_ind, dup_node)
+
+                # connect successors to out tensor clone
+                clone_idx = 0
+                for successor in successors:
+                    for i, succ_input in enumerate(successor.input):
+                        if succ_input == output_tensor:
+                            successor.input[i] = out_tensor_clones[clone_idx]
+                            clone_idx += 1
+                            # a node with multiple connections to the same
+                            # output appears once per input in the consumer
+                            # list, so breaking the inner loop here gives
+                            # the correct behaviour
+                            break
+
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(SortGraph())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferChannelwiseLinearLayer(Transformation):
     """Convert any channel-wise Add/Mul into a HLS layer."""
 
@@ -807,6 +1010,64 @@ class InferChannelwiseLinearLayer(Transformation):
         return (model, graph_modified)
 
 
+class InferLabelSelectLayer(Transformation):
+    """Convert any TopK into a LabelSelect HLS layer."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "TopK":
+                fc_input = node.input[0]
+                k_input = node.input[1]
+                val_output = node.output[0]
+                idx_output = node.output[1]
+                fc_in_shape = model.get_tensor_shape(fc_input)
+
+                idt = model.get_tensor_datatype(fc_input)
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # skip conversion if the value output is connected (not supported)
+                if model.find_consumer(val_output) is not None:
+                    continue
+
+                num_labels = int(fc_in_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_labels % pe == 0
+                ), "Requirement Labels divisable by PE is violated."
+
+                k = model.get_initializer(k_input)[0]
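+                # TopK's second input is a single-element tensor holding k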
+
+                # create and insert new LabelSelect_Batch node
+                new_node = helper.make_node(
+                    "LabelSelect_Batch",
+                    [fc_input],
+                    [idx_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    Labels=num_labels,
+                    PE=pe,
+                    K=k,
+                    inputDataType=idt.name,
+                )
+                graph.node.insert(node_ind, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferGlobalAccPoolLayer(Transformation):
     """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul."""
 
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index e0f990600d9ca4be748b662b47ce8296d3d462ce..7197e68be2fbdf5fc39b7ed202e88672614514ec 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -45,58 +45,89 @@ class CreateDataflowPartition(Transformation):
         super().__init__()
 
     def apply(self, model):
-        # TODO we currently assume that all dataflow nodes are connected to
-        # each other, forming a single partition. check the assumption and/or
-        # improve this.
-        all_nodes = list(model.graph.node)
-        df_nodes = filter(
-            lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
-        )
-        df_nodes = filter(
-            lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
-            == "fpgadataflow",
-            df_nodes,
-        )
-        df_nodes = list(df_nodes)
-        non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
-        non_df_nodes = list(non_df_nodes)
-
-        if len(df_nodes) == 0:
-            # no changes if no dataflow nodes are present
-            return (model, False)
-        else:
-            # partition the model into two models
-            df_model = copy.deepcopy(model)
-            non_df_model = model
-            # remove all non-dataflow nodes from the dataflow model
-            for node_to_remove in non_df_nodes:
-                df_model.graph.node.remove(node_to_remove)
-            # identify the entry and exit points for the dataflow part
-            df_in = df_model.graph.node[0].input[0]
-            df_out = df_model.graph.node[-1].output[0]
-            df_in_vi = df_model.get_tensor_valueinfo(df_in)
-            df_out_vi = df_model.get_tensor_valueinfo(df_out)
-            # set df graph in/out to be df_in/df_out
-            df_model.graph.input.remove(df_model.graph.input[0])
-            df_model.graph.input.insert(0, df_in_vi)
-            df_model.graph.output.remove(df_model.graph.output[0])
-            df_model.graph.output.insert(0, df_out_vi)
-            df_model_dir = make_build_dir("dataflow_partition_")
-            df_model_filename = df_model_dir + "/df_model.onnx"
-            df_model.save(df_model_filename)
-            # remove all dataflow nodes from the non-dataflow model
-            # keep track of where the dataflow part starts
-            df_start_ind = all_nodes.index(df_nodes[0])
-            for node_to_remove in df_nodes:
-                non_df_model.graph.node.remove(node_to_remove)
-            # create StreamingDataflow node with df_in/df_out io
-            df_node = helper.make_node(
-                "StreamingDataflowPartition",
-                [df_in],
-                [df_out],
-                # use the model attribute to mark the df model
-                model=df_model_filename,
+        target_partition_id = 0
+        # we currently assume that all dataflow nodes belonging to the same partition
+        # are connected to each other and there is a single input/output to/from each.
+        # NOTE: all dataflow nodes with no partition_id set are moved to partition 0
+        # TODO: check the assumption and/or improve this.
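+        # each pass of the loop below extracts one partition into its own
+        # ONNX model, replaces it with a StreamingDataflowPartition node,
+        # and repeats until no unassigned dataflow nodes remain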
+        while True:
+            all_nodes = list(model.graph.node)
+            df_nodes = filter(
+                lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
+            )
+            df_nodes = filter(
+                lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
+                == "fpgadataflow"
+                and (
+                    get_by_name(x.attribute, "partition_id") is None
+                    or get_by_name(x.attribute, "partition_id").i == target_partition_id
+                )
+                and x.op_type != "StreamingDataflowPartition",
+                df_nodes,
+            )
-            non_df_model.graph.node.insert(df_start_ind, df_node)
+            df_nodes = list(df_nodes)
+            non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
+            non_df_nodes = list(non_df_nodes)
+
+            if len(df_nodes) == 0:
+                # no changes if no dataflow nodes are present
+                break
+            else:
+                # partition the model into two models
+                df_model = copy.deepcopy(model)
+                non_df_model = model
+                # remove all non-dataflow nodes from the dataflow model
+                for node_to_remove in non_df_nodes:
+                    df_model.graph.node.remove(node_to_remove)
+                # identify the entry and exit points for the dataflow part
+                df_in = df_model.graph.node[0].input[0]
+                df_out = df_model.graph.node[-1].output[0]
+                df_in_vi = df_model.get_tensor_valueinfo(df_in)
+                df_out_vi = df_model.get_tensor_valueinfo(df_out)
+                # set df graph in/out to be df_in/df_out
+                df_model.graph.input.remove(df_model.graph.input[0])
+                df_model.graph.input.insert(0, df_in_vi)
+                df_model.graph.output.remove(df_model.graph.output[0])
+                df_model.graph.output.insert(0, df_out_vi)
+                # parse StreamingFCLayers looking for external weight memories
+                fc_extw_nodes = filter(
+                    lambda x: x.op_type == "StreamingFCLayer_Batch"
+                    and get_by_name(x.attribute, "mem_mode") is not None
+                    and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8")
+                    == "external",
+                    df_nodes,
+                )
+                fc_extw_nodes = list(fc_extw_nodes)
+                extra_df_inputs = []
+
+                for i in range(len(fc_extw_nodes)):
+                    fc_weight_vi = df_model.get_tensor_valueinfo(
+                        fc_extw_nodes[i].input[1]
+                    )
+                    df_model.graph.input.insert(i + 1, fc_weight_vi)
+                    extra_df_inputs.append(fc_extw_nodes[i].input[1])
+
+                # save model
+                df_model_dir = make_build_dir(
+                    "dataflow_partition" + str(target_partition_id) + "_"
+                )
+                df_model_filename = df_model_dir + "/df_model.onnx"
+                df_model.save(df_model_filename)
+                # remove all dataflow nodes from the non-dataflow model
+                # keep track of where the dataflow part starts
+                df_start_ind = all_nodes.index(df_nodes[0])
+                for node_to_remove in df_nodes:
+                    non_df_model.graph.node.remove(node_to_remove)
+                # create StreamingDataflow node with df_in/df_out io
+                df_node = helper.make_node(
+                    "StreamingDataflowPartition",
+                    [df_in] + extra_df_inputs,
+                    [df_out],
+                    # use the model attribute to mark the df model
+                    model=df_model_filename,
+                )
+                non_df_model.graph.node.insert(df_start_ind, df_node)
+                model = non_df_model
+                target_partition_id += 1
 
-        return (non_df_model, False)
+        return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 0e898f63db785f80cfce2683df0c9b6268e3ec7e..018ad385f33a8e0aea4aa42599fd47fe5dae57dd 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -33,6 +33,8 @@ import subprocess
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name, make_build_dir
 from finn.custom_op.registry import getCustomOp
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
 
 
 class CreateStitchedIP(Transformation):
@@ -49,20 +51,137 @@ class CreateStitchedIP(Transformation):
     The packaged block design IP can be found under the ip subdirectory.
     """
 
-    def __init__(self, fpgapart, clk_ns = 10.0):
+    def __init__(self, fpgapart, clk_ns=10.0, ip_name="finn_design", vitis=False):
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
+        self.ip_name = ip_name
+        self.vitis = vitis
         if float(clk_ns) not in [5.0, 10.0, 20.0]:
             warnings.warn(
                 """The chosen frequency may lead to failure due to clock divider
                 constraints."""
             )
+        self.has_axilite = False
+        self.has_aximm = False
+        self.has_m_axis = False
+        self.m_axis_idx = 0
+        self.has_s_axis = False
+        self.s_axis_idx = 0
+        self.clock_reset_are_external = False
+        self.create_cmds = []
+        self.connect_cmds = []
+        # keep track of top-level interface names
+        self.intf_names = {
+            "clk": [],
+            "rst": [],
+            "s_axis": [],
+            "m_axis": [],
+            "aximm": [],
+            "axilite": [],
+        }
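+        # illustrative example after stitching a design with one input and
+        # one output stream:
+        # {"clk": ["ap_clk"], "rst": ["ap_rst_n"], "s_axis": ["s_axis_0"],
+        #  "m_axis": ["m_axis_0"], "aximm": [], "axilite": []}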
+
+    def connect_clk_rst(self, node):
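+        """Record Tcl commands that make this node's clock and reset pins
+        external on the first call, and connect them to the already-external
+        ap_clk/ap_rst_n ports on subsequent calls."""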
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
+        reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
+        # make clock and reset external, if they aren't already
+        if not self.clock_reset_are_external:
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
+            self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]")
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]"
+            )
+            self.clock_reset_are_external = True
+            self.intf_names["clk"] = ["ap_clk"]
+            self.intf_names["rst"] = ["ap_rst_n"]
+        # otherwise connect clock and reset
+        else:
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
+            )
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
+
+    def connect_axi(self, node):
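+        """Record Tcl commands that make this node's AXI-Lite and AXI-MM
+        interfaces (if present) external, enforcing at most one of each."""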
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"]
+        aximm_intf_name = node_inst.get_verilog_top_module_intf_names()["aximm"]
+        if len(axilite_intf_name) != 0:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external "
+                "[get_bd_intf_pins %s/%s]" % (inst_name, axilite_intf_name[0])
+            )
+            self.connect_cmds.append(
+                "set_property name s_axi_control " "[get_bd_intf_ports s_axi_control_0]"
+            )
+            assert (
+                self.has_axilite is False
+            ), "Currently limited to one slave AXI-Stream"
+            self.intf_names["axilite"] = ["s_axi_control"]
+            self.has_axilite = True
+        if len(aximm_intf_name) != 0:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, aximm_intf_name[0])
+            )
+            self.connect_cmds.append(
+                "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
+            )
+            self.intf_names["aximm"] = ["m_axi_gmem0"]
+            assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
+            self.has_aximm = True
+
+    def connect_m_axis_external(self, node):
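+        """Record Tcl commands that make this node's AXI-Stream outputs
+        external, named m_axis_0, m_axis_1, ... in order of discovery."""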
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"]
+        # make output axis external
+        for output_intf_name in output_intf_names:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, output_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name m_axis_%d [get_bd_intf_ports %s_0]"
+                % (self.m_axis_idx, output_intf_name)
+            )
+            self.has_m_axis = True
+            self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx)
+            self.m_axis_idx += 1
+
+    def connect_s_axis_external(self, node):
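+        """Record Tcl commands that make this node's AXI-Stream inputs
+        external, named s_axis_0, s_axis_1, ... in order of discovery."""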
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"]
+        # make input axis external
+        for input_intf_name in input_intf_names:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, input_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name s_axis_%d [get_bd_intf_ports %s_0]"
+                % (self.s_axis_idx, input_intf_name)
+            )
+            self.has_s_axis = True
+            self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx)
+            self.s_axis_idx += 1
 
     def apply(self, model):
         ip_dirs = ["list"]
-        create_cmds = []
-        connect_cmds = []
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
             assert node.domain == "finn", 'Node domain is not set to "finn"'
@@ -80,59 +199,62 @@ class CreateStitchedIP(Transformation):
             vlnv = node_inst.get_nodeattr("ip_vlnv")
             inst_name = node.name
             create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name)
-            create_cmds += [create_cmd]
-            # TODO nonlinear topologies: check this for all inputs
+            self.create_cmds += [create_cmd]
             my_producer = model.find_producer(node.input[0])
+            self.connect_clk_rst(node)
+            self.connect_axi(node)
             if my_producer is None:
                 # first node in graph
-                # make clock and reset external
-                connect_cmds.append(
-                    "make_bd_pins_external [get_bd_pins %s/ap_clk]" % inst_name
-                )
-                connect_cmds.append(
-                    "make_bd_pins_external [get_bd_pins %s/ap_rst_n]" % inst_name
-                )
-                # make input external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/in0_V_V]"
-                    % inst_name
-                )
+                self.connect_s_axis_external(node)
+                if node.op_type == "TLastMarker":
+                    assert (
+                        node_inst.get_nodeattr("Direction") == "in"
+                    ), """Output TLastMarker incorrect direction"""
+                elif node.op_type == "IODMA":
+                    assert (
+                        node_inst.get_nodeattr("direction") == "in"
+                    ), """Input DMA incorrect direction"""
             else:
                 # intermediate node
-                # wire up global clock and reset
-                connect_cmds.append(
-                    "connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins %s/ap_rst_n]"
-                    % inst_name
-                )
-                connect_cmds.append(
-                    "connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins %s/ap_clk]"
-                    % inst_name
-                )
-                # wire up input to previous output
-                # TODO nonlinear topologies: loop over all inputs
-                my_in_name = "%s/in0_V_V" % (inst_name)
-                prev_out_name = "%s/out_V_V" % (my_producer.name)
-                connect_cmds.append(
-                    "connect_bd_intf_net [get_bd_intf_pins %s] [get_bd_intf_pins %s]"
-                    % (prev_out_name, my_in_name)
-                )
-            if model.find_consumer(node.output[0]) is None:
+                # wire up input(s) to previous node output(s)
+                # foreach input
+                #     find producer
+                #     find index of producer output connected to our target input
+                #     get names of hdl interfaces for input and producer output
+                #     issue a TCL directive to connect input to output
+                for i in range(len(node.input)):
+                    producer = model.find_producer(node.input[i])
+                    if producer is None:
+                        continue
+                    j = list(producer.output).index(node.input[i])
+                    src_intf_name = getCustomOp(
+                        producer
+                    ).get_verilog_top_module_intf_names()["m_axis"][j]
+                    dst_intf_name = node_inst.get_verilog_top_module_intf_names()[
+                        "s_axis"
+                    ][i]
+                    self.connect_cmds.append(
+                        "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                        "[get_bd_intf_pins %s/%s]"
+                        % (producer.name, src_intf_name, node.name, dst_intf_name)
+                    )
+            if model.find_consumers(node.output[0]) is None:
                 # last node in graph
+                self.connect_m_axis_external(node)
-                # ensure it is a TLastMarker to have a valid TLast signal
+                # ensure it is a TLastMarker or IODMA to properly terminate
+                # the output stream
                 assert (
-                    node.op_type == "TLastMarker"
-                ), """Last node is not TLastMarker.
-                Please run transformation InsertTLastMarker to ensure a valid
-                TLast signal"""
-                # make output external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/out_r]" % inst_name
-                )
-                # make AXI lite IF external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi_control]"
-                    % inst_name
-                )
+                    node.op_type == "TLastMarker" or node.op_type == "IODMA"
+                ), """Last node is not TLastMarker or DMA.
+                Please run transformation InsertTLastMarker/InsertIODMA to ensure
+                a valid TLast signal"""
+                if node.op_type == "TLastMarker":
+                    assert (
+                        node_inst.get_nodeattr("Direction") == "out"
+                    ), """Output TLastMarker incorrect direction"""
+                elif node.op_type == "IODMA":
+                    assert (
+                        node_inst.get_nodeattr("direction") == "out"
+                    ), """Output DMA incorrect direction"""
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -150,22 +272,54 @@ class CreateStitchedIP(Transformation):
         tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str)
         tcl.append("update_ip_catalog")
         # create block design and instantiate all layers
-        block_name = "finn_design"
+        block_name = self.ip_name
         tcl.append('create_bd_design "%s"' % block_name)
-        tcl.extend(create_cmds)
-        tcl.extend(connect_cmds)
+        tcl.extend(self.create_cmds)
+        tcl.extend(self.connect_cmds)
         fclk_mhz = 1 / (self.clk_ns * 0.001)
         fclk_hz = fclk_mhz * 1000000
         model.set_metadata_prop("clk_ns", str(self.clk_ns))
-        tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk_0]" % fclk_hz)
+        tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk]" % fclk_hz)
         tcl.append("regenerate_bd_layout")
         tcl.append("validate_bd_design")
         tcl.append("save_bd_design")
+        # create wrapper hdl (for rtlsim later on)
+        bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
+            vivado_stitch_proj_dir,
+            prjname,
+            block_name,
+        )
+        bd_filename = "%s/%s.bd" % (bd_base, block_name)
+        tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
+        wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
+        tcl.append("add_files -norecurse %s" % wrapper_filename)
+        model.set_metadata_prop("wrapper_filename", wrapper_filename)
+        # synthesize to DCP and export stub, DCP and constraints
+        if self.vitis:
+            tcl.append(
+                "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]"
+                % bd_filename
+            )
+            tcl.append(
+                "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} "
+                "-value {-mode out_of_context} -objects [get_runs synth_1]"
+            )
+            num_workers = get_num_default_workers()
+            assert num_workers >= 0, "Number of workers must be nonnegative."
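+            # a value of 0 is interpreted as "use all available cores"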
+            if num_workers == 0:
+                num_workers = mp.cpu_count()
+            tcl.append("launch_runs synth_1 -jobs %s" % str(num_workers))
+            tcl.append("wait_on_run [get_runs synth_1]")
+            tcl.append("open_run synth_1 -name synth_1")
+            tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name)
+            tcl.append("write_checkpoint %s.dcp" % block_name)
+            tcl.append("write_xdc %s.xdc" % block_name)
         # export block design itself as an IP core
         block_vendor = "xilinx_finn"
         block_library = "finn"
         block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
+        model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names))
         tcl.append(
             (
                 "ipx::package_project -root_dir %s/ip -vendor %s "
@@ -175,19 +329,89 @@ class CreateStitchedIP(Transformation):
         )
         tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv)
+        # if targeting Vitis, add some properties to the IP
+        if self.vitis:
+            tcl.append(
+                "ipx::remove_bus_parameter FREQ_HZ "
+                "[ipx::get_bus_interfaces CLK.AP_CLK -of_objects [ipx::current_core]]"
+            )
+            # replace source code with dcp
+            tcl.append(
+                "set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property supported_families { } [ipx::find_open_core %s]"
+                % block_vlnv
+            )
+            tcl.append(
+                "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} "
+                "[ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property auto_family_support_level level_2 "
+                "[ipx::find_open_core %s]" % block_vlnv
+            )
+            # remove all files from synthesis and sim groups
+            # we'll replace with DCP, stub, and xdc
+            tcl.append(
+                "ipx::remove_all_file "
+                "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]"
+            )
+            tcl.append(
+                "ipx::remove_all_file "
+                "[ipx::get_file_groups xilinx_anylanguagesynthesis]"
+            )
+            tcl.append(
+                "ipx::remove_file_group "
+                "xilinx_anylanguagebehavioralsimulation [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::remove_file_group "
+                "xilinx_anylanguagesynthesis [ipx::current_core]"
+            )
+            # remove sim and src folders
+            tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir)
+            tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir)
+            # copy and add DCP, stub, and xdc
+            tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir)
+            tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir)
+            tcl.append(
+                "file copy -force %s.dcp %s/ip/dcp"
+                % (block_name, vivado_stitch_proj_dir)
+            )
+            tcl.append(
+                "file copy -force %s.xdc %s/ip/impl"
+                % (block_name, vivado_stitch_proj_dir)
+            )
+            tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]")
+            tcl.append(
+                "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]"
+                % block_name
+            )
+            tcl.append(
+                "set_property used_in [list implementation] "
+                "[ipx::get_files impl/%s.xdc "
+                "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name
+            )
+            tcl.append(
+                "ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::add_file dcp/%s.dcp "
+                "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name
+            )
+            tcl.append(
+                "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::add_file dcp/%s.dcp "
+                "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name
+            )
         tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv)
-        # create wrapper hdl (for rtlsim later on)
-        bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
-            vivado_stitch_proj_dir,
-            prjname,
-            block_name,
-        )
-        bd_filename = "%s/%s.bd" % (bd_base, block_name)
-        tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
-        wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
-        tcl.append("add_files -norecurse %s" % wrapper_filename)
-        model.set_metadata_prop("wrapper_filename", wrapper_filename)
         # export list of used Verilog files (for rtlsim later on)
         tcl.append("set all_v_files [get_files -filter {FILE_TYPE == Verilog}]")
         v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index b01f8cbe5c48db6c5288b2db1a8b009ea09ce6c0..85a2d47be0599a852b223f1a65d3ec04efe9bda7 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -118,8 +118,11 @@ class InsertFIFO(Transformation):
                         graph_modified = True
 
         if graph_modified is False:
-            # insert FIFO as first node
-            if graph.node[0].op_type != "StreamingFIFO":
+            # insert FIFO as first node, except when first node is DMA
+            if (
+                graph.node[0].op_type != "StreamingFIFO"
+                and graph.node[0].op_type != "IODMA"
+            ):
                 n = graph.node[0]
                 n_input = n.input[0]
                 n0 = getCustomOp(n)
@@ -153,8 +156,11 @@ class InsertFIFO(Transformation):
                 # set fifo output tensor as new input tensor of second node
                 n.input[0] = fifo_output_tensor.name
 
-            # insert FIFO as last node
-            if graph.node[-1].op_type != "StreamingFIFO":
+            # insert FIFO as last node, except when last node is DMA
+            if (
+                graph.node[-1].op_type != "StreamingFIFO"
+                and graph.node[0].op_type != "IODMA"
+            ):
                 n = graph.node[-1]
                 assert (
                     n.op_type != "TLastMarker"
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index e4368edea717f7499481e9b1c6ac20f7d5bb5f58..0cd7c0d4d41accf8cdba8adfaf4dbb00fc0cab7a 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -171,6 +171,7 @@ class InsertIODMA(Transformation):
                 # calculate width of stream output from DMA
                 pe = get_by_name(fc_node.attribute, "PE").i
                 simd = get_by_name(fc_node.attribute, "SIMD").i
+                assert (
+                    pe * simd == w_shape[0]
+                ), "Malformed weight matrix: w_shape[0] must equal PE*SIMD"
                 streamWidth = simd * pe * w_dtype.bitwidth()
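+                # (the FC layer consumes PE*SIMD weight elements per cycle)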
                 # make new buffer
                 fc_node_in = oh.make_tensor_value_info(
@@ -178,12 +179,13 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.value_info.append(fc_node_in)
                 model.set_tensor_datatype(fc_node_in.name, w_dtype)
+                model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
                 dma_node = oh.make_node(
                     "IODMA",
                     [fc_w_name],
                     [fc_node_in.name],
-                    numInputVectors=w_shape[:-1],
-                    NumChannels=w_shape[-1],
+                    numInputVectors=[w_shape[1]],
+                    NumChannels=w_shape[0],
                     dataType=str(w_dtype.name),
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 04dd437af27b9fbe18b2255c20a8e4acda03b3d0..bbb0e43fda464e919a7d8c9dcd25e08a49b33cec 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -38,7 +38,8 @@ import numpy as np
 
 class InsertTLastMarker(Transformation):
     """Ensure that the graph is started/terminated with a TLastMarker node, inserting
-    one if necessary. Use constructor args to determine type of TLastMarker to be inserted.
+    one if necessary.
+    Use constructor args to determine type of TLastMarker to be inserted.
     More information available on the TLastMarker documentation.
     """
 
@@ -90,41 +91,78 @@ class InsertTLastMarker(Transformation):
             graph_modified = True
         # if both is True, also insert marker on input
         if self.both:
-            graph_in_name = model.graph.input[0].name
-            first_node = model.find_consumer(graph_in_name)
-            if first_node.op_type != "TLastMarker" and not (
-                first_node.op_type == "IODMA"
-                and get_by_name(first_node.attribute, "direction").s.decode("UTF-8")
-                == "in"
-            ):
+            # detect and parse graph inputs
+            insert_idx = 0
+            graph_in_names = [x.name for x in model.graph.input]
+            for graph_in_name in graph_in_names:
+                first_node = model.find_consumers(graph_in_name)
+                # skip if no consumers (this may be the case for unused initializers)
+                # TODO: fix this with a cleanup transform
+                if first_node is None:
+                    continue
+                assert len(first_node) == 1, "Input fans out to multiple nodes"
+                first_node = first_node[0]
+                # several scenarios exclude the node:
+                # 1. node is a FC layer with internal weights, in which case
+                #    the input is in the list of graph inputs because it has an
+                #    initializer (TODO: fix this with a clean-up transform)
+                if (
+                    first_node.op_type == "StreamingFCLayer_Batch"
+                    and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8")
+                    != "external"
+                ):
+                    continue
+                # 2. node is either a TLastMarker or an input IODMA
+                if first_node.op_type != "TLastMarker" and not (
+                    first_node.op_type == "IODMA"
+                    and get_by_name(first_node.attribute, "direction").s.decode("UTF-8")
+                    == "in"
+                ):
 
-                custom_op = getCustomOp(first_node)
-                num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
-                stream_width = int(custom_op.get_instream_width())
-                in_shape = model.get_tensor_shape(graph_in_name)
-                in_dtype = model.get_tensor_datatype(graph_in_name)
-                elem_width = in_dtype.bitwidth()
-                # make new buffer
-                first_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
-                )
-                model.graph.value_info.append(first_node_in)
-                model.set_tensor_datatype(first_node_in.name, in_dtype)
-                # reroute final node output to first_node_in_name
-                first_node.input[0] = first_node_in.name
-                tlast_node = oh.make_node(
-                    "TLastMarker",
-                    [graph_in_name],
-                    [first_node_in.name],
-                    NumIters=num_iters,
-                    StreamWidth=stream_width,
-                    ElemWidth=elem_width,
-                    DynIters=(1 if self.dyniters else 0),
-                    Direction="in",
-                    Protocol=("external" if self.external else "internal"),
-                    domain="finn",
-                    backend="fpgadataflow",
-                )
-                model.graph.node.insert(0, tlast_node)
-                graph_modified = True
+                    custom_op = getCustomOp(first_node)
+                    num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
+                    inp_idx = list(first_node.input).index(graph_in_name)
+                    if inp_idx > 0:
+                        if (
+                            first_node.op_type == "StreamingFCLayer_Batch"
+                            and inp_idx == 1
+                        ):
+                            stream_width = int(custom_op.get_weightstream_width())
+                        elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1:
+                            stream_width = int(custom_op.get_instream_width())
+                        else:
+                            raise Exception("No method to determine stream width")
+                    else:
+                        stream_width = int(custom_op.get_instream_width())
+                    in_shape = model.get_tensor_shape(graph_in_name)
+                    in_dtype = model.get_tensor_datatype(graph_in_name)
+                    elem_width = in_dtype.bitwidth()
+                    # make new buffer
+                    first_node_in = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
+                    )
+                    model.graph.value_info.append(first_node_in)
+                    model.set_tensor_datatype(first_node_in.name, in_dtype)
+                    ini = model.get_initializer(graph_in_name)
+                    # copy initializer if it exists
+                    if ini is not None:
+                        model.set_initializer(first_node_in.name, ini)
+                    # reroute first node input to first_node_in
+                    first_node.input[inp_idx] = first_node_in.name
+                    tlast_node = oh.make_node(
+                        "TLastMarker",
+                        [graph_in_name],
+                        [first_node_in.name],
+                        NumIters=num_iters,
+                        StreamWidth=stream_width,
+                        ElemWidth=elem_width,
+                        DynIters=(1 if self.dyniters else 0),
+                        Direction="in",
+                        Protocol=("external" if self.external else "internal"),
+                        domain="finn",
+                        backend="fpgadataflow",
+                    )
+                    model.graph.node.insert(insert_idx, tlast_node)
+                    graph_modified = True
+                    insert_idx += 1
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 18d3db18da089a5dda4dbb6d97180dd4a20613b5..1e45a65720604144f67245b98dcbe3f6dc8363f5 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -28,6 +28,7 @@
 
 import os
 import shutil
+import warnings
 
 from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
@@ -53,7 +54,7 @@ class MakePYNQDriver(Transformation):
     def apply(self, model):
         vivado_pynq_proj = model.get_metadata_prop("vivado_pynq_proj")
         if vivado_pynq_proj is None or (not os.path.isdir(vivado_pynq_proj)):
-            raise Exception("No PYNQ project found, apply MakePYNQProject first.")
+            warnings.warn("No PYNQ project found, apply MakePYNQProject first.")
 
         # create a temporary folder for the generated driver
         pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
@@ -108,7 +109,12 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
 
         # clock settings for driver
-        clk_ns = float(model.get_metadata_prop("clk_ns"))
+        clk_ns = model.get_metadata_prop("clk_ns")
+        # default to 10ns / 100 MHz if property not set
+        if clk_ns is None:
+            clk_ns = 10.0
+        else:
+            clk_ns = float(clk_ns)
         fclk_mhz = 1 / (clk_ns * 0.001)
         # TODO change according to PYNQ board?
         driver = driver.replace("$CLK_NAME$", "fclk0_mhz")
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index 91f6bd2c4ab19c736fcf21322979cac17a163f24..a874d7a7c702e1b3e9125fc031aa65dc287a407d 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -67,6 +67,16 @@ class MakePYNQProject(Transformation):
             raise Exception(
                 "No vlnv for stitched IP found, apply CreateStitchedIP first."
             )
+        vivado_stitch_ifnames = model.get_metadata_prop("vivado_stitch_ifnames")
+        if vivado_stitch_ifnames is None:
+            raise Exception("No IF name metadata found, apply CreateStitchedIP first.")
+        vivado_stitch_ifnames = eval(vivado_stitch_ifnames)
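+        # (CreateStitchedIP stores str(intf_names_dict) in metadata, so
+        # eval() reconstructs the original dict)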
+        # recover interface names from dict
+        self.clk_name = vivado_stitch_ifnames["clk"][0]
+        self.rst_name = vivado_stitch_ifnames["rst"][0]
+        self.s_axis_if_name = vivado_stitch_ifnames["s_axis"][0]
+        self.m_axis_if_name = vivado_stitch_ifnames["m_axis"][0]
+        self.s_aximm_if_name = vivado_stitch_ifnames["axilite"][0]
 
         # collect list of all IP dirs
         ip_dirs = ["list"]
@@ -105,11 +115,11 @@ class MakePYNQProject(Transformation):
         multiple of 8."""
         in_bytes = i_bits_per_cycle_padded / 8
         out_bytes = o_bits_per_cycle_padded / 8
-        in_if_name = "in0_V_V_0"
-        out_if_name = "out_r_0"
-        clk_name = "ap_clk_0"
-        nrst_name = "ap_rst_n_0"
-        axi_lite_if_name = "s_axi_control_0"
+        in_if_name = self.s_axis_if_name
+        out_if_name = self.m_axis_if_name
+        clk_name = self.clk_name
+        nrst_name = self.rst_name
+        axi_lite_if_name = self.axilite_if_name
         vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="")
 
         # create a temporary folder for the project
diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py
index 1d49970c819961d1794cc89e998108639ca15593..8fd7e4724ef7f255b1435d5ab5e680d155d39487 100644
--- a/src/finn/transformation/fpgadataflow/synth_ooc.py
+++ b/src/finn/transformation/fpgadataflow/synth_ooc.py
@@ -37,7 +37,7 @@ from finn.util.basic import make_build_dir
 class SynthOutOfContext(Transformation):
     """Run out-of-context Vivado synthesis on a stitched IP design."""
 
-    def __init__(self, part, clk_period_ns, clk_name="ap_clk_0"):
+    def __init__(self, part, clk_period_ns, clk_name="ap_clk"):
         super().__init__()
         self.part = part
         self.clk_period_ns = clk_period_ns
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index e4da964552d15543ea93df4fbf01ddab7eb7f6f2..4b91b5c33f032ae1664163ab0ae1cacdf8b91826 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -104,9 +104,10 @@ from finn.core.datatype import DataType
 from pynq.ps import Clocks
 
 class FINNAccelDriver():
-    def __init__(self, N, bitfile):
+    def __init__(self, N, bitfile, platform="zynq"):
         \"\"\"Instantiate the FINN accelerator driver.
-        Gets batchsize (N) as integer and path to bitfile as string.\"\"\"
+        Gets batchsize (N) as integer, path to bitfile as string and
+        target platform ("zynq" or "alveo") as string.\"\"\"
+        self.platform = platform
         self.N = N
         # input FINN DataType
         self.idt = $INPUT_FINN_DATATYPE$
@@ -119,21 +120,29 @@ class FINNAccelDriver():
         self.oshape_folded = $OUTPUT_SHAPE_FOLDED$
         self.ishape_packed = $INPUT_SHAPE_PACKED$   # datatype np.uint8
         self.oshape_packed = $OUTPUT_SHAPE_PACKED$  # datatype np.uint8
-        # clock frequency
-        self.fclk_mhz = $CLOCK_FREQ_MHZ$
         # load bitfile and set up accelerator
         self.ol = Overlay(bitfile)
-        # set the clock frequency as specified by user during transformations
-        Clocks.$CLK_NAME$ = self.fclk_mhz
-        self.dma = self.ol.axi_dma_0
-        self.ctrl_regs = self.ol.resize_accel_0
         # neuron folding factor of output = iterations per sample
         self.itersPerSample = self.oshape_packed[-2]
-        # AXI lite register offset for number of iterations
-        # used by TLastMarker to signal end of transmission for AXI CDMA
-        self.REG_OFFSET_NUM_ITERS = 0x10
-        # set up TLastMarker with correct num. samples
-        self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample)
+        if self.platform == "zynq":
+            # clock frequency
+            self.fclk_mhz = $CLOCK_FREQ_MHZ$
+            # set the clock frequency as specified by user during transformations
+            if self.fclk_mhz > 0:
+                Clocks.$CLK_NAME$ = self.fclk_mhz
+            self.dma = self.ol.axi_dma_0
+            self.ctrl_regs = self.ol.resize_accel_0
+
+            # AXI lite register offset for number of iterations
+            # used by TLastMarker to signal end of transmission for AXI CDMA
+            self.REG_OFFSET_NUM_ITERS = 0x10
+            # set up TLastMarker with correct num. samples
+            self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N*self.itersPerSample)
+        elif self.platform == "alveo":
+            self.idma = self.ol.idma0
+            self.odma = self.ol.odma0
+        else:
+            raise ValueError("Supported platforms are zynq and alveo")
 
         # allocate a PYNQ buffer for the packed input and buffer
         self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
@@ -176,19 +185,29 @@ class FINNAccelDriver():
         np.copyto(self.ibuf_packed_device, data)
 
     def execute(self):
-        \"\"\"Executes accelerator by setting up the DMA and
-        waiting until all transfers complete. Uses only member variables and
+        \"\"\"Executes accelerator by setting up the DMA(s) and
+        waiting until all transfers/calls complete. Uses only member variables and
         returns nothing.\"\"\"
-        dma = self.dma
-        dma.sendchannel.transfer(self.ibuf_packed_device)
-        dma.recvchannel.transfer(self.obuf_packed_device)
-        dma.sendchannel.wait()
-        dma.recvchannel.wait()
+        if self.platform == "zynq":
+            dma = self.dma
+            dma.sendchannel.transfer(self.ibuf_packed_device)
+            dma.recvchannel.transfer(self.obuf_packed_device)
+            dma.sendchannel.wait()
+            dma.recvchannel.wait()
+        else:
+            self.ibuf_packed_device.sync_to_device()
+            self.idma.start(self.ibuf_packed_device, self.N)
+            self.odma.start(self.obuf_packed_device, self.N)
+            self.idma.wait()
+            self.odma.wait()
+            self.obuf_packed_device.sync_from_device()
+
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name')
     parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute")
+    parser.add_argument('--platform', help='Target platform, zynq or alveo', default="zynq")
     parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1)
     parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
     parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy")
@@ -196,13 +215,14 @@ if __name__ == "__main__":
     # parse arguments
     args = parser.parse_args()
     exec_mode = args.exec_mode
+    platform = args.platform
     N = args.batchsize
     bitfile = args.bitfile
     inputfile = args.inputfile
     outputfile = args.outputfile
 
     # instantiate FINN accelerator driver and pass batchsize and bitfile
-    finnDriver = FINNAccelDriver(N, bitfile)
+    finnDriver = FINNAccelDriver(N, bitfile, platform)
 
     # for the remote execution the data from the input npy file has to be loaded,
     # packed and copied to the PYNQ buffer
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae529f2f4a165a732627befea0675073bc490996
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -0,0 +1,309 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import subprocess
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation import Transformation
+from finn.custom_op.registry import getCustomOp
+
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.util.basic import make_build_dir
+from finn.transformation.infer_data_layouts import InferDataLayouts
+
+
+def _check_vitis_envvars():
+    assert "VITIS_PATH" in os.environ, "VITIS_PATH must be set for Vitis"
+    assert (
+        "PLATFORM_REPO_PATHS" in os.environ
+    ), "PLATFORM_REPO_PATHS must be set for Vitis"
+    assert (
+        "XILINX_XRT" in os.environ
+    ), "XILINX_XRT must be set for Vitis, ensure the XRT env is sourced"
+
+class CreateVitisXO(Transformation):
+    """Create a Vitis object file from a stitched FINN ip.
+
+    Outcome if successful: sets the vitis_xo attribute in the ONNX
+    ModelProto's metadata_props field with the name of the object file as value.
+    The object file can be found under the ip subdirectory.
+    """
+
+    def __init__(self, ip_name="finn_design"):
+        super().__init__()
+        self.ip_name = ip_name
+
+    def apply(self, model):
+        _check_vitis_envvars()
+        vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
+        stitched_ip_dir = vivado_proj_dir + "/ip"
+        args_string = []
+        m_axis_idx = 0
+        s_axis_idx = 0
+        # NOTE: this assumes the graph is Vitis-compatible: max one AXI-Lite interface
+        # developed from instructions in UG1393 (v2019.2) and package_xo documentation
+        # package_xo is responsible for generating the kernel xml
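+        # ('&lt;' in the argument strings below is deliberate XML escaping of
+        # '<', since these args are embedded in the generated kernel xml)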
+        for node in model.graph.node:
+            node_inst = getCustomOp(node)
+            arg_id = 0
+            if node.op_type == "TLastMarker":
+                stream_width = node_inst.get_nodeattr("StreamWidth")
+                # add a stream input or output port, based on direction
+                if node_inst.get_nodeattr("Direction") == "in":
+                    args_string.append(
+                        "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
+                        % (str(arg_id), s_axis_idx, str(stream_width))
+                    )
+                    s_axis_idx += 1
+                else:
+                    args_string.append(
+                        "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
+                        % (str(arg_id), m_axis_idx, str(stream_width))
+                    )
+                    m_axis_idx += 1
+                arg_id += 1
+                # if dynamic, add an AXI-Lite port and a count parameter
+                if node_inst.get_nodeattr("DynIters") == 1:
+                    args_string.append(
+                        "{numReps:0:%s:s_axi_control:0x4:0x10:uint:0}" % str(arg_id)
+                    )
+                    arg_id += 1
+            elif node.op_type == "IODMA":
+                port_width = node_inst.get_nodeattr("intfWidth")
+                # add an address parameter
+                # add a count parameter
+                args_string.append(
+                    "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint&lt;%s>*:0}"
+                    % (str(arg_id), str(port_width))
+                )
+                arg_id += 1
+                args_string.append(
+                    "{numReps:0:%s:s_axi_control:0x4:0x1C:uint:0}" % str(arg_id)
+                )
+                arg_id += 1
+
+        # save kernel xml then run package_xo
+        xo_name = self.ip_name + ".xo"
+        xo_path = vivado_proj_dir + "/" + xo_name
+        model.set_metadata_prop("vitis_xo", xo_path)
+
+        # generate the package_xo command in a tcl script
+        package_xo_string = (
+            "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s"
+            % (xo_path, self.ip_name, stitched_ip_dir)
+        )
+        for arg in args_string:
+            package_xo_string += " -kernel_xml_args " + arg
+        with open(vivado_proj_dir + "/gen_xo.tcl", "w") as f:
+            f.write(package_xo_string)
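+        # gen_xo.tcl now holds a single package_xo call, e.g. (illustrative):
+        # package_xo -force -xo_path <proj>/finn_design.xo -kernel_name finn_design
+        #   -ip_directory <proj>/ip -kernel_xml_args {in:4:0:s_axis_0:0x0:0x0:ap_uint<64>:0} ...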
+
+        # create a shell script and call Vivado
+        package_xo_sh = vivado_proj_dir + "/gen_xo.sh"
+        working_dir = os.environ["PWD"]
+        with open(package_xo_sh, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(vivado_proj_dir))
+            f.write("vivado -mode batch -source gen_xo.tcl\n")
+            f.write("cd {}\n".format(working_dir))
+        bash_command = ["bash", package_xo_sh]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        assert os.path.isfile(xo_path), "Vitis .xo file not created, check logs under %s" % vivado_proj_dir
+        return (model, False)
+
+
+class VitisLink(Transformation):
+    """Create an XCLBIN with Vitis.
+
+    Outcome if successful: sets the vitis_xclbin attribute in the ONNX
+    ModelProto's metadata_props field to the full path of the generated xclbin.
+    """
+
+    def __init__(self, platform, f_mhz=200):
+        super().__init__()
+        self.platform = platform
+        self.f_mhz = f_mhz
+
+    def apply(self, model):
+        _check_vitis_envvars()
+        # create a config file and empty list of xo files
+        config = ["[connectivity]"]
+        object_files = []
+        idma_idx = 0
+        odma_idx = 0
+        instance_names = {}
+        for node in model.graph.node:
+            assert node.op_type == "StreamingDataflowPartition", "Invalid link graph"
+            sdp_node = getCustomOp(node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            kernel_model = ModelWrapper(dataflow_model_filename)
+            kernel_xo = kernel_model.get_metadata_prop("vitis_xo")
+            object_files.append(kernel_xo)
+            # gather info on connectivity:
+            # assume each node connected to a graph input/output is a DMA,
+            # which has axis, aximm and axilite interfaces;
+            # everything else is axis-only.
+            # assume only one connection from each IP to the next,
+            # all aximm interfaces allocated to DDR[0],
+            # and all kernels allocated to SLR0
+            producer = model.find_producer(node.input[0])
+            consumer = model.find_consumers(node.output[0])
+            # define kernel instances
+            # name kernels connected to graph inputs as idmaxx,
+            # kernels connected to graph outputs as odmaxx
+            if producer is None:
+                instance_names[node.name] = "idma" + str(idma_idx)
+                config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+                idma_idx += 1
+            elif consumer is None:
+                instance_names[node.name] = "odma" + str(odma_idx)
+                config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+                odma_idx += 1
+            else:
+                instance_names[node.name] = node.name
+                config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+            # assign SLRs
+            config.append("slr=%s:SLR0" % instance_names[node.name])
+            # assign memory banks
+            if producer is None or consumer is None:
+                config.append(
+                    "sp=%s.m_axi_gmem0:DDR[%d]" % (instance_names[node.name], 0)
+                )
+            # connect streams
+            if producer is not None:
+                for i in range(len(node.input)):
+                    producer = model.find_producer(node.input[i])
+                    if producer is not None:
+                        j = list(producer.output).index(node.input[i])
+                        config.append(
+                            "stream_connect=%s.m_axis_%d:%s.s_axis_%d"
+                            % (
+                                instance_names[producer.name],
+                                j,
+                                instance_names[node.name],
+                                i,
+                            )
+                        )
+
+        # create a temporary folder for the project
+        link_dir = make_build_dir(prefix="vitis_link_proj_")
+        model.set_metadata_prop("vitis_link_proj", link_dir)
+
+        config = "\n".join(config) + "\n"
+        with open(link_dir + "/config.txt", "w") as f:
+            f.write(config)
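+        # illustrative config.txt for a single idma0 -> odma0 design:
+        # [connectivity]
+        # nk=StreamingDataflowPartition_0:1:idma0
+        # slr=idma0:SLR0
+        # sp=idma0.m_axi_gmem0:DDR[0]
+        # stream_connect=idma0.m_axis_0:odma0.s_axis_0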
+
+        # create a shell script and call Vitis
+        script = link_dir + "/run_vitis_link.sh"
+        working_dir = os.environ["PWD"]
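+        # the generated script invokes v++ along these lines (illustrative):
+        # v++ -t hw --platform xilinx_u250_xdma_201830_2 --link finn_design.xo \
+        #   --kernel_frequency 200 --config config.txt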
+        with open(script, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(link_dir))
+            f.write(
+                "v++ -t hw --platform %s --link %s"
+                " --kernel_frequency %d --config config.txt\n"
+                % (self.platform, " ".join(object_files), self.f_mhz)
+            )
+            f.write("cd {}\n".format(working_dir))
+        bash_command = ["bash", script]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        # TODO rename xclbin appropriately here?
+        xclbin = link_dir + "/a.xclbin"
+        assert os.path.isfile(xclbin), "Vitis .xclbin file not created, check logs under %s" % link_dir
+        model.set_metadata_prop("vitis_xclbin", xclbin)
+        return (model, False)
+
+
+class VitisBuild(Transformation):
+    """Best-effort attempt at building the accelerator with Vitis."""
+
+    def __init__(self, fpga_part, period_ns, platform):
+        super().__init__()
+        self.fpga_part = fpga_part
+        self.period_ns = period_ns
+        self.platform = platform
+
+    def apply(self, model):
+        _check_vitis_envvars()
+        # first infer layouts
+        model = model.transform(InferDataLayouts())
+        # prepare at global level, then break up into kernels
+        prep_transforms = [
+            MakePYNQDriver(),
+            InsertIODMA(512),
+            InsertDWC(),
+            Floorplan(),
+            CreateDataflowPartition(),
+        ]
+        for trn in prep_transforms:
+            model = model.transform(trn)
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(GiveReadableTensorNames())
+        # Build each kernel individually
+        sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition")
+        for sdp_node in sdp_nodes:
+            sdp_node = getCustomOp(sdp_node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            kernel_model = ModelWrapper(dataflow_model_filename)
+            kernel_model = kernel_model.transform(InsertFIFO())
+            kernel_model = kernel_model.transform(
+                InsertTLastMarker(both=True, external=False, dynamic=False)
+            )
+            kernel_model = kernel_model.transform(GiveUniqueNodeNames())
+            kernel_model.save(dataflow_model_filename)
+            kernel_model = kernel_model.transform(
+                PrepareIP(self.fpga_part, self.period_ns)
+            )
+            kernel_model = kernel_model.transform(HLSSynthIP())
+            kernel_model = kernel_model.transform(ReplaceVerilogRelPaths())
+            kernel_model = kernel_model.transform(
+                CreateStitchedIP(
+                    self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True
+                )
+            )
+            kernel_model = kernel_model.transform(
+                CreateVitisXO(sdp_node.onnx_node.name)
+            )
+            kernel_model.save(dataflow_model_filename)
+        # Assemble design from kernels
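+        # round(1000 / period_ns) converts the ns clock period into a kernel
+        # frequency in MHz for VitisLink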
+        model = model.transform(VitisLink(self.platform, round(1000 / self.period_ns)))
+
+        return (model, False)
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
index 2ddaf4f840f449d3f5ec5cb83eaf461d624eb7a2..9943d371dad79a977b61810bcddafdcba505d6cc 100644
--- a/src/finn/transformation/move_reshape.py
+++ b/src/finn/transformation/move_reshape.py
@@ -36,5 +36,15 @@ class RemoveCNVtoFCFlatten(Transformation):
                             graph_modified = True
                             consumer.input[0] = n.input[0]
                             graph.node.remove(n)
+                    elif producer.op_type == "Transpose":
+                        transp_node = producer
+                        producer = model.find_producer(transp_node.input[0])
+                        if _is_fpgadataflow_node(producer) is True:
+                            consumer = model.find_consumer(n.output[0])
+                            if _is_fpgadataflow_node(consumer) is True:
+                                graph_modified = True
+                                consumer.input[0] = transp_node.input[0]
+                                graph.node.remove(n)
+                                graph.node.remove(transp_node)
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 2b03532ce3ba7d5159e5ae57e61c2af9c8c37fce..b47f269dd6f2671c3d98c9316954483c0e72f14f 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -502,6 +502,73 @@ class MoveLinearPastEltwiseAdd(Transformation):
         return (model, graph_modified)
 
 
+class MoveScalarLinearPastInvariants(Transformation):
+    """Move scalar linear operations (mul, add) past functions which are invariant
+       to them. Specifically, matches and transforms the following patterns:
+       f(x*C) -> f(x) * C
+       f(x+C) -> f(x) + C
+       where x is a dynamic input, C is a constant tensor.
+       Known f which obey this property are: Reshape, Flatten, Transpose,
+       GlobalAveragePool
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        nodes = [n for n in graph.node]
+        for n in nodes:
+            node_ind += 1
+            if (
+                n.op_type == "GlobalAveragePool"
+                or n.op_type == "Reshape"
+                or n.op_type == "Transpose"
+                or n.op_type == "Flatten"
+            ):
+                in0 = n.input[0]
+                if in0 is None:
+                    continue
+                # find and check producer on our input
+                prod0 = model.find_producer(in0)
+                if prod0 is None:
+                    continue
+
+                if prod0.op_type == "Mul" or prod0.op_type == "Add":
+                    # check if second input of producer is an initializer
+                    init0 = model.get_initializer(prod0.input[1])
+                    # if the initializer is None (dynamic input), skip
+                    if init0 is None:
+                        continue
+                    # if initializer is not scalar, skip
+                    if np.prod(init0.shape) != 1:
+                        continue
+                    # move prod0 past n: n takes prod0's old input, and prod0
+                    # takes n's old output
+                    old_prod0_in = prod0.input[0]
+                    old_prod0_out = prod0.output[0]
+                    scalar_op_odt = model.get_tensor_datatype(old_prod0_out)
+                    old_n_out = n.output[0]
+                    in_shape = model.get_tensor_shape(n.input[0])
+                    out_shape = model.get_tensor_shape(n.output[0])
+                    n.input[0] = old_prod0_in
+                    n.output[0] = old_prod0_out
+                    prod0.input[0] = old_prod0_out
+                    prod0.output[0] = old_n_out
+                    model.set_tensor_shape(n.input[0], in_shape)
+                    model.set_tensor_shape(n.output[0], out_shape)
+                    model.set_tensor_shape(prod0.output[0], out_shape)
+                    model.set_tensor_datatype(prod0.output[0], scalar_op_odt)
+                    model.set_tensor_datatype(n.output[0], DataType.FLOAT32)
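+                    # detach the scalar op and re-insert it at this position
+                    # in the node list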
+                    graph.node.remove(prod0)
+                    graph.node.insert(node_ind - 1, prod0)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class MakeMaxPoolNHWC(Transformation):
     """Convert (MaxPool, NHWCTranpose) into (MaxPoolNHWC)."""
 
@@ -685,6 +752,7 @@ class MoveMaxPoolPastMultiThreshold(Transformation):
         model = model.transform(InferShapes())
         return (model, graph_modified)
 
+
 class MoveFlattenPastTopK(Transformation):
     """Move flatten node past a succeeding topk node, if the "axis" attribute in topk
     is set to -1 and the data layout before the flatten is NHWC with H=W=1"""
@@ -745,6 +813,7 @@ class MoveFlattenPastTopK(Transformation):
         model = model.transform(InferShapes())
         return (model, graph_modified)
 
+
 class MoveFlattenPastAffine(Transformation):
     """Moves a node that implements a (1, -1) reshape past a MatMul, Mul or Add node."""
 
@@ -831,9 +900,10 @@ class MoveFlattenPastAffine(Transformation):
 
         model = model.transform(InferShapes())
         model = model.transform(InferDataTypes())
-        model = model.transform(InferDataLayouts())                  
+        model = model.transform(InferDataLayouts())
         return (model, graph_modified)
-      
+
+
 class MoveTransposePastScalarMul(Transformation):
     """Moves a Transpose node past a scalar Mul node"""
 
@@ -895,4 +965,3 @@ class MoveTransposePastScalarMul(Transformation):
             model = model.transform(InferDataLayouts())
             model = model.transform(InferShapes())
         return (model, graph_modified)
-
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 4a8277e08d3fc21e0b20668edf2ecad947b36647..91ff811069369383099f5ae5aebf3228fbdbaae5 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -51,6 +51,19 @@ pynq_native_port_width["Pynq-Z2"] = 64
 pynq_native_port_width["Ultra96"] = 128
 pynq_native_port_width["ZCU104"] = 128
 
+# Alveo device and platform mappings
+alveo_part_map = dict()
+alveo_part_map["U50"] = "xcu50-fsvh2104-2L-e"
+alveo_part_map["U200"] = "xcu200-fsgd2104-2-e"
+alveo_part_map["U250"] = "xcu250-figd2104-2L-e"
+alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e"
+
+alveo_default_platform = dict()
+alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3"
+alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2"
+alveo_default_platform["U250"] = "xilinx_u250_xdma_201830_2"
+alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3"
+
 
 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index 22c356a5869b25fcc7ae3ef0164ed61b53ef232c..188f20e22fc52e435f8ba0e7d76dff223e084d69 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -23,6 +23,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.registry import getCustomOp
 
 # conv_config  kernel_size,stride, pad
 
@@ -110,3 +111,8 @@ def test_convert_to_hls_conv_layer(conv_config, exec_mode):
     assert oxe.compare_execution(model, new_model, inp_dict)
     if kernel_size == 1 and stride > 1 and pad == 0:
         assert new_model.graph.node[1].op_type == "DownSampler"
+
+    if pad == 1:
+        padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        padding_inst = getCustomOp(padding_node)
+        assert padding_inst.get_nodeattr("SIMD") == in_chn
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d861929f3d421c431a27ccac5d513938aa7d726
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -0,0 +1,232 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import numpy as np
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    SortGraph,
+)
+from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.util.basic import gen_finn_dt_tensor
+from finn.util.test import soft_verify_topk
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+from finn.transformation.insert_topk import InsertTopK
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.streamline.absorb import (
+    AbsorbScalarMulIntoTopK,
+    AbsorbConsecutiveTransposes,
+)
+from finn.transformation.streamline.collapse_repeated import (
+    CollapseRepeatedMul,
+    CollapseRepeatedAdd,
+)
+from finn.transformation.streamline.reorder import MoveAddPastMul
+
+import pytest
+
+export_onnx_path = "test_output_synthetic.onnx"
+
+# construct a synthetic graph to test:
+# topk insertion, topk conversion to hls, add conversion to hls
+# the whole graph reduces to a scaled channelwise sum of the input
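+# make_model builds the following chain (constants from the initializers below):
+# inp -> Add(0) -> [Add(7), Add(8)] -> [Mul(2), Mul(2)] -> Add
+#     -> GlobalAveragePool -> Reshape(1, -1)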
+
+
+def make_model(ch, ifmdim):
+    shape = [1, ch, ifmdim, ifmdim]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    inp1_add0_ct = helper.make_tensor_value_info("inp1_add0_ct", TensorProto.FLOAT, [1])
+    inp1_add = helper.make_tensor_value_info("inp1_add", TensorProto.FLOAT, shape)
+    inp1_add_ct = helper.make_tensor_value_info("inp1_add_ct", TensorProto.FLOAT, [1])
+    inp2_add = helper.make_tensor_value_info("inp2_add", TensorProto.FLOAT, shape)
+    inp2_add_ct = helper.make_tensor_value_info("inp2_add_ct", TensorProto.FLOAT, [1])
+    inp1_mul = helper.make_tensor_value_info("inp1_mul", TensorProto.FLOAT, shape)
+    inp1_mul_ct = helper.make_tensor_value_info("inp1_mul_ct", TensorProto.FLOAT, [1])
+    inp2_mul = helper.make_tensor_value_info("inp2_mul", TensorProto.FLOAT, shape)
+    inp2_mul_ct = helper.make_tensor_value_info("inp2_mul_ct", TensorProto.FLOAT, [1])
+    eltwise_add = helper.make_tensor_value_info("eltwise_add", TensorProto.FLOAT, shape)
+    pool = helper.make_tensor_value_info("pool", TensorProto.FLOAT, [1, ch, 1, 1])
+    reshape_ct = helper.make_tensor_value_info("reshape_ct", TensorProto.INT64, [2])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch])
+
+    add0_node = helper.make_node("Add", [inp.name, inp1_add0_ct.name], ["out_add0"])
+    add1_node = helper.make_node("Add", ["out_add0", inp1_add_ct.name], [inp1_add.name])
+    add2_node = helper.make_node("Add", ["out_add0", inp2_add_ct.name], [inp2_add.name])
+    mul1_node = helper.make_node(
+        "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]
+    )
+    mul2_node = helper.make_node(
+        "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]
+    )
+    eltwise_add_node = helper.make_node(
+        "Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name]
+    )
+    globalavgpool_node = helper.make_node(
+        "GlobalAveragePool", [eltwise_add.name], [pool.name]
+    )
+    reshape_node = helper.make_node(
+        "Reshape", [pool.name, reshape_ct.name], [outp.name]
+    )
+
+    graph = helper.make_graph(
+        nodes=[
+            add0_node,
+            add1_node,
+            add2_node,
+            mul1_node,
+            mul2_node,
+            eltwise_add_node,
+            globalavgpool_node,
+            reshape_node,
+        ],
+        name="graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="add-model")
+    model = ModelWrapper(model)
+
+    # set initializers for scalar add/mul nodes
+    model.set_initializer(add0_node.input[1], np.array([0.0]))
+    model.set_initializer(add1_node.input[1], np.array([7.0]))
+    model.set_initializer(add2_node.input[1], np.array([8.0]))
+    model.set_initializer(mul1_node.input[1], np.array([2.0]))
+    model.set_initializer(mul2_node.input[1], np.array([2.0]))
+    model.set_initializer(reshape_node.input[1], np.array([1, -1]))
+
+    return model
+
+
+# data types
+@pytest.mark.parametrize("idt", [DataType.UINT2])
+# channels
+@pytest.mark.parametrize("ch", [16])
+# ifmdim
+@pytest.mark.parametrize("ifmdim", [5])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt):
+    model = make_model(ch, ifmdim)
+    model.save(export_onnx_path)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataLayouts())
+    # model.save("golden.onnx")
+    # generate test vectors of correct shape
+    if ifmdim == -1:
+        input_tensor_shape = (1, ch)
+    else:
+        input_tensor_shape = (1, ch, ifmdim, ifmdim)
+
+    x = gen_finn_dt_tensor(idt, input_tensor_shape)
+
+    # generate expected value from streamlined net
+    input_dict = {model.graph.input[0].name: x}
+
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    produced_sum = output_dict[model.graph.output[0].name]
+    # the last node is the final Reshape, so there is no channelwise
+    # scaling left to fold into the expected value
+    chw_mul = 1
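+    # the two branches compute 2*(x+7) and 2*(x+8); their elementwise sum is
+    # 2*(2x+15) and GlobalAveragePool divides the spatial sum by ifmdim*ifmdim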
+    expected_sum = chw_mul * np.sum(2 * (2 * x + 15.0), axis=(2, 3)) / (ifmdim * ifmdim)
+    assert (produced_sum.flatten() == expected_sum.flatten()).all()
+
+    model = model.transform(InferDataLayouts())
+
+    # convert to hls
+    model.set_tensor_datatype(model.graph.input[0].name, idt)
+    # extra streamlining
+    model = model.transform(MoveScalarLinearPastInvariants())
+    model = model.transform(MoveAddPastMul())
+    model = model.transform(CollapseRepeatedMul())
+    model = model.transform(CollapseRepeatedAdd())
+    # insert top-k node, which should absorb linear ops before it
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(InferDataTypes())
+
+    model = model.transform(to_hls.InferChannelwiseLinearLayer())
+    model = model.transform(to_hls.InferAddStreamsLayer())
+    model = model.transform(to_hls.InferGlobalAccPoolLayer())
+    model = model.transform(MoveScalarLinearPastInvariants())
+    model = model.transform(InsertTopK())
+    model = model.transform(AbsorbScalarMulIntoTopK())
+    model = model.transform(InferDataTypes())
+    model = model.transform(to_hls.InferLabelSelectLayer())
+    model = model.transform(AbsorbConsecutiveTransposes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(to_hls.InferLabelSelectLayer())
+    model = model.transform(to_hls.InferDuplicateStreamsLayer())
+
+    model = model.transform(SortGraph())
+
+    # model.save("golden_hls.onnx")
+    # check topology status
+
+    finn_nodes = model.get_finn_nodes()
+    assert len(finn_nodes) == 9
+    add_nodes = model.get_nodes_by_op_type("AddStreams_Batch")
+    assert len(add_nodes) == 1
+    pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_Batch")
+    assert len(pool_nodes) == 1
+    label_nodes = model.get_nodes_by_op_type("LabelSelect_Batch")
+    assert len(label_nodes) == 1
+    channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_Batch")
+    assert len(channelwise_nodes) == 5
+    dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_Batch")
+    assert len(dup_nodes) == 1
+
+    model = model.transform(PrepareCppSim())
+    model = model.transform(CompileCppSim())
+    model = model.transform(SetExecMode("cppsim"))
+
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    produced_topk_hls = output_dict[model.graph.output[0].name]
+    topk_input = output_dict[model.graph.node[-1].input[0]]
+    assert soft_verify_topk(topk_input, produced_topk_hls, 5)
+
+    os.remove(export_onnx_path)
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index c9f78dcea1a1ce364d0657ad64de7d440d41b822..aba973051cb14e3e428e4de72a57924884c831de 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -77,27 +77,63 @@ def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, i
     return model
 
 
+def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt):
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]
+    )
+
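+    # QuantAvgPool2d is a FINN custom op; ibits/obits give the input/output
+    # bit widths used for internal accumulation and truncation of the result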
+    mp_node = helper.make_node(
+        "QuantAvgPool2d",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        stride=stride,
+        kernel=k,
+        ibits=idt.bitwidth(),
+        obits=odt.bitwidth(),
+        signed=1 if idt.signed() else 0,
+        data_layout="NCHW",
+    )
+    graph = helper.make_graph(
+        nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="mp-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+    model = model.transform(InferShapes())
+
+    return model
+
+
 def prepare_inputs(input_tensor):
     return {"inp": input_tensor}
 
 
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4])
+@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4, DataType.INT8])
+# output datatype
+@pytest.mark.parametrize("odt", [DataType.UINT4, DataType.INT4])
 # pool configuration:                   ( k,stride, pad, ifm_dim )
-@pytest.mark.parametrize(
-    "pool_config", [(3, 2, 0, 5), (3, 2, 1, 5), (2, 2, 0, 8), (5, 2, 2, 7)]
-)
+@pytest.mark.parametrize("pool_config", [(7, 7, 0, 7), (3, 2, 1, 5)])
 # input channels
-@pytest.mark.parametrize("ifm_ch", [1, 4, 20])
+@pytest.mark.parametrize("ifm_ch", [1, 4])
 # number of out channel computed in parallel
-@pytest.mark.parametrize("pe", [1, 4, 20])
+@pytest.mark.parametrize("pe", [1, 2, 4])
+# pool type
+@pytest.mark.parametrize("op_type", ["QuantAvgPool2d", "MaxPool"])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
-# pool type
-@pytest.mark.parametrize("op_type", ["MaxPool"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_type):
+def test_convert_to_hls_pool_batch(
+    idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode
+):
     k, stride, pad, ifm_dim = pool_config
 
     if ifm_ch % pe != 0:
@@ -113,9 +149,25 @@ def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_t
     # prepare input data
     input_dict = prepare_inputs(x)
     if op_type == "MaxPool":
+        # if idt.signed():
+        #     pytest.skip("""No support for signed input (see accu initialization
+        #         in Pool_batch HLSLIB function). Skipping""")
+
+        if idt != odt:
+            pytest.skip("Skipping Maxpool with idt != odt")
+
         model = make_single_maxpool_modelwrapper(
             k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt
         )
+    elif op_type == "QuantAvgPool2d":
+        if pad != 0:
+            pytest.skip("No padding support for QuantAvgPool2d. Skipping")
+
+        if idt.signed() != odt.signed():
+            pytest.skip("Skipping QuantAvgPool2d with idt.signed() != odt.signed()")
+        model = make_single_quantavpool_modelwrapper(
+            k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt
+        )
     else:
         assert False, "{} is not a supported op_type".format(op_type)
 
@@ -151,7 +203,7 @@ def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_t
     # execute new_model
     y_produced = oxe.execute_onnx(new_model, input_dict)["outp"]
     assert (y_produced == y_expected).all()
-    if stride != k:
+    if stride <= k:
         if pad == 0 or ifm_ch == pe:
             assert len(new_model.graph.node) == 4
         else:
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 4fb84be59333ef0e696204c9064fcf77e35b5d9b..59ac1c09f4fe338ef03a8166c63b9d4b29bbc08e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -33,6 +33,8 @@ from onnx import TensorProto, helper
 import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -72,6 +74,9 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt):
 
     model.set_tensor_datatype("inp", idt)
 
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
     return model
 
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index fc5cdb7745945bee99564ba9ab19423a66d8e035..251fc806c3b0f8a52183b8003db6d930351b0ace 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -134,7 +134,7 @@ def prepare_inputs(input_tensor, idt, wdt):
 
 
 # mem_mode: const or decoupled
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
@@ -221,7 +221,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
 
 # mem_mode: const or decoupled
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
@@ -329,7 +329,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [128])
 @pytest.mark.vivado
-def test_fpgadataflow_fclayer_large_depth_decoupled_mode(
+def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
     mem_mode, idt, wdt, act, nf, sf, mw, mh
 ):
     if nf == -1:
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 9fcd78521e967ebed248e1873f92700673d484f2..c86ef8bf3e010f9ba21306a0308c8e992930a9b3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -50,13 +50,19 @@ from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
 import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.util.basic import gen_finn_dt_tensor, pynq_part_map
+from finn.util.basic import (
+    gen_finn_dt_tensor,
+    pynq_part_map,
+    alveo_part_map,
+    alveo_default_platform,
+)
 from finn.util.fpgadataflow import pyverilate_stitched_ip
 from finn.util.test import load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.fpgadataflow.vitis_build import VitisBuild
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 
 
@@ -120,7 +126,7 @@ def create_one_fc_model():
     return model
 
 
-def create_two_fc_model():
+def create_two_fc_model(mem_mode="decoupled"):
     # create a model with two StreamingFCLayer instances
     wdt = DataType.INT2
     idt = DataType.INT32
@@ -153,7 +159,7 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
-        mem_mode="decoupled",
+        mem_mode=mem_mode,
     )
 
     fc1 = helper.make_node(
@@ -173,7 +179,7 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
-        mem_mode="decoupled",
+        mem_mode=mem_mode,
     )
 
     graph = helper.make_graph(
@@ -248,35 +254,35 @@ def test_fpgadataflow_ipstitch_rtlsim():
     model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
     sim = pyverilate_stitched_ip(model)
     exp_io = [
-        "ap_clk_0",
-        "ap_rst_n_0",
-        "in0_V_V_0_tdata",
-        "in0_V_V_0_tready",
-        "in0_V_V_0_tvalid",
-        "out_r_0_tdata",
-        "out_r_0_tkeep",
-        "out_r_0_tlast",
-        "out_r_0_tready",
-        "out_r_0_tvalid",
-        "s_axi_control_0_araddr",
-        "s_axi_control_0_arready",
-        "s_axi_control_0_arvalid",
-        "s_axi_control_0_awaddr",
-        "s_axi_control_0_awready",
-        "s_axi_control_0_awvalid",
-        "s_axi_control_0_bready",
-        "s_axi_control_0_bresp",
-        "s_axi_control_0_bvalid",
-        "s_axi_control_0_rdata",
-        "s_axi_control_0_rready",
-        "s_axi_control_0_rresp",
-        "s_axi_control_0_rvalid",
-        "s_axi_control_0_wdata",
-        "s_axi_control_0_wready",
-        "s_axi_control_0_wstrb",
-        "s_axi_control_0_wvalid",
+        "ap_clk",
+        "ap_rst_n",
+        "s_axis_0_tdata",
+        "s_axis_0_tready",
+        "s_axis_0_tvalid",
+        "m_axis_0_tdata",
+        "m_axis_0_tkeep",
+        "m_axis_0_tlast",
+        "m_axis_0_tready",
+        "m_axis_0_tvalid",
+        "s_axi_control_araddr",
+        "s_axi_control_arready",
+        "s_axi_control_arvalid",
+        "s_axi_control_awaddr",
+        "s_axi_control_awready",
+        "s_axi_control_awvalid",
+        "s_axi_control_bready",
+        "s_axi_control_bresp",
+        "s_axi_control_bvalid",
+        "s_axi_control_rdata",
+        "s_axi_control_rready",
+        "s_axi_control_rresp",
+        "s_axi_control_rvalid",
+        "s_axi_control_wdata",
+        "s_axi_control_wready",
+        "s_axi_control_wstrb",
+        "s_axi_control_wvalid",
     ]
-    assert dir(sim.io) == exp_io
+    assert sorted(dir(sim.io)) == sorted(exp_io)
     model.set_metadata_prop("exec_mode", "rtlsim")
     idt = model.get_tensor_datatype("inp")
     ishape = model.get_tensor_shape("inp")
@@ -413,6 +419,28 @@ def test_fpgadataflow_ipstitch_iodma_floorplan():
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_iodma_floorplan.onnx")
 
 
+# board
+@pytest.mark.parametrize("board", ["U250"])
+# clock period
+@pytest.mark.parametrize("period_ns", [5])
+# override mem_mode to external
+@pytest.mark.parametrize("extw", [True, False])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.vitis
+def test_fpgadataflow_ipstitch_vitis(board, period_ns, extw):
+    platform = alveo_default_platform[board]
+    fpga_part = alveo_part_map[board]
+    model = create_two_fc_model("external" if extw else "decoupled")
+    if model.graph.node[0].op_type == "StreamingDataflowPartition":
+        sdp_node = getCustomOp(model.graph.node[0])
+        assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
+        assert os.path.isfile(sdp_node.get_nodeattr("model"))
+        model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
+    model = model.transform(VitisBuild(fpga_part, period_ns, platform))
+    model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx")
+
+
 # board
 @pytest.mark.parametrize("board", ["Pynq-Z1"])
 @pytest.mark.slow
diff --git a/tests/transformation/test_topk_insert.py b/tests/transformation/test_topk_insert.py
index a18e63384150f140cb63ec7b438283eb4797266c..b85ed4aa6999faf751e535c1cc687d639c4eb74f 100644
--- a/tests/transformation/test_topk_insert.py
+++ b/tests/transformation/test_topk_insert.py
@@ -1,4 +1,4 @@
-import os
+# import os
 import onnx
 from finn.util.test import get_test_model_trained
 import brevitas.onnx as bo
@@ -57,4 +57,4 @@ def test_topk_insert(k):
     output_pysim_topk = output_pysim_topk.astype(np.int).flatten()
 
     assert np.array_equal(output_golden_topk, output_pysim_topk)
-    os.remove(export_onnx_path)
+    # os.remove(export_onnx_path)