diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index 924fbd24a174df49af4b3e259ad57d0a7907d42b..0233a81ba06dc701a3a4579b9a5bd3ce17e47d04 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -5,7 +5,7 @@ on:
     branches: [ dev ]
   push:
     branches: [ dev ]
-  
+
 
 jobs:
 
@@ -18,6 +18,6 @@ jobs:
         uses: actions/checkout@v2
 
       - name: DockerRunQuicktest
-        env:
-          NUM_DEFAULT_WORKERS: 4
-        run: sh run-docker.sh quicktest
+        run: |
+          docker build -t finn_gha -f docker/Dockerfile.finn_ci --build-arg BUILD_PATH=/tmp/finn_gha .
+          docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
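The same two steps can be reproduced outside of GitHub Actions. A minimal Python sketch, assuming Docker is installed and the working directory is a FINN checkout (the image tag finn_gha is taken from the workflow above):

import os
import subprocess

# build the CI image, then run quicktest.sh with the checkout mounted,
# mirroring the DockerRunQuicktest step above
subprocess.run(
    ["docker", "build", "-t", "finn_gha", "-f", "docker/Dockerfile.finn_ci",
     "--build-arg", "BUILD_PATH=/tmp/finn_gha", "."],
    check=True,
)
subprocess.run(
    ["docker", "run", "--init", "--hostname", "finn_gha",
     "-v", os.getcwd() + ":/workspace/finn",
     "-e", "FINN_INST_NAME=finn_gha", "finn_gha", "quicktest.sh"],
    check=True,
)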
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index d06ff8521555ccd6d09383cab039850f1565fc61..7d5772d9f5118d1f1238dd14a6b57a1b4fd5004d 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -30,7 +30,6 @@ FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
 MAINTAINER Yaman Umuroglu <yamanu@xilinx.com>
 ARG PYTHON_VERSION=3.6
 ARG BUILD_PATH
-ARG FINN_CI_BRANCH
 
 WORKDIR /workspace
 
@@ -55,10 +54,9 @@ RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-He
 # oh-my-xilinx
 RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
 
-# checkout desired FINN branch for testing
-RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace/finn
-
-RUN pip install -r /workspace/finn/requirements.txt
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+RUN rm requirements.txt
 RUN apt update; apt install nano
 RUN pip install pytest-dependency
 RUN pip install pytest-xdist
@@ -78,8 +76,8 @@ RUN mkdir -p $VIVADO_IP_CACHE
 
 WORKDIR /workspace/finn
 
-COPY finn_entrypoint.sh /usr/local/bin/
-COPY quicktest.sh /usr/local/bin/
+COPY docker/finn_entrypoint.sh /usr/local/bin/
+COPY docker/quicktest.sh /usr/local/bin/
 RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
 RUN chmod 755 /usr/local/bin/quicktest.sh
 ENTRYPOINT ["finn_entrypoint.sh"]
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index f8919d7498e0e8ef08a52d1da0782988b56d6df4..8c1502eb4a1941061bd58e6f9a18106f98f259e2 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -50,7 +50,6 @@ COPY requirements.txt .
 RUN pip install -r requirements.txt
 RUN rm requirements.txt
 RUN pip install jupyter
-RUN pip install netron
 RUN pip install matplotlib
 RUN pip install pytest-dependency
 RUN pip install sphinx
@@ -81,13 +80,26 @@ RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
 # oh-my-xilinx
 RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
+# netron
+RUN git clone https://github.com/lutzroeder/netron.git /workspace/netron
+
+# build and install netron
+USER root
+RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
+RUN apt-get install -y nodejs
+WORKDIR /workspace/netron
+RUN git checkout 376e9d33733a3eacfe3c432808fd46e6cd1460cb
+RUN npm install
+RUN python setup.py build
+RUN pip install /workspace/netron
+USER $UNAME
 
 # for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host
 # at /workspace/finn -- see run-docker.sh for an example of how to do this.
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
-ENV PATH "${PATH}:/workspace/oh-my-xilinx"
+ENV PATH "${PATH}:/workspace/oh-my-xilinx:/home/$UNAME/.local/bin"
 ENV OHMYXILINX "/workspace/oh-my-xilinx"
 
 WORKDIR /home/$UNAME/finn
diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile
index 2215bc79cc7b2c20036d882fdc654fbe8721cab6..b2d3102bd4aa3c00620f41c102af5a8b385cede7 100644
--- a/docker/Jenkinsfile
+++ b/docker/Jenkinsfile
@@ -15,11 +15,13 @@ pipeline {
         string(name: 'DOCKER_CMD_RTLSIM', defaultValue: """python setup.py test --addopts "-k rtlsim --workers auto" """, description: 'rtlsim test command')
         // end2end tests: no parallel testing, use NUM_DEFAULT_WORKERS for parallel transformations
         string(name: 'DOCKER_CMD_END2END', defaultValue: """python setup.py test --addopts "-k end2end" """, description: 'end2end test command')
+        // allow specifying where to mount the cloned folder from, since Jenkins and FINN may be running in separate containers
+        string(name: 'WORKSPACE_MOUNT', defaultValue: '/var/jenkins_home/workspace/finn', description: 'Path to Jenkins workspace mount')
     }
     environment {
         DOCKER_TAG='finn_ci:$BUILD_ID'
-        DOCKER_INST_NAME='finn_ci_$BUILD_ID'
-        BUILD_PATH='/tmp/finn_ci_$BUILD_ID'
+        DOCKER_INST_NAME='finn_ci'
+        BUILD_PATH='/tmp/finn_ci'
     }
     stages {
         stage("Clone") {
@@ -32,17 +34,17 @@ pipeline {
                 sh """
                 docker build -t $DOCKER_TAG -f docker/Dockerfile.finn_ci \
                 --build-arg BUILD_PATH=$BUILD_PATH \
-                --build-arg FINN_CI_BRANCH=${params.FINN_CI_BRANCH} \
-                docker/
+                .
                 """
             }
         }
         stage('test-main') {
             steps {
-                catchError {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                 sh """
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=1 \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
@@ -58,10 +60,11 @@ pipeline {
         }
         stage('test-rtlsim') {
             steps {
-                catchError {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                 sh """
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=1 \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
@@ -77,10 +80,11 @@ pipeline {
         }
         stage('test-end2end') {
             steps {
-                catchError {
+                catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
                 sh """
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
+                -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index f701952d5d64e5d9b95aa59170b492fe7722ae02..ee75089c657e4fad1e4a455ac7bd5fe4976e5d4c 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-export XILINX_VIVADO=$VIVADO_PATH
 export SHELL=/bin/bash
 export FINN_ROOT=/workspace/finn
 
@@ -48,7 +47,14 @@ gecho "oh-my-xilinx @ $OMX_COMMIT"
 git -C /workspace/oh-my-xilinx pull --quiet
 git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT --quiet
 
-# source Vivado env.vars
-source $VIVADO_PATH/settings64.sh
-
+if [ ! -z "$VIVADO_PATH" ];then
+  # source Vivado env.vars
+  export XILINX_VIVADO=$VIVADO_PATH
+  source $VIVADO_PATH/settings64.sh
+fi
+if [ ! -z "$VITIS_PATH" ];then
+  # source Vitis env.vars
+  export XILINX_VITIS=$VITIS_PATH
+  source $VITIS_PATH/settings64.sh
+fi
 exec "$@"
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 8b20cebcfc49d14d0afbb26edd678d65425476d3..323692897800d45c6e6cf55b688a2c7b2b9a5277 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -13,7 +13,7 @@ The FINN compiler should not be thought of a single pushbutton tool that does ev
 Requirements
 ============
 
-* Ubuntu 18.04
+* Ubuntu 18.04 with `bash` installed
 * Docker
 * A working Vivado 2019.1 installation
 * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
@@ -26,9 +26,11 @@ We use Docker extensively for developing and deploying FINN. If you are not fami
 
 Getting an interactive shell for development or experimentation
 ***************************************************************
+.. note:: **run-docker.sh requires bash to execute correctly.**
+
 ::
 
-  sh run_docker.sh
+  ./run-docker.sh
 
 Simply running ./run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container, and give you a terminal that you can use for development and experimentation.
 If you want a new terminal on an already-running container, you can do this with `docker exec -it finn_dev_<username> bash`.
@@ -41,7 +43,7 @@ Running the Jupyter notebooks
 *****************************
 ::
 
-  sh run-docker.sh notebook
+  ./run-docker.sh notebook
 
 This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones.
 .. note:: The link will look something like this (the token you get will be different):
@@ -57,14 +59,14 @@ by:
 
 ::
 
-  sh run-docker.sh test
+  ./run-docker.sh test
 
 There is a quicker variant of the test suite that skips the tests marked as
 requiring Vivado or as slow-running tests:
 
 ::
 
-  sh run-docker.sh quicktest
+  ./run-docker.sh quicktest
 
 If you want to run individual tests, you can do this *inside the Docker container
 from the FINN root directory* as follows:
diff --git a/run-docker.sh b/run-docker.sh
index 00ca8f86985a78d8f2af099c51dcd4b80cd2e974..88956586c6a2ba9780d0597f8149038dad4aa6ab 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -50,6 +50,15 @@ if [ -z "$PYNQ_IP" ];then
         recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
 fi
 
+if [ -z "$VITIS_PATH" ];then
+        recho "Please set the VITIS_PATH that contains the path to your Vitis installation directory."
+        recho "FINN functionality depending on Vitis will not be available."
+else
+    if [ -z "$PLATFORM_REPO_PATHS" ];then
+            recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)."
+    fi
+fi
+
 DOCKER_GID=$(id -g)
 DOCKER_GNAME=$(id -gn)
 DOCKER_UNAME=$(id -un)
@@ -93,6 +102,7 @@ mkdir -p $FINN_SSH_KEY_DIR
 gecho "Instance is named as $DOCKER_INST_NAME"
 gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
 gecho "Mounting $VIVADO_PATH into $VIVADO_PATH"
+gecho "Mounting $VITIS_PATH into $VITIS_PATH"
 gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
@@ -128,24 +138,34 @@ docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \
 # Launch container with current directory mounted
 # important to pass the --init flag here for correct Vivado operation, see:
 # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
-docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \
---hostname $DOCKER_INST_NAME \
--e "XILINX_VIVADO=$VIVADO_PATH" \
--e "SHELL=/bin/bash" \
--v $SCRIPTPATH:/workspace/finn \
--v $BUILD_LOCAL:$BUILD_LOCAL \
--v $VIVADO_PATH:$VIVADO_PATH \
--v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh \
--e VIVADO_PATH=$VIVADO_PATH \
--e FINN_INST_NAME=$DOCKER_INST_NAME \
--e FINN_ROOT="/workspace/finn" \
--e VIVADO_IP_CACHE="$VIVADO_IP_CACHE" \
--e PYNQ_BOARD=$PYNQ_BOARD \
--e PYNQ_IP=$PYNQ_IP \
--e PYNQ_USERNAME=$PYNQ_USERNAME \
--e PYNQ_PASSWORD=$PYNQ_PASSWORD \
--e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR \
--e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS \
--p $JUPYTER_PORT:$JUPYTER_PORT \
--p $NETRON_PORT:$NETRON_PORT \
-$DOCKER_TAG $DOCKER_CMD
+DOCKER_EXEC="docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init "
+DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
+DOCKER_EXEC+="-e SHELL=/bin/bash "
+DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn "
+DOCKER_EXEC+="-v $BUILD_LOCAL:$BUILD_LOCAL "
+DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh "
+DOCKER_EXEC+="-e FINN_INST_NAME=$DOCKER_INST_NAME "
+DOCKER_EXEC+="-e FINN_ROOT="/workspace/finn" "
+DOCKER_EXEC+="-e VIVADO_IP_CACHE=$VIVADO_IP_CACHE "
+DOCKER_EXEC+="-e PYNQ_BOARD=$PYNQ_BOARD "
+DOCKER_EXEC+="-e PYNQ_IP=$PYNQ_IP "
+DOCKER_EXEC+="-e PYNQ_USERNAME=$PYNQ_USERNAME "
+DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD "
+DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR "
+DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS "
+DOCKER_EXEC+="-p $JUPYTER_PORT:$JUPYTER_PORT "
+DOCKER_EXEC+="-p $NETRON_PORT:$NETRON_PORT "
+if [ ! -z "$VIVADO_PATH" ];then
+  DOCKER_EXEC+="-e "XILINX_VIVADO=$VIVADO_PATH" "
+  DOCKER_EXEC+="-v $VIVADO_PATH:$VIVADO_PATH "
+  DOCKER_EXEC+="-e VIVADO_PATH=$VIVADO_PATH "
+fi
+if [ ! -z "$VITIS_PATH" ];then
+  DOCKER_EXEC+="-v $VITIS_PATH:$VITIS_PATH "
+  DOCKER_EXEC+="-v $PLATFORM_REPO_PATHS:/workspace/finn/vitis_platforms "
+  DOCKER_EXEC+="-e VITIS_PATH=$VITIS_PATH "
+  DOCKER_EXEC+="-e PLATFORM_REPO_PATHS=/workspace/finn/vitis_platforms "
+fi
+DOCKER_EXEC+="$DOCKER_TAG $DOCKER_CMD"
+
+$DOCKER_EXEC
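The launch command is now assembled incrementally so that Vivado- and Vitis-specific mounts and variables are only added when the corresponding env.var. is set. A minimal Python sketch of the same conditional-assembly idea, list-based rather than string-based (a hypothetical alternative that sidesteps the nested-quoting pitfalls of DOCKER_EXEC+="..."):

import os

docker_cmd = ["docker", "run", "-t", "--rm", "--init",
              "-e", "FINN_ROOT=/workspace/finn"]
vivado = os.environ.get("VIVADO_PATH")
if vivado:
    # mount and expose Vivado only when available
    docker_cmd += ["-v", "%s:%s" % (vivado, vivado),
                   "-e", "XILINX_VIVADO=" + vivado, "-e", "VIVADO_PATH=" + vivado]
vitis = os.environ.get("VITIS_PATH")
if vitis:
    docker_cmd += ["-v", "%s:%s" % (vitis, vitis), "-e", "VITIS_PATH=" + vitis]
print(" ".join(docker_cmd))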
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 1e1bee3aa7435d5cab6cbf5ea23dd37dcdfa4380..bb5b3075582b8e01e8eed95f709934302fcadb42 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -114,19 +114,19 @@ def rtlsim_exec(model, execution_context):
 def _reset_rtlsim(sim):
     """Sets reset input in pyverilator to zero, toggles the clock and set it
     back to one"""
-    sim.io.ap_rst_n_0 = 0
+    sim.io.ap_rst_n = 0
     _toggle_clk(sim)
     _toggle_clk(sim)
-    sim.io.ap_rst_n_0 = 1
+    sim.io.ap_rst_n = 1
     _toggle_clk(sim)
     _toggle_clk(sim)
 
 
 def _toggle_clk(sim):
     """Toggles the clock input in pyverilator once."""
-    sim.io.ap_clk_0 = 0
+    sim.io.ap_clk = 0
     sim.eval()
-    sim.io.ap_clk_0 = 1
+    sim.io.ap_clk = 1
     sim.eval()
 
 
@@ -140,7 +140,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
     from finn.util.fpgadataflow)"""
     inputs = inp
     outputs = []
-    sim.io.out_r_0_tready = 1
+    sim.io.m_axis_0_tready = 1
 
     # observe if output is completely calculated
     # observation_count will contain the number of cycles the calculation ran
@@ -159,12 +159,12 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
         _reset_rtlsim(sim)
 
     while not (output_observed):
-        sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0
-        sim.io.in0_V_V_0_tdata = inputs[0] if len(inputs) > 0 else 0
-        if sim.io.in0_V_V_0_tready == 1 and sim.io.in0_V_V_0_tvalid == 1:
+        sim.io.s_axis_0_tvalid = 1 if len(inputs) > 0 else 0
+        sim.io.s_axis_0_tdata = inputs[0] if len(inputs) > 0 else 0
+        if sim.io.s_axis_0_tready == 1 and sim.io.s_axis_0_tvalid == 1:
             inputs = inputs[1:]
-        if sim.io.out_r_0_tvalid == 1 and sim.io.out_r_0_tready == 1:
-            outputs = outputs + [sim.io.out_r_0_tdata]
+        if sim.io.m_axis_0_tvalid == 1 and sim.io.m_axis_0_tready == 1:
+            outputs = outputs + [sim.io.m_axis_0_tdata]
         _toggle_clk(sim)
 
         observation_count = observation_count + 1
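The renamed s_axis_0/m_axis_0 signals follow the AXI-Stream handshake that _run_rtlsim drives: a word moves only on a cycle where both tvalid and tready are 1. A minimal pure-Python model of that rule (an illustrative stand-in, not the pyverilator interface):

# data transfers only on cycles where producer and consumer agree
inputs = [3, 1, 4, 1, 5]
outputs = []
consumer_ready = [1, 0, 1, 1, 0, 1]  # assumed per-cycle tready values
for tready in consumer_ready:
    tvalid = 1 if len(inputs) > 0 else 0
    if tvalid == 1 and tready == 1:
        outputs.append(inputs.pop(0))
print(outputs)  # [3, 1, 4, 1]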
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 71c731f96ca45519c443a5f932ead050770e17de..bc816f18c5f72338dc726e504182998f3f4430b7 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -102,6 +102,23 @@ class HLSCustomOp(CustomOp):
         prefixed_top_name = "%s_%s" % (node.name, node.name)
         return prefixed_top_name
 
+    def get_verilog_top_module_intf_names(self):
+        """Return a dict of names of input and output interfaces.
+        The keys reflect the protocols each interface implements:
+        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
+        Values are lists of names:
+        's_axis' names correspond to the list of node inputs in order,
+        'm_axis' names correspond to the list of node outputs in order.
+        Each block must have at most one aximm and one axilite."""
+        intf_names = {}
+        intf_names["clk"] = ["ap_clk"]
+        intf_names["rst"] = ["ap_rst_n"]
+        intf_names["s_axis"] = ["in0_V_V"]
+        intf_names["m_axis"] = ["out_V_V"]
+        intf_names["aximm"] = []
+        intf_names["axilite"] = []
+        return intf_names
+
     def get_verilog_top_filename(self):
         "Return the Verilog top module filename for this node."
 
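This dict is the contract that CreateStitchedIP later reads when wiring nodes together. A minimal sketch of the intended override pattern (hypothetical two-input node; AddStreams_Batch below does exactly this):

class BaseNode:
    # same defaults as HLSCustomOp above
    def get_verilog_top_module_intf_names(self):
        return {
            "clk": ["ap_clk"], "rst": ["ap_rst_n"],
            "s_axis": ["in0_V_V"], "m_axis": ["out_V_V"],
            "aximm": [], "axilite": [],
        }

class TwoInputNode(BaseNode):
    # override only the entries that differ from the defaults
    def get_verilog_top_module_intf_names(self):
        intf_names = super().get_verilog_top_module_intf_names()
        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
        return intf_names

assert TwoInputNode().get_verilog_top_module_intf_names()["s_axis"] == ["in0_V_V", "in1_V_V"]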
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index d5f5c1194d36e86b895610c084222db5ab9eb2bf..d73f22672e7163eef0738d067f951e90fe80a89f 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -356,3 +356,8 @@ class AddStreams_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 9b718ecbbc490610790b68871080de23a54f4891..05870b8d9d5d3a11bad7882c9a7d122f8cd34cf6 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -344,3 +344,15 @@ class IODMA(HLSCustomOp):
 
     def strm_decl(self):
         pass
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("direction") == "out":
+            intf_names["s_axis"] = ["in0_V_V"]
+            intf_names["m_axis"] = []
+        else:
+            intf_names["s_axis"] = []
+            intf_names["m_axis"] = ["out_V_V"]
+        intf_names["axilite"] = ["s_axi_control"]
+        intf_names["aximm"] = ["m_axi_gmem"]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index a7ebff68749120868cae9ce5ac18d2856fe2cb8a..9c3bd3ac87b94f3e0ff11a2937bf5083aae614f6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -87,7 +87,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "numInputVectors": ("ints", False, [1]),
             # memory mode for the FC weights
             # const -- embedded weights, default, long compile/synth times
-            # decoupled -- streaming weights
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
             "mem_mode": ("s", False, "const"),
             # FPGA resource type for memories in decoupled mode
             # auto -- let Vivado decide
@@ -105,14 +106,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         node = self.onnx_node
         # set top name depending on mem_mode
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
+        if mem_mode == "const" or mem_mode == "external":
             prefixed_top_name = "%s_%s" % (node.name, node.name)
         elif mem_mode == "decoupled":
             prefixed_top_name = "%s_memstream" % (node.name)
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         return prefixed_top_name
 
@@ -301,7 +302,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def get_weightstream_width(self):
         """Returns weight stream width. Used only in decoupled mode."""
-        if self.get_nodeattr("mem_mode") == "decoupled":
+        if (
+            self.get_nodeattr("mem_mode") == "decoupled"
+            or self.get_nodeattr("mem_mode") == "external"
+        ):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
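A worked example of the quantities fetched above, assuming the stream width is the product PE * SIMD * weight bitwidth (PE * SIMD weights per cycle):

# PE=4, SIMD=8, 2-bit weights: 32 weights per cycle -> 64-bit weight stream
pe, simd, wp = 4, 8, 2
w_width = pe * simd * wp
assert w_width == 64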
@@ -484,7 +488,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def generate_params(self, model, path):
         mem_mode = self.get_nodeattr("mem_mode")
-        # weights
+        code_gen_dir = path
+        # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
@@ -493,7 +498,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # so use it as such for weight generation
         if self.get_weight_datatype() == DataType.BIPOLAR:
             export_wdt = DataType.BINARY
-        code_gen_dir = path
 
         if mem_mode == "const":
             """Saves weights into params.h"""
@@ -523,7 +527,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             f_weights.write(weight_hls_code)
             f_weights.close()
 
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             """Saves weights in corresponding file format for cppsim or rtlsim"""
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
@@ -552,37 +556,37 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped
             )
 
-            """Saves weights into .dat file"""
-            # convert weight values into hexstring
-            weight_width = self.get_weightstream_width()
-            # pad to nearest 4 bits to get hex strings
-            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
-            )
-            weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
-            factor = math.ceil(weight_stream_len / 1024)
-            # add zeroes to pad out file to 1024 entries
-            weight_stream = weight_tensor_pe_flipped.flatten()
-            pad_amt = (factor * 1024) - weight_stream_len
-            weight_stream = np.pad(
-                weight_stream, (0, pad_amt), mode="constant", constant_values="0"
-            )
-            weight_stream = weight_stream.copy()
-            i = 0
-            j = 0
-            for val in weight_stream:
-                if i == 1024:
-                    i = 0
-                    j += 1
-                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
-                    f.write(val + "\n")
-                i += 1
-
+            if mem_mode == "decoupled":
+                """Saves weights into .dat file"""
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
+                factor = math.ceil(weight_stream_len / 1024)
+                # add zeroes to pad out file to 1024 entries
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                pad_amt = (factor * 1024) - weight_stream_len
+                weight_stream = np.pad(
+                    weight_stream, (0, pad_amt), mode="constant", constant_values="0"
+                )
+                weight_stream = weight_stream.copy()
+                i = 0
+                j = 0
+                for val in weight_stream:
+                    if i == 1024:
+                        i = 0
+                        j += 1
+                    with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
+                        f.write(val + "\n")
+                    i += 1
         else:
             raise Exception(
-                """Please set mem_mode to "const"i or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
         # save thresholds in thresh.h
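The .dat writer above pads the flattened weight stream to a multiple of 1024 entries and starts a new memblock_<j>.dat file every 1024 lines. A worked example of that padding arithmetic:

import math

weight_stream_len = 1500                       # e.g. 1500 hex-string entries
factor = math.ceil(weight_stream_len / 1024)   # 2 files
pad_amt = (factor * 1024) - weight_stream_len  # 548 zero entries appended
assert (weight_stream_len + pad_amt) % 1024 == 0
# result: memblock_0.dat and memblock_1.dat, 1024 lines each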
@@ -630,6 +634,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
         # TODO ensure codegen dir exists
@@ -698,7 +703,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+            if mem_mode == "external":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType.BIPOLAR:
+                    export_wdt = DataType.BINARY
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
@@ -729,12 +751,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         if mem_mode == "const":
             # self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
             pass
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
@@ -757,7 +779,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 numReps,
             )
         ]
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             self.code_gen_dict["$DEFINES$"].append(
                 "#define WP1 {}\n".format(wdt.bitwidth())
@@ -783,7 +805,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         )
 
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             elem_bits = wdt.bitwidth()
             packed_bits = self.get_weightstream_width()
@@ -807,7 +829,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$STREAMDECLARATIONS$"].append(
                 'hls::stream<ap_uint<{}>> weights ("weights");'.format(
                     self.get_weightstream_width()
@@ -835,7 +857,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_nodeattr("resType"),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             if wdt == DataType.BIPOLAR:
                 export_wdt = DataType.BINARY
@@ -856,8 +878,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
     def dataoutstrm(self):
@@ -903,7 +925,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_outstream_width(),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 """void {}(
                     hls::stream<ap_uint<{}>> &in0,
@@ -952,7 +974,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     "complete dim=1"
                 )
             )
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$PRAGMAS$"].append(
                 "#pragma HLS INTERFACE axis port=weights"
             )
@@ -962,8 +984,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or external,
+                currently no other parameter value is supported!"""
             )
 
         # the threshold tensor is acc_type [PE][TMEM][N_THRES]
@@ -1092,3 +1114,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             self.set_nodeattr("ip_vlnv", vlnv)
             self.code_gen_dict.clear()
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "external":
+            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+        return intf_names
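A minimal usage sketch of the new external mode (assuming node is an existing StreamingFCLayer_Batch node in a loaded model; getCustomOp is from finn.custom_op.registry):

from finn.custom_op.registry import getCustomOp

node_inst = getCustomOp(node)  # node: assumed pre-existing ONNX node
node_inst.set_nodeattr("mem_mode", "external")
# the weight stream now shows up as a second slave AXI-Stream interface
print(node_inst.get_verilog_top_module_intf_names()["s_axis"])
# ['in0_V_V', 'weights_V_V']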
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 17ba44b959577faf573d77ae222f7b2a3be6669d..38a139c279701ae7892f41b63c3c717a3e736691 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -33,8 +33,9 @@ class TLastMarker(HLSCustomOp):
     """Node that adds/removes AXI stream TLAST signals where needed. Its behavior
     is transparent in node-by-node execution, only visible in IP-stitched rtlsim or
     actual hardware.
-    This node  may be needed at the end of the network to signal a DMA write (needed by the
-    FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read."""
+    This node may be needed at the end of the network to signal a DMA write
+    (needed by the FINN PYNQ shell) or at the beginning to remove the end-of-burst
+    from DMA read."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
@@ -239,3 +240,15 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<OutDType> out ("out");'
         )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("Direction") == "in":
+            intf_names["s_axis"] = ["in0"]
+            intf_names["m_axis"] = ["out_V_V"]
+        else:
+            intf_names["s_axis"] = ["in0_V_V"]
+            intf_names["m_axis"] = ["out_r"]
+        if self.get_nodeattr("DynIters") == 1:
+            intf_names["axilite"] = ["s_axi_control"]
+        return intf_names
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 0e898f63db785f80cfce2683df0c9b6268e3ec7e..018ad385f33a8e0aea4aa42599fd47fe5dae57dd 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -33,6 +33,8 @@ import subprocess
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name, make_build_dir
 from finn.custom_op.registry import getCustomOp
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
 
 
 class CreateStitchedIP(Transformation):
@@ -49,20 +51,137 @@ class CreateStitchedIP(Transformation):
     The packaged block design IP can be found under the ip subdirectory.
     """
 
-    def __init__(self, fpgapart, clk_ns = 10.0):
+    def __init__(self, fpgapart, clk_ns=10.0, ip_name="finn_design", vitis=False):
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
+        self.ip_name = ip_name
+        self.vitis = vitis
         if float(clk_ns) not in [5.0, 10.0, 20.0]:
             warnings.warn(
                 """The chosen frequency may lead to failure due to clock divider
                 constraints."""
             )
+        self.has_axilite = False
+        self.has_aximm = False
+        self.has_m_axis = False
+        self.m_axis_idx = 0
+        self.has_s_axis = False
+        self.s_axis_idx = 0
+        self.clock_reset_are_external = False
+        self.create_cmds = []
+        self.connect_cmds = []
+        # keep track of top-level interface names
+        self.intf_names = {
+            "clk": [],
+            "rst": [],
+            "s_axis": [],
+            "m_axis": [],
+            "aximm": [],
+            "axilite": [],
+        }
+
+    def connect_clk_rst(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
+        reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
+        # make clock and reset external, if they aren't already
+        if not self.clock_reset_are_external:
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
+            self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]")
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]"
+            )
+            self.clock_reset_are_external = True
+            self.intf_names["clk"] = ["ap_clk"]
+            self.intf_names["rst"] = ["ap_rst_n"]
+        # otherwise connect clock and reset
+        else:
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
+            )
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
+
+    def connect_axi(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"]
+        aximm_intf_name = node_inst.get_verilog_top_module_intf_names()["aximm"]
+        if len(axilite_intf_name) != 0:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external "
+                "[get_bd_intf_pins %s/%s]" % (inst_name, axilite_intf_name[0])
+            )
+            self.connect_cmds.append(
+                "set_property name s_axi_control " "[get_bd_intf_ports s_axi_control_0]"
+            )
+            assert (
+                self.has_axilite is False
+            ), "Currently limited to one slave AXI-Stream"
+            self.intf_names["axilite"] = ["s_axi_control"]
+            self.has_axilite = True
+        if len(aximm_intf_name) != 0:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, aximm_intf_name[0])
+            )
+            self.connect_cmds.append(
+                "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
+            )
+            self.intf_names["aximm"] = ["m_axi_gmem0"]
+            assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
+            self.has_aximm = True
+
+    def connect_m_axis_external(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"]
+        # make output axis external
+        for output_intf_name in output_intf_names:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, output_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name m_axis_%d [get_bd_intf_ports %s_0]"
+                % (self.m_axis_idx, output_intf_name)
+            )
+            self.has_m_axis = True
+            self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx)
+            self.m_axis_idx += 1
+
+    def connect_s_axis_external(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"]
+        # make input axis external
+        for input_intf_name in input_intf_names:
+            self.connect_cmds.append(
+                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
+                % (inst_name, input_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name s_axis_%d [get_bd_intf_ports %s_0]"
+                % (self.s_axis_idx, input_intf_name)
+            )
+            self.has_s_axis = True
+            self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx)
+            self.s_axis_idx += 1
 
     def apply(self, model):
         ip_dirs = ["list"]
-        create_cmds = []
-        connect_cmds = []
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
             assert node.domain == "finn", 'Node domain is not set to "finn"'
@@ -80,59 +199,62 @@ class CreateStitchedIP(Transformation):
             vlnv = node_inst.get_nodeattr("ip_vlnv")
             inst_name = node.name
             create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name)
-            create_cmds += [create_cmd]
-            # TODO nonlinear topologies: check this for all inputs
+            self.create_cmds += [create_cmd]
             my_producer = model.find_producer(node.input[0])
+            self.connect_clk_rst(node)
+            self.connect_axi(node)
             if my_producer is None:
                 # first node in graph
-                # make clock and reset external
-                connect_cmds.append(
-                    "make_bd_pins_external [get_bd_pins %s/ap_clk]" % inst_name
-                )
-                connect_cmds.append(
-                    "make_bd_pins_external [get_bd_pins %s/ap_rst_n]" % inst_name
-                )
-                # make input external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/in0_V_V]"
-                    % inst_name
-                )
+                self.connect_s_axis_external(node)
+                if node.op_type == "TLastMarker":
+                    assert (
+                        node_inst.get_nodeattr("Direction") == "in"
+                    ), """Output TLastMarker incorrect direction"""
+                elif node.op_type == "IODMA":
+                    assert (
+                        node_inst.get_nodeattr("direction") == "in"
+                    ), """Input DMA incorrect direction"""
             else:
                 # intermediate node
-                # wire up global clock and reset
-                connect_cmds.append(
-                    "connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins %s/ap_rst_n]"
-                    % inst_name
-                )
-                connect_cmds.append(
-                    "connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins %s/ap_clk]"
-                    % inst_name
-                )
-                # wire up input to previous output
-                # TODO nonlinear topologies: loop over all inputs
-                my_in_name = "%s/in0_V_V" % (inst_name)
-                prev_out_name = "%s/out_V_V" % (my_producer.name)
-                connect_cmds.append(
-                    "connect_bd_intf_net [get_bd_intf_pins %s] [get_bd_intf_pins %s]"
-                    % (prev_out_name, my_in_name)
-                )
-            if model.find_consumer(node.output[0]) is None:
+                # wire up input(s) to previous node output(s)
+                # foreach input
+                #     find producer
+                #     find index of producer output connected to our target input
+                #     get names of hdl interfaces for input and producer output
+                #     issue a TCL directive to connect input to output
+                for i in range(len(node.input)):
+                    producer = model.find_producer(node.input[i])
+                    if producer is None:
+                        continue
+                    j = list(producer.output).index(node.input[i])
+                    src_intf_name = getCustomOp(
+                        producer
+                    ).get_verilog_top_module_intf_names()["m_axis"][j]
+                    dst_intf_name = node_inst.get_verilog_top_module_intf_names()[
+                        "s_axis"
+                    ][i]
+                    self.connect_cmds.append(
+                        "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                        "[get_bd_intf_pins %s/%s]"
+                        % (producer.name, src_intf_name, node.name, dst_intf_name)
+                    )
+            if model.find_consumers(node.output[0]) is None:
                 # last node in graph
+                self.connect_m_axis_external(node)
                 # ensure it is a TLastMarker to have a valid TLast signal
                 assert (
-                    node.op_type == "TLastMarker"
-                ), """Last node is not TLastMarker.
-                Please run transformation InsertTLastMarker to ensure a valid
-                TLast signal"""
-                # make output external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/out_r]" % inst_name
-                )
-                # make AXI lite IF external
-                connect_cmds.append(
-                    "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi_control]"
-                    % inst_name
-                )
+                    node.op_type == "TLastMarker" or node.op_type == "IODMA"
+                ), """Last node is not TLastMarker or DMA.
+                Please run transformation InsertTLastMarker/InsertIODMA to ensure
+                a valid TLast signal"""
+                if node.op_type == "TLastMarker":
+                    assert (
+                        node_inst.get_nodeattr("Direction") == "out"
+                    ), """Output TLastMarker incorrect direction"""
+                elif node.op_type == "IODMA":
+                    assert (
+                        node_inst.get_nodeattr("direction") == "out"
+                    ), """Output DMA incorrect direction"""
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -150,22 +272,54 @@ class CreateStitchedIP(Transformation):
         tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str)
         tcl.append("update_ip_catalog")
         # create block design and instantiate all layers
-        block_name = "finn_design"
+        block_name = self.ip_name
         tcl.append('create_bd_design "%s"' % block_name)
-        tcl.extend(create_cmds)
-        tcl.extend(connect_cmds)
+        tcl.extend(self.create_cmds)
+        tcl.extend(self.connect_cmds)
         fclk_mhz = 1 / (self.clk_ns * 0.001)
         fclk_hz = fclk_mhz * 1000000
         model.set_metadata_prop("clk_ns", str(self.clk_ns))
-        tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk_0]" % fclk_hz)
+        tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk]" % fclk_hz)
         tcl.append("regenerate_bd_layout")
         tcl.append("validate_bd_design")
         tcl.append("save_bd_design")
+        # create wrapper hdl (for rtlsim later on)
+        bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
+            vivado_stitch_proj_dir,
+            prjname,
+            block_name,
+        )
+        bd_filename = "%s/%s.bd" % (bd_base, block_name)
+        tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
+        wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
+        tcl.append("add_files -norecurse %s" % wrapper_filename)
+        model.set_metadata_prop("wrapper_filename", wrapper_filename)
+        # synthesize to DCP and export stub, DCP and constraints
+        if self.vitis:
+            tcl.append(
+                "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]"
+                % bd_filename
+            )
+            tcl.append(
+                "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} "
+                "-value {-mode out_of_context} -objects [get_runs synth_1]"
+            )
+            num_workers = get_num_default_workers()
+            assert num_workers >= 0, "Number of workers must be nonnegative."
+            if num_workers == 0:
+                num_workers = mp.cpu_count()
+            tcl.append("launch_runs synth_1 -jobs %s" % str(num_workers))
+            tcl.append("wait_on_run [get_runs synth_1]")
+            tcl.append("open_run synth_1 -name synth_1")
+            tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name)
+            tcl.append("write_checkpoint %s.dcp" % block_name)
+            tcl.append("write_xdc %s.xdc" % block_name)
         # export block design itself as an IP core
         block_vendor = "xilinx_finn"
         block_library = "finn"
         block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
+        model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names))
         tcl.append(
             (
                 "ipx::package_project -root_dir %s/ip -vendor %s "
@@ -175,19 +329,89 @@ class CreateStitchedIP(Transformation):
         )
         tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv)
+        # if targeting Vitis, add some properties to the IP
+        if self.vitis:
+            tcl.append(
+                "ipx::remove_bus_parameter FREQ_HZ "
+                "[ipx::get_bus_interfaces CLK.AP_CLK -of_objects [ipx::current_core]]"
+            )
+            # replace source code with dcp
+            tcl.append(
+                "set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property supported_families { } [ipx::find_open_core %s]"
+                % block_vlnv
+            )
+            tcl.append(
+                "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} "
+                "[ipx::find_open_core %s]" % block_vlnv
+            )
+            tcl.append(
+                "set_property auto_family_support_level level_2 "
+                "[ipx::find_open_core %s]" % block_vlnv
+            )
+            # remove all files from synthesis and sim groups
+            # we'll replace with DCP, stub, and xdc
+            tcl.append(
+                "ipx::remove_all_file "
+                "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]"
+            )
+            tcl.append(
+                "ipx::remove_all_file "
+                "[ipx::get_file_groups xilinx_anylanguagesynthesis]"
+            )
+            tcl.append(
+                "ipx::remove_file_group "
+                "xilinx_anylanguagebehavioralsimulation [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::remove_file_group "
+                "xilinx_anylanguagesynthesis [ipx::current_core]"
+            )
+            # remove sim and src folders
+            tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir)
+            tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir)
+            # copy and add DCP, stub, and xdc
+            tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir)
+            tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir)
+            tcl.append(
+                "file copy -force %s.dcp %s/ip/dcp"
+                % (block_name, vivado_stitch_proj_dir)
+            )
+            tcl.append(
+                "file copy -force %s.xdc %s/ip/impl"
+                % (block_name, vivado_stitch_proj_dir)
+            )
+            tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]")
+            tcl.append(
+                "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]"
+                % block_name
+            )
+            tcl.append(
+                "set_property used_in [list implementation] "
+                "[ipx::get_files impl/%s.xdc "
+                "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name
+            )
+            tcl.append(
+                "ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::add_file dcp/%s.dcp "
+                "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name
+            )
+            tcl.append(
+                "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]"
+            )
+            tcl.append(
+                "ipx::add_file dcp/%s.dcp "
+                "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name
+            )
         tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv)
-        # create wrapper hdl (for rtlsim later on)
-        bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
-            vivado_stitch_proj_dir,
-            prjname,
-            block_name,
-        )
-        bd_filename = "%s/%s.bd" % (bd_base, block_name)
-        tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
-        wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
-        tcl.append("add_files -norecurse %s" % wrapper_filename)
-        model.set_metadata_prop("wrapper_filename", wrapper_filename)
         # export list of used Verilog files (for rtlsim later on)
         tcl.append("set all_v_files [get_files -filter {FILE_TYPE == Verilog}]")
         v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index 91f6bd2c4ab19c736fcf21322979cac17a163f24..a874d7a7c702e1b3e9125fc031aa65dc287a407d 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -67,6 +67,16 @@ class MakePYNQProject(Transformation):
             raise Exception(
                 "No vlnv for stitched IP found, apply CreateStitchedIP first."
             )
+        vivado_stitch_ifnames = model.get_metadata_prop("vivado_stitch_ifnames")
+        if vivado_stitch_ifnames is None:
+            raise Exception("No IF name metadata found, apply CreateStitchedIP first.")
+        vivado_stitch_ifnames = eval(vivado_stitch_ifnames)
+        # recover interface names from dict
+        self.clk_name = vivado_stitch_ifnames["clk"][0]
+        self.rst_name = vivado_stitch_ifnames["rst"][0]
+        self.s_axis_if_name = vivado_stitch_ifnames["s_axis"][0]
+        self.m_axis_if_name = vivado_stitch_ifnames["m_axis"][0]
+        self.axilite_if_name = vivado_stitch_ifnames["axilite"][0]
 
         # collect list of all IP dirs
         ip_dirs = ["list"]
@@ -105,11 +115,11 @@ class MakePYNQProject(Transformation):
         multiple of 8."""
         in_bytes = i_bits_per_cycle_padded / 8
         out_bytes = o_bits_per_cycle_padded / 8
-        in_if_name = "in0_V_V_0"
-        out_if_name = "out_r_0"
-        clk_name = "ap_clk_0"
-        nrst_name = "ap_rst_n_0"
-        axi_lite_if_name = "s_axi_control_0"
+        in_if_name = self.s_axis_if_name
+        out_if_name = self.m_axis_if_name
+        clk_name = self.clk_name
+        nrst_name = self.rst_name
+        axi_lite_if_name = self.axilite_if_name
         vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="")
 
         # create a temporary folder for the project
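The interface names cross the transformation boundary as a stringified dict in the model metadata, recovered above with eval. A minimal sketch of that round-trip (ast.literal_eval shown as a safer drop-in for such repr-style strings):

import ast

intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"],
              "s_axis": ["s_axis_0"], "m_axis": ["m_axis_0"],
              "aximm": [], "axilite": ["s_axi_control"]}
stored = str(intf_names)              # what CreateStitchedIP stores
recovered = ast.literal_eval(stored)  # safer than eval for untrusted input
assert recovered == intf_names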
diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py
index 1d49970c819961d1794cc89e998108639ca15593..8fd7e4724ef7f255b1435d5ab5e680d155d39487 100644
--- a/src/finn/transformation/fpgadataflow/synth_ooc.py
+++ b/src/finn/transformation/fpgadataflow/synth_ooc.py
@@ -37,7 +37,7 @@ from finn.util.basic import make_build_dir
 class SynthOutOfContext(Transformation):
     """Run out-of-context Vivado synthesis on a stitched IP design."""
 
-    def __init__(self, part, clk_period_ns, clk_name="ap_clk_0"):
+    def __init__(self, part, clk_period_ns, clk_name="ap_clk"):
         super().__init__()
         self.part = part
         self.clk_period_ns = clk_period_ns
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 2cde191435894c72cadf73af82df3f315fb2998c..b47f269dd6f2671c3d98c9316954483c0e72f14f 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -545,6 +545,7 @@ class MoveScalarLinearPastInvariants(Transformation):
                     # move prod0 from input to output,
                     old_prod0_in = prod0.input[0]
                     old_prod0_out = prod0.output[0]
+                    scalar_op_odt = model.get_tensor_datatype(old_prod0_out)
                     old_n_out = n.output[0]
                     in_shape = model.get_tensor_shape(n.input[0])
                     out_shape = model.get_tensor_shape(n.output[0])
@@ -555,12 +556,16 @@ class MoveScalarLinearPastInvariants(Transformation):
                     model.set_tensor_shape(n.input[0], in_shape)
                     model.set_tensor_shape(n.output[0], out_shape)
                     model.set_tensor_shape(prod0.output[0], out_shape)
+                    model.set_tensor_datatype(prod0.output[0], scalar_op_odt)
+                    model.set_tensor_datatype(n.output[0], DataType.FLOAT32)
                     graph.node.remove(prod0)
                     graph.node.insert(node_ind - 1, prod0)
                     graph_modified = True
                 else:
                     continue
-        model = model.transform(InferShapes())
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
         return (model, graph_modified)
 
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 4fb84be59333ef0e696204c9064fcf77e35b5d9b..59ac1c09f4fe338ef03a8166c63b9d4b29bbc08e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -33,6 +33,8 @@ from onnx import TensorProto, helper
 import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -72,6 +74,9 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt):
 
     model.set_tensor_datatype("inp", idt)
 
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
     return model
 
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index fc5cdb7745945bee99564ba9ab19423a66d8e035..952d994076fc4da7e7f763d9f0fe303d8da0ff11 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -134,7 +134,7 @@ def prepare_inputs(input_tensor, idt, wdt):
 
 
 # mem_mode: const, decoupled or external
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
@@ -221,7 +221,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
 
 # mem_mode: const, decoupled or external
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index a9f5bf5ffa1f816b82ef701800e92249056b7c74..7cb31557dfaa61e3a5e5c0a7c65e1fbe717bf0f1 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -119,7 +119,7 @@ def create_one_fc_model():
     return model
 
 
-def create_two_fc_model():
+def create_two_fc_model(mem_mode="decoupled"):
     # create a model with two StreamingFCLayer instances
     wdt = DataType.INT2
     idt = DataType.INT32
@@ -152,7 +152,7 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
-        mem_mode="decoupled",
+        mem_mode=mem_mode,
     )
 
     fc1 = helper.make_node(
@@ -172,7 +172,7 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
-        mem_mode="decoupled",
+        mem_mode=mem_mode,
     )
 
     graph = helper.make_graph(
@@ -247,35 +247,35 @@ def test_fpgadataflow_ipstitch_rtlsim():
     model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
     sim = pyverilate_stitched_ip(model)
     exp_io = [
-        "ap_clk_0",
-        "ap_rst_n_0",
-        "in0_V_V_0_tdata",
-        "in0_V_V_0_tready",
-        "in0_V_V_0_tvalid",
-        "out_r_0_tdata",
-        "out_r_0_tkeep",
-        "out_r_0_tlast",
-        "out_r_0_tready",
-        "out_r_0_tvalid",
-        "s_axi_control_0_araddr",
-        "s_axi_control_0_arready",
-        "s_axi_control_0_arvalid",
-        "s_axi_control_0_awaddr",
-        "s_axi_control_0_awready",
-        "s_axi_control_0_awvalid",
-        "s_axi_control_0_bready",
-        "s_axi_control_0_bresp",
-        "s_axi_control_0_bvalid",
-        "s_axi_control_0_rdata",
-        "s_axi_control_0_rready",
-        "s_axi_control_0_rresp",
-        "s_axi_control_0_rvalid",
-        "s_axi_control_0_wdata",
-        "s_axi_control_0_wready",
-        "s_axi_control_0_wstrb",
-        "s_axi_control_0_wvalid",
+        "ap_clk",
+        "ap_rst_n",
+        "s_axis_0_tdata",
+        "s_axis_0_tready",
+        "s_axis_0_tvalid",
+        "m_axis_0_tdata",
+        "m_axis_0_tkeep",
+        "m_axis_0_tlast",
+        "m_axis_0_tready",
+        "m_axis_0_tvalid",
+        "s_axi_control_araddr",
+        "s_axi_control_arready",
+        "s_axi_control_arvalid",
+        "s_axi_control_awaddr",
+        "s_axi_control_awready",
+        "s_axi_control_awvalid",
+        "s_axi_control_bready",
+        "s_axi_control_bresp",
+        "s_axi_control_bvalid",
+        "s_axi_control_rdata",
+        "s_axi_control_rready",
+        "s_axi_control_rresp",
+        "s_axi_control_rvalid",
+        "s_axi_control_wdata",
+        "s_axi_control_wready",
+        "s_axi_control_wstrb",
+        "s_axi_control_wvalid",
     ]
-    assert dir(sim.io) == exp_io
+    assert sorted(dir(sim.io)) == sorted(exp_io)
     model.set_metadata_prop("exec_mode", "rtlsim")
     idt = model.get_tensor_datatype("inp")
     ishape = model.get_tensor_shape("inp")