diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml new file mode 100644 index 0000000000000000000000000000000000000000..cd59a629405c748187cdf478c0bdb0694c58c79f --- /dev/null +++ b/.github/workflows/quicktest-dev-pr.yml @@ -0,0 +1,21 @@ +name: QuicktestPRAgainstDev + +on: + pull_request: + branches: [ dev ] + push: + branches: [ dev ] + + +jobs: + + test: + name: Run quicktest on PR branch + runs-on: ubuntu-latest + + steps: + - name: checkout + uses: actions/checkout@v2 + + - name: DockerRunQuicktest + run: sh run-docker.sh quicktest diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci index 41f6a88f5dd4c9b0822a74cf4a0e7b4663dce910..0d610ec66a5f433d156f4e8da976767ce6458aef 100644 --- a/docker/Dockerfile.finn_ci +++ b/docker/Dockerfile.finn_ci @@ -73,6 +73,8 @@ RUN mkdir -p $VIVADO_IP_CACHE WORKDIR /workspace/finn COPY finn_entrypoint.sh /usr/local/bin/ +COPY quicktest.sh /usr/local/bin/ RUN chmod 755 /usr/local/bin/finn_entrypoint.sh +RUN chmod 755 /usr/local/bin/quicktest.sh ENTRYPOINT ["finn_entrypoint.sh"] CMD ["bash"] diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev index b7cfc299a2999662672225aa5f8912653d189559..1c2cb19d14137b866b55417522fdebb8e0d7ad90 100644 --- a/docker/Dockerfile.finn_dev +++ b/docker/Dockerfile.finn_dev @@ -95,7 +95,9 @@ RUN echo "source \$VIVADO_PATH/settings64.sh" >> /home/$UNAME/.bashrc # copy entrypoint script USER root COPY docker/finn_entrypoint.sh /usr/local/bin/ +COPY docker/quicktest.sh /usr/local/bin/ RUN chmod 755 /usr/local/bin/finn_entrypoint.sh +RUN chmod 755 /usr/local/bin/quicktest.sh USER $UNAME ENTRYPOINT ["finn_entrypoint.sh"] diff --git a/docker/quicktest.sh b/docker/quicktest.sh new file mode 100755 index 0000000000000000000000000000000000000000..4f6a2d3e230de9fcbb947d794722294880a7730d --- /dev/null +++ b/docker/quicktest.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +cd $FINN_ROOT +python setup.py test --addopts "-m 'not (vivado or slow)'" diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index e2b56b7f75a489f4e4f08bccdc3b889822f81838..95594bb67a2be3a4c3fbba488c75a704f623c136 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -49,11 +49,30 @@ The run-docker.sh script forwards ports 8888 for Jupyter and 8081 for Netron, an Running the test suite directly ******************************* +FINN comes with a set of tests to check for regressions. The full test suite +(which will take several hours to run and require a PYNQ board) can be executed +by: + :: sh run-docker.sh test -FINN comes with a set of tests which can be launched using the command above. Note that some of the tests involve extra compilation and the entire test suite may take some time to complete. +There is a quicker variant of the test suite that skips the tests marked as +requiring Vivado or as slow-running tests: + +:: + + sh run-docker.sh quicktest + +If you want to run individual tests, you can do this *inside the Docker container +from the FINN root directory* as follows: + +:: + + python setup.py test --addopts "-k test_end2end_tfc_w1a2" + +Please see the pytest documentation for more about picking tests by marks or +by name. 
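For example, the ``slow`` and ``vivado`` markers registered in setup.cfg (see the diff below) can also be selected directly. These invocations are illustrative only, run from the FINN root inside the container:

::

    python setup.py test --addopts "-m vivado"
    python setup.py test --addopts "-m 'slow and not vivado'"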
Environment variables ********************** diff --git a/run-docker.sh b/run-docker.sh index b7f844d314c5fb67e11e0933f42b3edfa4d96036..186efc322a8f437be0371b5a142a9dd524d1abf3 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -27,13 +27,27 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +RED='\033[0;31m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color + +# green echo +gecho () { + echo -e "${GREEN}$1${NC}" +} + +# red echo +recho () { + echo -e "${RED}$1${NC}" +} + if [ -z "$VIVADO_PATH" ];then - echo "For correct implementation please set an environment variable VIVADO_PATH that contains the path to your vivado installation directory" - exit 1 + recho "Please set the VIVADO_PATH environment variable to point to your Vivado installation directory." + recho "FINN functionality depending on Vivado or Vivado HLS will not be available." fi if [ -z "$PYNQ_IP" ];then - echo "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests." + recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests." fi DOCKER_GID=$(id -g) @@ -74,23 +88,29 @@ VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache mkdir -p $BUILD_LOCAL mkdir -p $VIVADO_IP_CACHE -echo "Instance is named as $DOCKER_INST_NAME" -echo "Mounting $BUILD_LOCAL into $BUILD_LOCAL" -echo "Mounting $VIVADO_PATH into $VIVADO_PATH" -echo "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT" -echo "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT" -echo "Vivado IP cache dir is at $VIVADO_IP_CACHE" -echo "Using default PYNQ board $PYNQ_BOARD" +gecho "Instance is named $DOCKER_INST_NAME" +gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL" +gecho "Mounting $VIVADO_PATH into $VIVADO_PATH" +gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT" +gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT" +gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE" +gecho "Using default PYNQ board $PYNQ_BOARD" + +DOCKER_INTERACTIVE="" + if [ "$1" = "test" ]; then - echo "Running test suite" + gecho "Running test suite (all tests)" DOCKER_CMD="python setup.py test" +elif [ "$1" = "quicktest" ]; then + gecho "Running test suite (non-Vivado, non-slow tests)" + DOCKER_CMD="quicktest.sh" elif [ "$1" = "notebook" ]; then - echo "Running Jupyter notebook server" + gecho "Running Jupyter notebook server" DOCKER_CMD="jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks" else - echo "Running container only" + gecho "Running container only" DOCKER_CMD="bash" + DOCKER_INTERACTIVE="-it" fi # Build the FINN Docker image @@ -106,7 +126,7 @@ docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \ # Launch container with current directory mounted # important to pass the --init flag here for correct Vivado operation, see: # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins -docker run -t --rm --name $DOCKER_INST_NAME -it --init \ +docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \ --hostname $DOCKER_INST_NAME \ -e "XILINX_VIVADO=$VIVADO_PATH" \ -e "SHELL=/bin/bash" \ diff --git a/setup.cfg b/setup.cfg index 5974cda20e37449a879f7528516895fb7cea4264..1d7dcf247636b486e35d6320669eae706c2b7a72 100644 --- a/setup.cfg +++ b/setup.cfg @@ -101,6 +101,9 @@ extras = True # in order to write a coverage file that can be read by Jenkins.
addopts = --verbose +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + vivado: marks tests that require Vivado or Vivado HLS norecursedirs = dist build diff --git a/src/finn/core/data_layout.py b/src/finn/core/data_layout.py new file mode 100644 index 0000000000000000000000000000000000000000..3971d221527d3862346c06cf415831c27e5cba8b --- /dev/null +++ b/src/finn/core/data_layout.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# predefined lists of strings to have a canonical way of expressing data layout +# annotations + +NHWC = ["N", "H", "W", "C"] +NCHW = ["N", "C", "H", "W"] +NC = ["N", "C"] +UNKNOWN = [] diff --git a/src/finn/core/modelwrapper.py b/src/finn/core/modelwrapper.py index 1db1ceec6cee9507d0c4e9be396479d22407171e..ed32426abcc8ea71428a7f746a99454e8e4a2c17 100644 --- a/src/finn/core/modelwrapper.py +++ b/src/finn/core/modelwrapper.py @@ -137,11 +137,16 @@ class ModelWrapper: qnt_annotations = graph.quantization_annotation ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name") if ret is not None: - ret = util.get_by_name( + ret_dt = util.get_by_name( ret.quant_parameter_tensor_names, "finn_datatype", "key" ) - if ret is not None: - ret.value = datatype.name + if ret_dt is not None: + ret_dt.value = datatype.name + else: + dt = onnx.StringStringEntryProto() + dt.key = "finn_datatype" + dt.value = datatype.name + ret.quant_parameter_tensor_names.append(dt) else: qa = onnx.TensorAnnotation() dt = onnx.StringStringEntryProto() @@ -450,3 +455,58 @@ class ModelWrapper: n_ind += 1 except ValueError: return None + + def get_tensor_layout(self, tensor_name): + """Returns the data layout annotation of tensor with given name. + The data layout is expressed as a list of strings with as many + elements as the number of dimensions in the tensor shape. Each + string annotates what is contained in that dimension. If there is no + data layout annotation, None will be returned.
+ Examples of data layout annotations: + ["N", "C"] is tensor[batch][channel] + ["N", "C", "H", "W"] is tensor[batch][channel][height][width] + ["N", "H", "W", "C"] is tensor[batch][height][width][channel] + """ + graph = self._model_proto.graph + qnt_annotations = graph.quantization_annotation + ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name") + if ret is not None: + ret = util.get_by_name( + ret.quant_parameter_tensor_names, "tensor_layout", "key" + ) + if ret is not None: + return eval(ret.value) + return None + + def set_tensor_layout(self, tensor_name, data_layout): + """Sets the data layout annotation of tensor with given name. See + get_tensor_layout for examples.""" + tensor_shape = self.get_tensor_shape(tensor_name) + assert type(data_layout) == list, "data_layout must be a list" + if tensor_shape is not None: + assert len(tensor_shape) == len( + data_layout + ), """Mismatch between number + of dimensions of tensor shape and data layout annotation.""" + graph = self._model_proto.graph + qnt_annotations = graph.quantization_annotation + ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name") + if ret is not None: + ret_tl = util.get_by_name( + ret.quant_parameter_tensor_names, "tensor_layout", "key" + ) + if ret_tl is not None: + ret_tl.value = str(data_layout) + else: + tl = onnx.StringStringEntryProto() + tl.key = "tensor_layout" + tl.value = str(data_layout) + ret.quant_parameter_tensor_names.append(tl) + else: + qa = onnx.TensorAnnotation() + dt = onnx.StringStringEntryProto() + dt.key = "tensor_layout" + dt.value = str(data_layout) + qa.tensor_name = tensor_name + qa.quant_parameter_tensor_names.append(dt) + qnt_annotations.append(qa) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index d47b687b65d93ec45d936afd91c08c117cf8dbc8..c77fd81c0bfaa77b458368807410b8bfec17abb7 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -109,6 +109,31 @@ class HLSCustomOp(CustomOp): ) return verilog_file + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + assert ( + code_gen_dir != "" + ), """Node attribute "code_gen_dir_ipgen" is + not set. Please run HLSSynthIP first.""" + verilog_path = "{}/project_{}/sol1/impl/verilog/".format( + code_gen_dir, self.onnx_node.name + ) + # default impl only returns the HLS verilog codegen dir + return [verilog_path] + + def get_all_verilog_filenames(self): + "Return list of all Verilog files used for this node." + + verilog_files = [] + verilog_paths = self.get_all_verilog_paths() + for verilog_path in verilog_paths: + for f in os.listdir(verilog_path): + if f.endswith(".v"): + verilog_files += [f] + return verilog_files + def prepare_rtlsim(self): """Creates a Verilator emulation library for the RTL code generated for this node, sets the rtlsim_so attribute to its path and returns @@ -116,24 +141,15 @@ class HLSCustomOp(CustomOp): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - # ensure that code is generated - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - assert ( - code_gen_dir != "" - ), """Node attribute "code_gen_dir_ipgen" is - not set. Please run HLSSynthIP first.""" - verilog_file = self.get_verilog_top_filename() - assert os.path.isfile(verilog_file), "Cannot find top-level Verilog file." 
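# Illustrative usage of the new layout annotation API from modelwrapper.py
# above (a sketch; assumes `model` is a loaded ModelWrapper and `inp_name`
# names a 4D tensor, mirroring tests/core/test_modelwrapper.py later in
# this diff):
# import finn.core.data_layout as DataLayout
# model.set_tensor_layout(inp_name, DataLayout.NCHW)
# assert model.get_tensor_layout(inp_name) == ["N", "C", "H", "W"]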
+ verilog_paths = self.get_all_verilog_paths() + verilog_files = self.get_all_verilog_filenames() # build the Verilator emu library sim = PyVerilator.build( - verilog_file, + verilog_files, build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=[ - "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, self.onnx_node.name - ) - ], + verilog_path=verilog_paths, trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), ) # save generated lib filename in attribute self.set_nodeattr("rtlsim_so", sim.lib._name) @@ -336,7 +352,7 @@ compilation transformations? sim.io.ap_clk = 1 sim.io.ap_clk = 0 - def rtlsim(self, sim, inp): + def rtlsim(self, sim, inp, inp2=None): """Runs the pyverilator simulation by passing the input values to the simulation, toggle the clock and observing the execution time. Function contains also an observation loop that can abort the simulation if no output value is produced @@ -368,6 +384,13 @@ compilation transformations? sim.io.in0_V_V_TDATA = inputs[0] if len(inputs) > 0 else 0 if sim.io.in0_V_V_TREADY == 1 and sim.io.in0_V_V_TVALID == 1: inputs = inputs[1:] + + if inp2 is not None: + sim.io.in1_V_V_TVALID = 1 if len(inp2) > 0 else 0 + sim.io.in1_V_V_TDATA = inp2[0] if len(inp2) > 0 else 0 + if sim.io.in1_V_V_TREADY == 1 and sim.io.in1_V_V_TVALID == 1: + inp2 = inp2[1:] + if sim.io.out_V_V_TVALID == 1 and sim.io.out_V_V_TREADY == 1: outputs = outputs + [sim.io.out_V_V_TDATA] sim.io.ap_clk = 1 diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..d5f5c1194d36e86b895610c084222db5ab9eb2bf --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -0,0 +1,358 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
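# Usage sketch for the two-stream rtlsim() extension above (illustrative;
# `op` is a hypothetical HLSCustomOp instance, while `sim` and the packed
# input lists come from prepare_rtlsim()/npy_to_rtlsim_input, exactly as in
# AddStreams_Batch.execute_node below):
# rtlsim_output = op.rtlsim(sim, rtlsim_inp0, rtlsim_inp1)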
+ +import os + +import numpy as np + +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from onnx import TensorProto, helper +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class AddStreams_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib AddStreams_Batch function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self): + return self.get_normal_input_shape() + + def get_folded_output_shape(self): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape."
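# Worked example for the folding above (illustrative values): with
# NumChannels=64, PE=16 and numInputVectors=[1], get_normal_input_shape()
# returns (1, 64) and get_folded_input_shape() returns (1, 4, 16).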
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + # check input datatype against property + exp_idt_name = self.get_input_datatype().name + idt_name = self.get_nodeattr("inputDataType") + assert exp_idt_name == idt_name, "Bad input DataType for AddStreams layer" + # enforce output data type + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required AddStreams_Batch attributes do not exist.""" + ) + + return info_messages + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + # we need to set output datatype to the next larger int or uint + # enhancement: consider specifying w/ explicit outputDataType attribute + # to allow overflow and use the same idt if user wants + idt = DataType[self.get_nodeattr("inputDataType")] + if idt.signed(): + return DataType.get_smallest_possible(2 * idt.min()) + else: + return DataType.get_smallest_possible(2 * idt.max()) + + def get_instream_width(self): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode!
Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input0 shape doesn't match expected shape.""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + # exact same thing for input1 + inp = context[node.input[1]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input1 shape doesn't match expected shape.""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp0 = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + rtlsim_inp1 = npy_to_rtlsim_input( + "{}/input_1.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode!
Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + self.code_gen_dict["$READNPYDATA$"] = [] + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + npy_in = "%s/input_1.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in1);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + node = self.onnx_node + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, {}, {}, {}, {}> (in0, in1, out, 1);""".format( + node.op_type, + self.get_nodeattr("PE"), + self.get_input_datatype().get_hls_datatype_str(), + self.get_input_datatype().get_hls_datatype_str(), + self.get_output_datatype().get_hls_datatype_str(), + self.get_number_output_values(), + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1, + hls::stream<ap_uint<{}>> &out)""".format( + self.onnx_node.name, + self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), + self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), + self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=in1") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) diff
--git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..7591f09d8d0cd1847672fe5aa09616ff1571033d --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py @@ -0,0 +1,331 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
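# For reference, the top-K selection this layer performs can be sketched in
# numpy (mirroring soft_verify_topk, added to finn.util.test in this diff):
# k_idx = np.flip(invec.flatten().argsort())[:k]  # indices of the k largest values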
+ +import os + +import numpy as np + +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from onnx import TensorProto, helper +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class LabelSelect_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib LabelSelect_Batch function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "Labels": ("i", True, 0), + "PE": ("i", True, 0), + "K": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self): + nlabels = self.get_nodeattr("Labels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [nlabels]) + return ishape + + def get_folded_input_shape(self): + nlabels = self.get_nodeattr("Labels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert nlabels % pe == 0, "PE must divide Labels" + assert pe == 1, "LabelSelect currently fails with folding" + folds = int(nlabels / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k]) + return oshape + + def get_folded_output_shape(self): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k, 1]) + return oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape."
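# Worked example for the shape methods above (illustrative values): with
# Labels=10, PE=1, K=5 and numInputVectors=[1], the normal/folded input
# shapes are (1, 10)/(1, 10, 1) and the normal/folded output shapes are
# (1, 5)/(1, 5, 1).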
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.int64) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.INT64, + dims=values.shape, + vals=values.flatten(), + ), + ) + + def infer_node_datatype(self, model): + # currently set to uint32 to be compatible with hlslib + # enhancement: consider finding smallest power-of-two int for reduced output bandwidth + model.set_tensor_datatype(self.onnx_node.output[0], DataType.UINT32) + + def verify_node(self): + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("Labels") + self.get_nodeattr("PE") + self.get_nodeattr("K") + self.get_nodeattr("inputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required LabelSelect_Batch attributes do not exist.""" + ) + + # verify that input data is 1D + if len(self.get_nodeattr("numInputVectors")) > 1: + info_messages.append("""LabelSelect_Batch requires 1D data input.""") + raise Exception + + return info_messages + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + assert ret.signed() is False, "LabelSelect is currently broken for signed inputs" + return ret + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType.UINT32 + + def get_instream_width(self): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self): + """Returns output stream width.""" + return self.get_output_datatype().bitwidth() + + def get_number_output_values(self): + return self.get_nodeattr("K") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + node = self.onnx_node + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, {}, {}, {}, ap_uint<32>> (in0, out, 1);""".format( + node.op_type, + self.get_nodeattr("Labels"), + self.get_nodeattr("PE"), + self.get_nodeattr("K"), + self.get_input_datatype().get_hls_datatype_str(), + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + 
packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}*{}>> &in0, + hls::stream<ap_uint<32>> &out)""".format( + self.onnx_node.name, + self.get_nodeattr("PE"), + self.get_input_datatype().bitwidth(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py index e72cc5b1a23f04bae978c6f32ba7e45760824349..ac1f49ee26d1c23bd1b0e67ae4ba7e0c2b55b435 100644 --- a/src/finn/custom_op/registry.py +++ b/src/finn/custom_op/registry.py @@ -44,6 +44,8 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch +from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch +from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch # create a mapping of all known CustomOp names and classes custom_op = {} @@ -60,6 +62,8 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch +custom_op["AddStreams_Batch"] = AddStreams_Batch +custom_op["LabelSelect_Batch"] = LabelSelect_Batch def getCustomOp(node): diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py index a185f5392c4b5ec848cd463e02ebab4be9c56a46..a4c751b844a2796447930eb74adad6321454ac09 100644 --- a/src/finn/transformation/fpgadataflow/make_deployment.py +++ b/src/finn/transformation/fpgadataflow/make_deployment.py @@ -68,10 +68,10 @@ class DeployToPYNQ(Transformation): for file in os.listdir(vivado_pynq_proj): if file.endswith(".bit"): bitfile = os.path.join(vivado_pynq_proj, file) + copy(bitfile, deployment_dir) elif file.endswith(".hwh"): hwhfile = os.path.join(vivado_pynq_proj, file) - copy(bitfile, deployment_dir) - copy(hwhfile, deployment_dir) + copy(hwhfile, deployment_dir) # driver.py and python libraries pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir") diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py index 64f4e3183082baada5fb97e49e4566525eddbc52..f51ffbcfd9f62e06bf4942409fbb163e92ff6370 100644 --- a/src/finn/transformation/general.py +++ b/src/finn/transformation/general.py @@ -86,8 +86,6 @@ class GiveUniqueParameterTensors(Transformation): other nodes apart from the one the system is currently operating on.""" def apply(self, model): - model_tensor_names = model.get_all_tensor_names() - graph = model.graph graph_modified = False seen_parameters = [] @@ -106,22 +104,11 @@ class 
GiveUniqueParameterTensors(Transformation): # first occurance seen_parameters += [node_input] continue - - # Give new name to tensor - for trials in range(10): - new_param_name = util.random_string(stringLength=6) - if new_param_name not in model_tensor_names: - break - else: - raise Exception( - "Not able to create new tensor name" - + "after 10 trials. Net too big for the random tensor" - + "name lenght chosen? Try larger stringLength?" - ) - - model_tensor_names += [new_param_name] + + new_param_name = model.make_new_valueinfo_name() model.set_initializer(new_param_name, input_init) + model.set_tensor_datatype(new_param_name, model.get_tensor_datatype(node_input)) # point node input to new tensor n.input[input_idx] = new_param_name diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py new file mode 100644 index 0000000000000000000000000000000000000000..9ac75578ffb911cc44cfddc2b2119b55e6abf2dd --- /dev/null +++ b/src/finn/transformation/infer_data_layouts.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import finn.custom_op.registry as registry +import finn.core.data_layout as DataLayout +from finn.transformation import Transformation +import warnings +from finn.util.basic import get_by_name + + +def _dims_to_layout(model, node, ndims): + if ndims == 2: + return DataLayout.NC + else: + if node.domain == "finn": + if node.op_type == "MultiThreshold": + mt_inst = registry.getCustomOp(node) + layout = mt_inst.get_nodeattr("data_layout") + if layout == "NHWC" and ndims == 4: + return DataLayout.NHWC + elif layout == "NCHW" and ndims == 4: + return DataLayout.NCHW + else: + return DataLayout.UNKNOWN + else: + if ndims == 4: + return DataLayout.NHWC + else: + return DataLayout.UNKNOWN + else: + # propagate input layout to output + # TODO this won't work for concat, squeeze/unsqueeze/reshape... + return model.get_tensor_layout(node.input[0]) + + +def _infer_node_data_layout(model, node): + """Infer output data layout annotation(s) for a particular node. 
+ Returns True if any changes were made.""" + old_layouts = list(map(lambda x: model.get_tensor_layout(x), node.output)) + if node.domain == "finn": + # try to guess based on number of output dims + for o in node.output: + ndims = len(model.get_tensor_shape(o)) + new_layout = _dims_to_layout(model, node, ndims) + model.set_tensor_layout(o, new_layout) + else: + if node.op_type == "Transpose": + # grab input annotation and switch it around using perm + perm = get_by_name(node.attribute, "perm").ints + inp_layout = model.get_tensor_layout(node.input[0]) + out_layout = [inp_layout[i] for i in perm] + model.set_tensor_layout(node.output[0], out_layout) + else: + # try to guess based on number of output dims + for o in node.output: + ndims = len(model.get_tensor_shape(o)) + model.set_tensor_layout(o, _dims_to_layout(model, node, ndims)) + # compare old and new output layouts to see if anything changed + new_layouts = list(map(lambda x: model.get_tensor_layout(x), node.output)) + graph_modified = new_layouts != old_layouts + return graph_modified + + +class InferDataLayouts(Transformation): + """Try to infer data layout annotations for all input/intermediate/output + tensors based on inputs and node type.""" + + def apply(self, model): + graph = model.graph + graph_modified = False + # first, make sure that the global input has an annotation + # this is really hard to do in general, so we do some bad guesswork + inp_name = graph.input[0].name + if model.get_tensor_layout(inp_name) is None: + inp_shape = model.get_tensor_shape(inp_name) + if len(inp_shape) == 4: + warnings.warn("Assuming 4D input is NCHW") + model.set_tensor_layout(inp_name, DataLayout.NCHW) + graph_modified = True + elif len(inp_shape) == 2: + graph_modified = True + warnings.warn("Assuming 2D input is NC") + model.set_tensor_layout(inp_name, DataLayout.NC) + else: + raise Exception( + """Unknown number of dims for input, don't know + how to annotate""" + ) + for node in graph.node: + graph_modified |= _infer_node_data_layout(model, node) + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 85354fac87f38c6a0ae424f3aeec24a72a36aad0..b91ffdb3f731d27d9a6ba68b090f3881e6d7293a 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -329,6 +329,83 @@ class MoveScalarMulPastConv(Transformation): return (model, graph_modified) +class MoveLinearPastEltwiseAdd(Transformation): + """Move linear operations (mul, add) past elementwise add operations where possible. + Specifically, matches and transforms the following patterns: + (x*C) + (y*C) -> (x + y) * C + (x+A) + (y+B) -> (x + y) + (A + B) + where x and y are dynamic inputs and A, B, C are constant tensors (in general). + """ + + def move_node(self, graph, n, prod0, prod1, node_ind): + # found!
move one of the muls to output, remove the other one + lin0_in0 = prod0.input[0] + lin1_in0 = prod1.input[0] + in0 = n.input[0] + out = n.output[0] + # TODO: check shapes don't change through scalar mul or add + # connect the eltwise add inputs to mul inputs + n.input[0] = lin0_in0 + n.input[1] = lin1_in0 + # connect mul0 output to eltwise add output + prod0.output[0] = out + # connect the input of mul0 and output of eltwise add together + n.output[0] = in0 + prod0.input[0] = in0 + # move prod0 node past eltwise add node, and remove prod1 + graph.node.remove(prod1) + graph.node.remove(prod0) + graph.node.insert(node_ind - 2, prod0) + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + nodes = [n for n in graph.node] + for n in nodes: + node_ind += 1 + if n.op_type == "Add": + # check for tensors on both inputs (eltwise add) + # scalar add has an initializer on one input + in0 = n.input[0] + in1 = n.input[1] + if in0 is None or in1 is None: + continue + A = model.get_initializer(in0) + B = model.get_initializer(in1) + if A is not None or B is not None: + continue + # check for mul with same initializer on both inputs + prod0 = model.find_producer(in0) + prod1 = model.find_producer(in1) + # Also check case when both branches are empty and come + # from the same node: (prod0 == prod1) + # another transform should handle that case + if prod0 is None or prod1 is None or (prod0 == prod1): + continue + init0 = model.get_initializer(prod0.input[1]) + init1 = model.get_initializer(prod1.input[1]) + # if either initializer is None, skip + if init0 is None or init1 is None: + continue + if prod0.op_type == "Mul" and prod1.op_type == "Mul": + if np.array_equal(init0, init1): + self.move_node(graph, n, prod0, prod1, node_ind) + node_ind -= 1 + graph_modified = True + elif prod0.op_type == "Add" and prod1.op_type == "Add": + init = init0 + init1 + # update initializer of prod0, which we'll move + model.set_initializer(prod0.input[1], init) + self.move_node(graph, n, prod0, prod1, node_ind) + node_ind -= 1 + graph_modified = True + else: + continue + model = model.transform(InferShapes()) + return (model, graph_modified) + + class MakeMaxPoolNHWC(Transformation): """Convert (MaxPool, NHWCTranpose) into (MaxPoolNHWC).""" diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index 9a2708439c0fed1e25c0d955af21cd2e9e705446..7b66d092107c27decca68926a0667333bebedbe0 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ -83,14 +83,27 @@ def pyverilate_stitched_ip(model): def file_to_dir(x): return os.path.dirname(os.path.realpath(x)) + def file_to_basename(x): + return os.path.basename(os.path.realpath(x)) + all_verilog_dirs = list(map(file_to_dir, all_verilog_srcs)) - top_verilog = model.get_metadata_prop("wrapper_filename") + all_verilog_files = list( + set( + filter( + lambda x: x.endswith(".v"), + list(map(file_to_basename, all_verilog_srcs)), + ) + ) + ) + top_module_name = model.get_metadata_prop("wrapper_filename") + top_module_name = os.path.splitext(file_to_basename(top_module_name))[0] build_dir = make_build_dir("pyverilator_ipstitched_") sim = PyVerilator.build( - top_verilog, + all_verilog_files, verilog_path=all_verilog_dirs, build_dir=build_dir, trace_depth=get_rtlsim_trace_depth(), + top_module_name=top_module_name, ) return sim diff --git a/src/finn/util/test.py b/src/finn/util/test.py index 4cad01b1f7ec58da7ba6d5460c072faa01202c55..b0985bddd3306f81ad0e81f0fb582a0ae7fdaf3d 100644 --- a/src/finn/util/test.py +++ 
b/src/finn/util/test.py @@ -27,6 +27,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from brevitas_examples import bnn_pynq +import numpy as np +import pytest +import warnings +from finn.core.modelwrapper import ModelWrapper # map of (wbits,abits) -> model example_map = { @@ -61,3 +65,22 @@ def get_test_model_trained(netname, wbits, abits): def get_test_model_untrained(netname, wbits, abits): "get_test_model with pretrained=False" return get_test_model(netname, wbits, abits, pretrained=False) + + +def soft_verify_topk(invec, idxvec, k): + """Check that the topK indices provided actually point to the topK largest + values in the input vector""" + np_topk = np.flip(invec.flatten().argsort())[:k] + soft_expected = invec.flatten()[np_topk.astype(np.int).flatten()] + soft_produced = invec.flatten()[idxvec.astype(np.int).flatten()] + return (soft_expected == soft_produced).all() + + +def load_test_checkpoint_or_skip(filename): + "Try to load given .onnx and return ModelWrapper, else skip current test." + try: + model = ModelWrapper(filename) + return model + except FileNotFoundError: + warnings.warn(filename + " not found from previous test step, skipping") + pytest.skip(filename + " not found from previous test step, skipping") diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py index da2b403d977f178ef9b73c758ba93e1e22f40041..4bd9385536bc6721c66726169dfa4c69e5f06772 100644 --- a/tests/core/test_modelwrapper.py +++ b/tests/core/test_modelwrapper.py @@ -31,6 +31,7 @@ import onnx from collections import Counter import brevitas.onnx as bo import numpy as np +import finn.core.data_layout as DataLayout from finn.core.modelwrapper import ModelWrapper from finn.util.test import get_test_model_trained @@ -67,6 +68,11 @@ def test_modelwrapper(): assert inp_cons.op_type == "MatMul" out_prod = model.find_producer(l0_inp_tensor_name) assert out_prod.op_type == "MultiThreshold" + inp_layout = model.get_tensor_layout(inp_name) + assert inp_layout is None + inp_layout = DataLayout.NCHW + model.set_tensor_layout(inp_name, inp_layout) + assert model.get_tensor_layout(inp_name) == inp_layout os.remove(export_onnx_path) diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py index 7dd45cbc732a50f8f41c1932601308f0dfd77c20..e6d1fc4efd61c01654ee88638698215d23a82eb3 100644 --- a/tests/end2end/test_end2end_cnv_w1a1.py +++ b/tests/end2end/test_end2end_cnv_w1a1.py @@ -36,7 +36,6 @@ import onnx # NOQA import pytest import pkg_resources as pk -from finn.core.modelwrapper import ModelWrapper from finn.custom_op.registry import getCustomOp from finn.core.onnx_exec import execute_onnx from finn.transformation.double_to_single_float import DoubleToSingleFloat @@ -69,7 +68,7 @@ from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ from finn.util.basic import pynq_part_map -from finn.util.test import get_test_model_trained +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -91,7 +90,7 @@ def test_end2end_cnv_w1a1_export(): def test_end2end_cnv_w1a1_import_and_tidy(): - model = ModelWrapper(build_dir + 
"/end2end_cnv_w1a1_export.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_export.onnx") model = model.transform(DoubleToSingleFloat()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) @@ -101,7 +100,7 @@ def test_end2end_cnv_w1a1_import_and_tidy(): def test_end2end_cnv_w1a1_streamline(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_tidy.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_tidy.onnx") model = model.transform(Streamline()) model = model.transform(LowerConvsToMatMul()) model = model.transform(MakeMaxPoolNHWC()) @@ -112,7 +111,9 @@ def test_end2end_cnv_w1a1_streamline(): def test_end2end_cnv_w1a1_convert_to_hls_layers(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_streamlined.onnx" + ) model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode)) model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode)) model = model.transform(to_hls.InferConvInpGen()) @@ -122,18 +123,22 @@ def test_end2end_cnv_w1a1_convert_to_hls_layers(): def test_end2end_cnv_w1a1_create_dataflow_partition(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_hls_layers.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_hls_layers.onnx" + ) parent_model = model.transform(CreateDataflowPartition()) parent_model.save(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx") sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") - dataflow_model = ModelWrapper(dataflow_model_filename) + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) dataflow_model.save(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx") def test_end2end_cnv_w1a1_fold_and_tlastmarker(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx" + ) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") # each tuple is (PE, SIMD, in_fifo_depth) for a layer folding = [ @@ -167,23 +172,27 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker(): model.save(build_dir + "/end2end_cnv_w1a1_folded.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_cnv_w1a1_gen_hls_ip(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_folded.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_folded.onnx") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(AnnotateResources("hls")) model.save(build_dir + "/end2end_cnv_w1a1_ipgen.onnx") +@pytest.mark.vivado def test_end2end_cnv_w1a1_ip_stitch(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipgen.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipgen.onnx") model = model.transform(ReplaceVerilogRelPaths()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.save(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx") +@pytest.mark.vivado def test_end2end_cnv_w1a1_verify_dataflow_part(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx") x = np.zeros((1, 32, 32, 3), dtype=np.float32) inp_name = model.graph.input[0].name out_name = model.graph.output[0].name 
@@ -212,9 +221,12 @@ def test_end2end_cnv_w1a1_verify_dataflow_part(): assert np.isclose(res_cppsim, res_rtlsim_whole).all() +@pytest.mark.vivado def test_end2end_cnv_w1a1_verify_all(): # use the streamlined model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name # load one of the test vectors @@ -228,22 +240,31 @@ def test_end2end_cnv_w1a1_verify_all(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name # produce results with cppsim sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipgen_cppsim.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w1a1_ipgen_cppsim.onnx") ret_cppsim = execute_onnx(parent_model, {iname: x}, True) y_cppsim = ret_cppsim[oname] # produce results with node-by-node rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_ipgen_nodebynode_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_cnv_w1a1_ipgen_nodebynode_rtlsim.onnx" ) ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True) y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname] # produce results with whole-network (stitched ip) rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx" ) @@ -257,27 +278,34 @@ def test_end2end_cnv_w1a1_verify_all(): assert np.argmax(y_golden) == 3 +@pytest.mark.vivado def test_end2end_cnv_w1a1_make_pynq_proj(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx") model = model.transform(MakePYNQProject(test_pynq_board)) model.save(build_dir + "/end2end_cnv_w1a1_pynq_project.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_cnv_w1a1_synth_pynq_project(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_pynq_project.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_pynq_project.onnx" + ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) model.save(build_dir + "/end2end_cnv_w1a1_synth.onnx") def test_end2end_cnv_w1a1_make_driver(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_synth.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_synth.onnx") model = model.transform(MakePYNQDriver()) model.save(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx") def test_end2end_cnv_w1a1_deploy_on_pynq(): - model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx" + ) try: ip = os.environ["PYNQ_IP"] # no fault for this one; skip if not defined if ip == "": @@ -295,7 +323,9 @@ def test_end2end_cnv_w1a1_deploy_on_pynq(): def test_end2end_cnv_w1a1_run_on_pynq(): # use the streamlined 
model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name # load one of the test vectors @@ -309,7 +339,9 @@ def test_end2end_cnv_w1a1_run_on_pynq(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name try: @@ -319,6 +351,7 @@ def test_end2end_cnv_w1a1_run_on_pynq(): # produce results with cppsim sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx") ret = execute_onnx(parent_model, {iname: x}, True) y = ret[oname] diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py index 74cd46549f45b7512a03da450e011c4f2e80e16e..1ba149687bb80a0f977115bd380a09f70eef23f1 100644 --- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py +++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py @@ -40,7 +40,6 @@ import onnx.numpy_helper as nph import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb -from finn.core.modelwrapper import ModelWrapper from finn.core.onnx_exec import execute_onnx from finn.core.throughput_test import throughput_test from finn.custom_op.registry import getCustomOp @@ -71,7 +70,7 @@ from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import Streamline from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import pynq_part_map -from finn.util.test import get_test_model_trained +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -92,7 +91,7 @@ def test_end2end_tfc_w1a1_export(): def test_end2end_tfc_w1a1_import_and_tidy(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_export.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_export.onnx") model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) @@ -102,13 +101,15 @@ def test_end2end_tfc_w1a1_import_and_tidy(): def test_end2end_tfc_w1a1_streamline(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_tidy.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_tidy.onnx") model = model.transform(Streamline()) model.save(build_dir + "/end2end_tfc_w1a1_streamlined.onnx") def test_end2end_tfc_w1a1_convert_to_hls_layers(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_streamlined.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_streamlined.onnx" + ) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(absorb.AbsorbAddIntoMultiThreshold()) model = 
model.transform(absorb.AbsorbMulIntoMultiThreshold()) @@ -118,18 +119,22 @@ def test_end2end_tfc_w1a1_convert_to_hls_layers(): def test_end2end_tfc_w1a1_create_dataflow_partition(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_hls_layers.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_hls_layers.onnx" + ) parent_model = model.transform(CreateDataflowPartition()) parent_model.save(build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx") sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") - dataflow_model = ModelWrapper(dataflow_model_filename) + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) dataflow_model.save(build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx") def test_end2end_tfc_w1a1_fold_and_tlastmarker(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx" + ) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer config = [ @@ -153,23 +158,27 @@ def test_end2end_tfc_w1a1_fold_and_tlastmarker(): model.save(build_dir + "/end2end_tfc_w1a1_folded.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_tfc_w1a1_gen_hls_ip(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_folded.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_folded.onnx") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(AnnotateResources("hls")) model.save(build_dir + "/end2end_tfc_w1a1_ipgen.onnx") +@pytest.mark.vivado def test_end2end_tfc_w1a1_ip_stitch(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_ipgen.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipgen.onnx") model = model.transform(ReplaceVerilogRelPaths()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.save(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx") +@pytest.mark.vivado def test_end2end_tfc_w1a1_verify_dataflow_part(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx") x = np.zeros((1, 784), dtype=np.float32) inp_name = model.graph.input[0].name out_name = model.graph.output[0].name @@ -196,9 +205,12 @@ def test_end2end_tfc_w1a1_verify_dataflow_part(): assert np.isclose(res_cppsim, res_rtlsim_whole).all() +@pytest.mark.vivado def test_end2end_tfc_w1a1_verify_all(): # use the streamlined model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_tfc_w1a1_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb") @@ -209,22 +221,31 @@ def test_end2end_tfc_w1a1_verify_all(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name # 
produce results with cppsim sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipstitch_cppsim.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a1_ipstitch_cppsim.onnx") ret_cppsim = execute_onnx(parent_model, {iname: x}, True) y_cppsim = ret_cppsim[oname] # produce results with node-by-node rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_ipstitch_nodebynode_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_tfc_w1a1_ipstitch_nodebynode_rtlsim.onnx" ) ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True) y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname] # produce results with whole-network (stitched ip) rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx" ) @@ -235,27 +256,34 @@ def test_end2end_tfc_w1a1_verify_all(): assert np.isclose(y_golden, y_whole_rtlsim).all() +@pytest.mark.vivado def test_end2end_tfc_w1a1_make_pynq_proj(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx") model = model.transform(MakePYNQProject(test_pynq_board)) model.save(build_dir + "/end2end_tfc_w1a1_pynq_project.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_tfc_w1a1_synth_pynq_project(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_pynq_project.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_pynq_project.onnx" + ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) model.save(build_dir + "/end2end_tfc_w1a1_synth.onnx") def test_end2end_tfc_w1a1_make_driver(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_synth.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_synth.onnx") model = model.transform(MakePYNQDriver()) model.save(build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx") def test_end2end_tfc_w1a1_deploy_on_pynq(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx" + ) try: ip = os.environ["PYNQ_IP"] # no fault for this one; skip if not defined if ip == "": @@ -273,7 +301,9 @@ def test_end2end_tfc_w1a1_deploy_on_pynq(): def test_end2end_tfc_w1a1_run_on_pynq(): # use the streamlined model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_tfc_w1a1_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb") @@ -285,7 +315,9 @@ def test_end2end_tfc_w1a1_run_on_pynq(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name try: @@ -295,11 +327,12 @@ def test_end2end_tfc_w1a1_run_on_pynq(): # produce results with cppsim sdp_node = 
parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_pynq_deploy.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a1_pynq_deploy.onnx") ret = execute_onnx(parent_model, {iname: x}, True) y = ret[oname] assert np.isclose(y, y_golden).all() - child_model = ModelWrapper(sdp_node.get_nodeattr("model")) + child_model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) res = throughput_test(child_model) assert res is not None diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py index 5ee2942845c41f4c6705b4ee3ecee89154d9faa9..d4c005a86580fb36e735beb00717fcfdffff21e5 100644 --- a/tests/end2end/test_end2end_tfc_w1a2.py +++ b/tests/end2end/test_end2end_tfc_w1a2.py @@ -39,7 +39,6 @@ import onnx # NOQA import onnx.numpy_helper as nph import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.core.modelwrapper import ModelWrapper from finn.core.onnx_exec import execute_onnx from finn.custom_op.registry import getCustomOp from finn.transformation.fold_constants import FoldConstants @@ -67,7 +66,7 @@ from finn.transformation.infer_datatypes import InferDataTypes from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import Streamline from finn.util.basic import pynq_part_map -from finn.util.test import get_test_model_trained +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -88,7 +87,7 @@ def test_end2end_tfc_w1a2_export(): def test_end2end_tfc_w1a2_import_and_tidy(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_export.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_export.onnx") model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) @@ -98,30 +97,36 @@ def test_end2end_tfc_w1a2_import_and_tidy(): def test_end2end_tfc_w1a2_streamline(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_tidy.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_tidy.onnx") model = model.transform(Streamline()) model.save(build_dir + "/end2end_tfc_w1a2_streamlined.onnx") def test_end2end_tfc_w1a2_convert_to_hls_layers(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_streamlined.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_streamlined.onnx" + ) model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode)) model.save(build_dir + "/end2end_tfc_w1a2_hls_layers.onnx") def test_end2end_tfc_w1a2_create_dataflow_partition(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_hls_layers.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_hls_layers.onnx" + ) parent_model = model.transform(CreateDataflowPartition()) parent_model.save(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx") sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") - dataflow_model = ModelWrapper(dataflow_model_filename) + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) dataflow_model.save(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx") def 
test_end2end_tfc_w1a2_fold_and_tlastmarker(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx" + ) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer config = [ @@ -145,23 +150,27 @@ def test_end2end_tfc_w1a2_fold_and_tlastmarker(): model.save(build_dir + "/end2end_tfc_w1a2_folded.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_tfc_w1a2_gen_hls_ip(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_folded.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_folded.onnx") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(AnnotateResources("hls")) model.save(build_dir + "/end2end_tfc_w1a2_ipgen.onnx") +@pytest.mark.vivado def test_end2end_tfc_w1a2_ip_stitch(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_ipgen.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipgen.onnx") model = model.transform(ReplaceVerilogRelPaths()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.save(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx") +@pytest.mark.vivado def test_end2end_tfc_w1a2_verify_dataflow_part(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx") x = np.zeros((1, 784), dtype=np.float32) inp_name = model.graph.input[0].name out_name = model.graph.output[0].name @@ -188,9 +197,12 @@ def test_end2end_tfc_w1a2_verify_dataflow_part(): assert np.isclose(res_cppsim, res_rtlsim_whole).all() +@pytest.mark.vivado def test_end2end_tfc_w1a2_verify_all(): # use the streamlined model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_tfc_w1a2_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb") @@ -201,22 +213,31 @@ def test_end2end_tfc_w1a2_verify_all(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name # produce results with cppsim sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipstitch_cppsim.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a2_ipstitch_cppsim.onnx") ret_cppsim = execute_onnx(parent_model, {iname: x}, True) y_cppsim = ret_cppsim[oname] # produce results with node-by-node rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_ipstitch_nodebynode_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_tfc_w1a2_ipstitch_nodebynode_rtlsim.onnx" ) ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True) y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname] # produce results with whole-network (stitched ip) rtlsim + load_test_checkpoint_or_skip( + 
build_dir + "/end2end_tfc_w1a2_ipstitch_whole_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_tfc_w1a2_ipstitch_whole_rtlsim.onnx" ) @@ -227,27 +248,34 @@ def test_end2end_tfc_w1a2_verify_all(): assert np.isclose(y_golden, y_whole_rtlsim).all() +@pytest.mark.vivado def test_end2end_tfc_w1a2_make_pynq_proj(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx") model = model.transform(MakePYNQProject(test_pynq_board)) model.save(build_dir + "/end2end_tfc_w1a2_pynq_project.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_tfc_w1a2_synth_pynq_project(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_pynq_project.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_pynq_project.onnx" + ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) model.save(build_dir + "/end2end_tfc_w1a2_synth.onnx") def test_end2end_tfc_w1a2_make_driver(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_synth.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_synth.onnx") model = model.transform(MakePYNQDriver()) model.save(build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx") def test_end2end_tfc_w1a2_deploy_on_pynq(): - model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx" + ) try: ip = os.environ["PYNQ_IP"] # no fault for this one; skip if not defined if ip == "": @@ -265,7 +293,9 @@ def test_end2end_tfc_w1a2_deploy_on_pynq(): def test_end2end_tfc_w1a2_run_on_pynq(): # use the streamlined model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_tfc_w1a2_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb") @@ -277,7 +307,9 @@ def test_end2end_tfc_w1a2_run_on_pynq(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name try: @@ -287,6 +319,7 @@ def test_end2end_tfc_w1a2_run_on_pynq(): # produce results with cppsim sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx") ret = execute_onnx(parent_model, {iname: x}, True) y = ret[oname] diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py index 2477318efd1e02b0865dadb40bad1a74ac8ea0b4..19d3f86e046658c4080d71984df1cff74008adab 100644 --- a/tests/end2end/test_end2end_tfc_w2a2.py +++ b/tests/end2end/test_end2end_tfc_w2a2.py @@ -39,7 +39,6 @@ import onnx # NOQA import onnx.numpy_helper as nph import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.core.modelwrapper import ModelWrapper from finn.core.onnx_exec import execute_onnx from finn.custom_op.registry import 
getCustomOp from finn.transformation.fold_constants import FoldConstants @@ -67,7 +66,7 @@ from finn.transformation.infer_datatypes import InferDataTypes from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import Streamline from finn.util.basic import pynq_part_map -from finn.util.test import get_test_model_trained +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -88,7 +87,7 @@ def test_end2end_tfc_w2a2_export(): def test_end2end_tfc_w2a2_import_and_tidy(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_export.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_export.onnx") model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) @@ -98,30 +97,36 @@ def test_end2end_tfc_w2a2_import_and_tidy(): def test_end2end_tfc_w2a2_streamline(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_tidy.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_tidy.onnx") model = model.transform(Streamline()) model.save(build_dir + "/end2end_tfc_w2a2_streamlined.onnx") def test_end2end_tfc_w2a2_convert_to_hls_layers(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_streamlined.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_streamlined.onnx" + ) model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode)) model.save(build_dir + "/end2end_tfc_w2a2_hls_layers.onnx") def test_end2end_tfc_w2a2_create_dataflow_partition(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_hls_layers.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_hls_layers.onnx" + ) parent_model = model.transform(CreateDataflowPartition()) parent_model.save(build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx") sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") - dataflow_model = ModelWrapper(dataflow_model_filename) + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) dataflow_model.save(build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx") def test_end2end_tfc_w2a2_fold_and_tlastmarker(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx" + ) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer config = [ @@ -145,23 +150,27 @@ def test_end2end_tfc_w2a2_fold_and_tlastmarker(): model.save(build_dir + "/end2end_tfc_w2a2_folded.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_tfc_w2a2_gen_hls_ip(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_folded.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_folded.onnx") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(AnnotateResources("hls")) model.save(build_dir + "/end2end_tfc_w2a2_ipgen.onnx") +@pytest.mark.vivado def test_end2end_tfc_w2a2_ip_stitch(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_ipgen.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipgen.onnx") model = 
model.transform(ReplaceVerilogRelPaths()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.save(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx") +@pytest.mark.vivado def test_end2end_tfc_w2a2_verify_dataflow_part(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx") x = np.zeros((1, 784), dtype=np.float32) inp_name = model.graph.input[0].name out_name = model.graph.output[0].name @@ -188,9 +197,12 @@ def test_end2end_tfc_w2a2_verify_dataflow_part(): assert np.isclose(res_cppsim, res_rtlsim_whole).all() +@pytest.mark.vivado def test_end2end_tfc_w2a2_verify_all(): # use the streamlined model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_tfc_w2a2_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb") @@ -201,22 +213,31 @@ def test_end2end_tfc_w2a2_verify_all(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name # produce results with cppsim sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipstitch_cppsim.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w2a2_ipstitch_cppsim.onnx") ret_cppsim = execute_onnx(parent_model, {iname: x}, True) y_cppsim = ret_cppsim[oname] # produce results with node-by-node rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_ipstitch_nodebynode_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_tfc_w2a2_ipstitch_nodebynode_rtlsim.onnx" ) ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True) y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname] # produce results with whole-network (stitched ip) rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_ipstitch_whole_rtlsim.onnx" + ) sdp_node.set_nodeattr( "model", build_dir + "/end2end_tfc_w2a2_ipstitch_whole_rtlsim.onnx" ) @@ -227,27 +248,34 @@ def test_end2end_tfc_w2a2_verify_all(): assert np.isclose(y_golden, y_whole_rtlsim).all() +@pytest.mark.vivado def test_end2end_tfc_w2a2_make_pynq_proj(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx") model = model.transform(MakePYNQProject(test_pynq_board)) model.save(build_dir + "/end2end_tfc_w2a2_pynq_project.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_end2end_tfc_w2a2_synth_pynq_project(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_pynq_project.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_pynq_project.onnx" + ) model = model.transform(SynthPYNQProject()) model = model.transform(AnnotateResources("synth")) model.save(build_dir + "/end2end_tfc_w2a2_synth.onnx") def test_end2end_tfc_w2a2_make_driver(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_synth.onnx") + model = 
load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_synth.onnx") model = model.transform(MakePYNQDriver()) model.save(build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx") def test_end2end_tfc_w2a2_deploy_on_pynq(): - model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx" + ) try: ip = os.environ["PYNQ_IP"] # no fault for this one; skip if not defined if ip == "": @@ -265,7 +293,9 @@ def test_end2end_tfc_w2a2_deploy_on_pynq(): def test_end2end_tfc_w2a2_run_on_pynq(): # use the streamlined model as the "golden" model for right answers - golden = ModelWrapper(build_dir + "/end2end_tfc_w2a2_streamlined.onnx") + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_streamlined.onnx" + ) iname = golden.graph.input[0].name oname = golden.graph.output[0].name raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb") @@ -277,7 +307,9 @@ def test_end2end_tfc_w2a2_run_on_pynq(): y_golden = ret_golden[oname] # set up parent+child graph to test # we'll use models from the previous step as the child model - parent_model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx") + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx" + ) iname = parent_model.graph.input[0].name oname = parent_model.graph.output[0].name try: @@ -287,6 +319,7 @@ def test_end2end_tfc_w2a2_run_on_pynq(): # produce results with cppsim sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_pynq_deploy.onnx") sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w2a2_pynq_deploy.onnx") ret = execute_onnx(parent_model, {iname: x}, True) y = ret[oname] diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py index 1228a9c79608a1c7eb44900ddb7df54ed900a3c2..24933759830535dfcec768d47a6020b4f3e2de35 100644 --- a/tests/fpgadataflow/test_code_gen_trafo.py +++ b/tests/fpgadataflow/test_code_gen_trafo.py @@ -29,13 +29,14 @@ import os from onnx import TensorProto, helper - +import pytest import finn.util.basic as util from finn.core.datatype import DataType from finn.core.modelwrapper import ModelWrapper from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +@pytest.mark.vivado def test_code_gen_trafo(): idt = wdt = odt = DataType.BIPOLAR mw = 8 diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py index 35eed02f4e71a96f9f4e8957c372f93e6cd7927c..65894e02e490f6931e5b03a9aa67b8f22e32583a 100644 --- a/tests/fpgadataflow/test_compilation_trafo.py +++ b/tests/fpgadataflow/test_compilation_trafo.py @@ -30,6 +30,7 @@ import os from onnx import TensorProto, helper +import pytest import finn.util.basic as util from finn.core.datatype import DataType from finn.core.modelwrapper import ModelWrapper @@ -37,6 +38,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +@pytest.mark.vivado def test_compilation_trafo(): idt = wdt = odt = DataType.BIPOLAR mw = 8 diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py index 220f8a7966a146f954a7fcb3f32058e231b83e23..e03090f0581eebf68cac7baffb6888a6992df68d 100644 --- 
a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py @@ -31,7 +31,7 @@ import pkg_resources as pk import brevitas.onnx as bo import numpy as np - +import pytest import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb from finn.transformation.streamline.reorder import MakeMaxPoolNHWC @@ -53,6 +53,7 @@ from finn.custom_op.registry import getCustomOp export_onnx_path_cnv = "test_output_cnv.onnx" +@pytest.mark.vivado def test_convert_to_hls_layers_cnv_w1a1(): cnv = get_test_model_trained("CNV", 1, 1) bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py index b7dea03797bc5de5e7517d0d8b816c438027008b..e261a3114853bf24bdb4c931c46ff92eea4150dd 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py @@ -34,7 +34,7 @@ import numpy as np import onnx import onnx.numpy_helper as nph import torch - +import pytest import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb @@ -56,6 +56,7 @@ export_onnx_path = "test_output_tfc.onnx" export_onnx_path_cnv = "test_output_cnv.onnx" +@pytest.mark.vivado def test_convert_to_hls_layers_tfc_w1a1(): tfc = get_test_model_trained("TFC", 1, 1) bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path) @@ -125,6 +126,7 @@ def test_convert_to_hls_layers_tfc_w1a1(): assert np.isclose(produced, expected, atol=1e-3).all() +@pytest.mark.vivado def test_convert_to_hls_layers_tfc_w1a2(): tfc = get_test_model_trained("TFC", 1, 2) bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path) diff --git a/tests/fpgadataflow/test_create_dataflow_partition.py b/tests/fpgadataflow/test_create_dataflow_partition.py index 77e0ddeebf6080e1840d6014978a4c9b4a10b5c1..c4f748051ff038371353574298580f3bf9e05e9f 100644 --- a/tests/fpgadataflow/test_create_dataflow_partition.py +++ b/tests/fpgadataflow/test_create_dataflow_partition.py @@ -29,7 +29,6 @@ import os.path from pkgutil import get_data -import pytest from finn.core.modelwrapper import ModelWrapper from finn.custom_op.registry import getCustomOp @@ -38,11 +37,11 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import ( ) from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker from finn.util.basic import make_build_dir +from finn.util.test import load_test_checkpoint_or_skip build_dir = make_build_dir("test_dataflow_partition_") -@pytest.mark.dependency() def test_dataflow_partition_create(): # load the onnx model raw_m = get_data( @@ -57,9 +56,10 @@ def test_dataflow_partition_create(): model.save(build_dir + "/test_dataflow_partition_create.onnx") -@pytest.mark.dependency(depends=["test_dataflow_partition_create"]) def test_dataflow_partition_tlastmarker(): - model = ModelWrapper(build_dir + "/test_dataflow_partition_create.onnx") + model = load_test_checkpoint_or_skip( + build_dir + "/test_dataflow_partition_create.onnx" + ) model_path = getCustomOp(model.graph.node[2]).get_nodeattr("model") model = ModelWrapper(model_path) model = model.transform(InsertTLastMarker()) diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py new file mode 100644 index 0000000000000000000000000000000000000000..f94784457a43718516e76946269fc47119423b24 --- /dev/null 
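The new AddStreams test that follows (and the LabelSelect test later in this patch) parametrizes folding as a divisor of the channel count: fold == -1 requests full folding with a single processing element, and otherwise pe = ch // fold elements are instantiated, with the assertion guaranteeing that PE evenly divides the channels. A small worked illustration of that mapping, reusing the names from the test code::

    # illustration only; mirrors the pe/fold computation in the tests below
    def fold_to_pe(ch, fold):
        if fold == -1:
            return 1  # fully folded: one PE processes all channels
        pe = max(1, ch // fold)
        assert ch % pe == 0, "PE must evenly divide the channel count"
        return pe


    assert fold_to_pe(64, 2) == 32   # 32 PEs, two channels mapped to each
    assert fold_to_pe(64, 1) == 64   # fully parallel, one channel per PE
    assert fold_to_pe(64, -1) == 1   # one PE time-multiplexed over all 64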
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) + + +def make_addstreams_modelwrapper(ch, pe, idt): + inp1 = helper.make_tensor_value_info("inp1", TensorProto.FLOAT, [1, ch]) + inp2 = helper.make_tensor_value_info("inp2", TensorProto.FLOAT, [1, ch]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) + + addstreams_node = helper.make_node( + "AddStreams_Batch", + ["inp1", "inp2"], + ["outp"], + domain="finn", + backend="fpgadataflow", + NumChannels=ch, + PE=pe, + inputDataType=idt.name, + ) + graph = helper.make_graph( + nodes=[addstreams_node], name="graph", inputs=[inp1, inp2], outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="addstreams-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp1", idt) + model.set_tensor_datatype("inp2", idt) + + return model + + +def prepare_inputs(input1, input2): + return {"inp1": input1, "inp2": input2} + + +# data types +@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT8]) +# channels +@pytest.mark.parametrize("ch", [1, 64]) +# folding +@pytest.mark.parametrize("fold", [-1, 2, 1]) +# execution mode 
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): + if fold == -1: + pe = 1 + else: + pe = max(1, ch // fold) + assert ch % pe == 0 + + # generate input data + x1 = gen_finn_dt_tensor(idt, (1, ch)) + x2 = gen_finn_dt_tensor(idt, (1, ch)) + + model = make_addstreams_modelwrapper(ch, pe, idt) + + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # prepare input data + input_dict = prepare_inputs(x1, x2) + + oshape = model.get_tensor_shape("outp") + y = x1 + x2 + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), exec_mode + " failed" diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 02a9acae5e0e90d2a8dfa7d4d4afb03aa11f4239..5051bf34dc690daf8b6186859d3717cc8e217eee 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -137,6 +137,8 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") @pytest.mark.parametrize("simd", [1, 2]) +@pytest.mark.slow +@pytest.mark.vivado def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd): ofm_dim = int(((ifm_dim - k) / stride) + 1) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index a230fb4201673e3bf0a31cf9ec82f21250fd9e40..1d83f7a23cd3bad757e772055d242799cf22b0da 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -55,6 +55,8 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.parametrize("OUTWidth", [2, 4]) # finn_dtype @pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR, DataType.INT2]) +@pytest.mark.slow +@pytest.mark.vivado def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype): # generate input data diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py index 416d96d5dbfa1125d878eb8339ae38f5d572d1ce..fc5cdb7745945bee99564ba9ab19423a66d8e035 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py +++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py @@ -149,6 +149,8 @@ def prepare_inputs(input_tensor, idt, wdt): @pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [16]) +@pytest.mark.slow +@pytest.mark.vivado def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh @@ -234,6 +236,8 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [16]) +@pytest.mark.slow +@pytest.mark.vivado def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, 
act, nf, sf, mw, mh): if nf == -1: nf = mh @@ -324,6 +328,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +@pytest.mark.vivado def test_fpgadataflow_fclayer_large_depth_decoupled_mode( mem_mode, idt, wdt, act, nf, sf, mw, mh ): diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index 9158a0b0e72017b2468627e4f30fd3432c418d38..94090a47ad64fc377530e6e21d35661e1d92b5a6 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -76,6 +76,8 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.parametrize("depth", [16]) # finn_dtype @pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR]) # , DataType.INT2]) +@pytest.mark.slow +@pytest.mark.vivado def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): # generate input data diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index bbfa153c6fd3959d07f8f1883fa88949072f3511..b46391daf629e97c24c2950aefad3cbc5055c345 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -63,7 +63,7 @@ def make_accpool_modelwrapper(ch, pe, idim, idt): numInputVectors=[1, idim, idim], ) graph = helper.make_graph( - nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp], + nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp] ) model = helper.make_model(graph, producer_name="thresholding-model") @@ -88,6 +88,7 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("imdim", [7]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): if fold == -1: pe = 1 diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py index 30b86d639ae52143320dfdfeb25488bae865b4d2..16100522aa94fd25d234efa1d03edfdc866ca1bb 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py @@ -52,6 +52,7 @@ import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp from finn.transformation.general import GiveUniqueNodeNames from finn.util.basic import gen_finn_dt_tensor, pynq_part_map from finn.util.fpgadataflow import pyverilate_stitched_ip +from finn.util.test import load_test_checkpoint_or_skip test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") test_fpga_part = pynq_part_map[test_pynq_board] @@ -198,13 +199,14 @@ def create_two_fc_model(): # exec_mode of StreamingDataflowPartition # @pytest.mark.parametrize("exec_mode", ["remote_pynq"]) #, "rtlsim"]) +@pytest.mark.vivado def test_fpgadataflow_ipstitch_gen_model(): # exec_mode): model = create_one_fc_model() if model.graph.node[0].op_type == "StreamingDataflowPartition": sdp_node = getCustomOp(model.graph.node[0]) assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" assert os.path.isfile(sdp_node.get_nodeattr("model")) - model = ModelWrapper(sdp_node.get_nodeattr("model")) + model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) model.set_metadata_prop("exec_mode", "remote_pynq") model = model.transform(InsertTLastMarker()) model = model.transform(GiveUniqueNodeNames()) @@ -215,8 +217,9 @@ def 
test_fpgadataflow_ipstitch_gen_model(): # exec_mode): model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx") +@pytest.mark.vivado def test_fpgadataflow_ipstitch_do_stitch(): - model = ModelWrapper( + model = load_test_checkpoint_or_skip( ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx" ) model = model.transform(rvp.ReplaceVerilogRelPaths()) @@ -231,8 +234,11 @@ def test_fpgadataflow_ipstitch_do_stitch(): model.save(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx") +@pytest.mark.vivado def test_fpgadataflow_ipstitch_rtlsim(): - model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx") + model = load_test_checkpoint_or_skip( + ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx" + ) model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd") sim = pyverilate_stitched_ip(model) exp_io = [ @@ -275,8 +281,11 @@ def test_fpgadataflow_ipstitch_rtlsim(): assert (rtlsim_res == x).all() +@pytest.mark.vivado def test_fpgadataflow_ipstitch_pynq_projgen(): - model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx") + model = load_test_checkpoint_or_skip( + ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx" + ) model = model.transform(MakePYNQProject(test_pynq_board)) vivado_pynq_proj_dir = model.get_metadata_prop("vivado_pynq_proj") assert vivado_pynq_proj_dir is not None @@ -284,8 +293,12 @@ def test_fpgadataflow_ipstitch_pynq_projgen(): model.save(ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx") +@pytest.mark.slow +@pytest.mark.vivado def test_fpgadataflow_ipstitch_pynq_synth(): - model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx") + model = load_test_checkpoint_or_skip( + ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx" + ) model = model.transform(SynthPYNQProject()) bitfile = model.get_metadata_prop("vivado_pynq_bitfile") assert bitfile is not None @@ -294,7 +307,9 @@ def test_fpgadataflow_ipstitch_pynq_synth(): def test_fpgadataflow_ipstitch_pynq_driver(): - model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx") + model = load_test_checkpoint_or_skip( + ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_synth.onnx" + ) model = model.transform(MakePYNQDriver()) driver_dir = model.get_metadata_prop("pynq_driver_dir") assert driver_dir is not None @@ -303,13 +318,13 @@ def test_fpgadataflow_ipstitch_pynq_driver(): def test_fpgadataflow_ipstitch_pynq_deployment_folder(): + model = load_test_checkpoint_or_skip( + ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx" + ) try: ip = os.environ["PYNQ_IP"] # no default for this one; skip if not defined if ip == "": pytest.skip("PYNQ board IP address not specified") - model = ModelWrapper( - ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx" - ) username = os.getenv("PYNQ_USERNAME", "xilinx") password = os.getenv("PYNQ_PASSWORD", "xilinx") port = os.getenv("PYNQ_PORT", 22) @@ -341,7 +356,7 @@ def test_fpgadataflow_ipstitch_remote_execution(): ip = os.environ["PYNQ_IP"] # NOQA if ip == "": pytest.skip("PYNQ board IP address not specified") - model = ModelWrapper( + model = load_test_checkpoint_or_skip( ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_deployment.onnx" ) iname = "inp" diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py new file mode 100644 index 0000000000000000000000000000000000000000..2df841728395229dafe33d2804c44a3489ef3e45 --- /dev/null +++ 
b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.util.basic import gen_finn_dt_tensor +from finn.util.test import soft_verify_topk +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) + + +def make_labelselect_modelwrapper(labels, pe, k, idt): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, labels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, k]) + + labelselect_node = helper.make_node( + "LabelSelect_Batch", + ["inp"], + ["outp"], + domain="finn", + backend="fpgadataflow", + Labels=labels, + PE=pe, + K=k, + inputDataType=idt.name, + ) + graph = helper.make_graph( + nodes=[labelselect_node], name="graph", inputs=[inp], outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="thresholding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", DataType.UINT32) + + return model + + +def prepare_inputs(input_tensor, idt): + return {"inp": input_tensor} + + +# TODO: folded inputs fail, likely problem in hlslib +# input datatype -- checked by assertion in HLSCustomOp +@pytest.mark.parametrize("idt", [DataType.UINT8, DataType.UINT16]) +# labels +@pytest.mark.parametrize("labels", [10, 1000]) +# folding 
+@pytest.mark.parametrize("fold", [-1]) +# number of top labels to select +@pytest.mark.parametrize("k", [1, 5]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): + if fold == -1: + pe = 1 + else: + pe = labels // fold + assert labels % pe == 0 + + if k == -1: + k = labels + + # generate input data + x = gen_finn_dt_tensor(idt, (1, labels)) + + model = make_labelselect_modelwrapper(labels, pe, k, idt) + + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # prepare input data and execute + input_dict = prepare_inputs(x, idt) + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert soft_verify_topk(x, y, k), exec_mode + " failed" diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py index ac4ab33469c7720c3d7b9f30f5d13be888e1439d..bda66bebbd93d346eb0026b17cbaff9a7ca5df5e 100644 --- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py +++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py @@ -121,6 +121,8 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("ifm_ch", [1, 2]) # , 2, 3, 4]) # execution mode @pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"]) +@pytest.mark.slow +@pytest.mark.vivado def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode): stride = k ofm_dim = int(((ifm_dim - k) / stride) + 1) diff --git a/tests/transformation/test_infer_data_layouts.py b/tests/transformation/test_infer_data_layouts.py new file mode 100644 index 0000000000000000000000000000000000000000..fccc7813da6f98c8af4ade7ae562c99b32247a8b --- /dev/null +++ b/tests/transformation/test_infer_data_layouts.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os + +import brevitas.onnx as bo +import finn.transformation.streamline.absorb as absorb +from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fold_constants import FoldConstants +from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline import Streamline +from finn.util.test import get_test_model_trained +from finn.transformation.double_to_single_float import DoubleToSingleFloat +from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.infer_data_layouts import InferDataLayouts +import finn.core.data_layout as DataLayout + +export_onnx_path_cnv = "test_output_cnv.onnx" + + +def test_infer_data_layouts(): + cnv = get_test_model_trained("CNV", 1, 1) + bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv) + model = ModelWrapper(export_onnx_path_cnv) + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(Streamline()) + model = model.transform(InferDataLayouts()) + + assert model.get_tensor_layout("global_in") == DataLayout.NCHW + assert model.get_tensor_layout("Conv_0_out0") == DataLayout.NCHW + assert model.get_tensor_layout("MaxPool_0_out0") == DataLayout.NCHW + assert model.get_tensor_layout("MultiThreshold_6_out0") == DataLayout.NCHW + assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC + assert model.get_tensor_layout("MatMul_0_out0") == DataLayout.NC + assert model.get_tensor_layout("global_out") == DataLayout.NC + + model = model.transform(LowerConvsToMatMul()) + model = model.transform(MakeMaxPoolNHWC()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataLayouts()) + + assert model.get_tensor_layout("global_in") == DataLayout.NCHW + assert model.get_tensor_layout("Transpose_0_out0") == DataLayout.NHWC + assert model.get_tensor_layout("Im2Col_0_out0") == DataLayout.NHWC + # note: im2col output isn't really NHWC or any other common layout + # since the concept of channels changes with lowering... 
but it is + # conceptually close to NHWC since the innermost dim gets multiplied + assert model.get_tensor_layout("MatMul_0_out0") == DataLayout.NHWC + assert model.get_tensor_layout("Transpose_1_out0") == DataLayout.NCHW + assert model.get_tensor_layout("Transpose_2_out0") == DataLayout.NHWC + assert model.get_tensor_layout("MaxPoolNHWC_0_out0") == DataLayout.NHWC + assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC + assert model.get_tensor_layout("global_out") == DataLayout.NC + + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(ConvertBipolarMatMulToXnorPopcount()) + model = model.transform(Streamline()) + model = model.transform(to_hls.InferBinaryStreamingFCLayer()) + model = model.transform(to_hls.InferQuantizedStreamingFCLayer()) + model = model.transform(to_hls.InferConvInpGen()) + model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataLayouts()) + + assert model.get_tensor_layout("global_in") == DataLayout.NCHW + assert model.get_tensor_layout("Transpose_0_out0") == DataLayout.NHWC + # note: im2col output isn't really NHWC or any other common layout + # since the concept of channels changes with lowering... but it is + # conceptually close to NHWC since the innermost dim gets multiplied + assert ( + model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC + ) + assert model.get_tensor_layout("StreamingFCLayer_Batch_3_out0") == DataLayout.NHWC + assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC + assert model.get_tensor_layout("StreamingFCLayer_Batch_6_out0") == DataLayout.NC + assert model.get_tensor_layout("global_out") == DataLayout.NC + + os.remove(export_onnx_path_cnv) diff --git a/tests/transformation/test_infer_datatypes.py b/tests/transformation/test_infer_datatypes.py index 77b6a94f8ed891a4fe761fe864a6e18d35e84382..e3db40289c4318894cf5ad41c2f67b3bff501db9 100644 --- a/tests/transformation/test_infer_datatypes.py +++ b/tests/transformation/test_infer_datatypes.py @@ -54,8 +54,8 @@ def test_infer_datatypes(): assert model.get_tensor_datatype("MatMul_1_out0") == DataType.INT32 assert model.get_tensor_datatype("MatMul_2_out0") == DataType.INT32 assert model.get_tensor_datatype("MatMul_3_out0") == DataType.INT32 - assert model.get_tensor_datatype("Sign_0_out0") == DataType.BIPOLAR - assert model.get_tensor_datatype("Sign_1_out0") == DataType.BIPOLAR - assert model.get_tensor_datatype("Sign_2_out0") == DataType.BIPOLAR - assert model.get_tensor_datatype("Sign_3_out0") == DataType.BIPOLAR + assert model.get_tensor_datatype("MultiThreshold_0_out0") == DataType.BIPOLAR + assert model.get_tensor_datatype("MultiThreshold_1_out0") == DataType.BIPOLAR + assert model.get_tensor_datatype("MultiThreshold_2_out0") == DataType.BIPOLAR + assert model.get_tensor_datatype("MultiThreshold_3_out0") == DataType.BIPOLAR os.remove(export_onnx_path) diff --git a/tests/transformation/test_linear_past_eltwise.py b/tests/transformation/test_linear_past_eltwise.py new file mode 100644 index 0000000000000000000000000000000000000000..b77f59779a5e8559f80e017d13b66bcb67249830 --- /dev/null +++ b/tests/transformation/test_linear_past_eltwise.py @@ -0,0 +1,201 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import numpy as np
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+
+import pytest
+
+export_onnx_path = "test_linear_past_eltwise.onnx"
+
+# construct a synthetic graph to test MoveLinearPastEltwiseAdd:
+# two parallel branches, each a scalar Add followed by a scalar Mul,
+# feed an elementwise Add; the transform should move the (equal)
+# linear ops past the elementwise Add so they can be merged
+
+
+def make_model(shape):
+    inp1 = helper.make_tensor_value_info("inp1", TensorProto.FLOAT, shape)
+    inp2 = helper.make_tensor_value_info("inp2", TensorProto.FLOAT, shape)
+    inp1_add = helper.make_tensor_value_info("inp1_add", TensorProto.FLOAT, shape)
+    inp1_add_ct = helper.make_tensor_value_info("inp1_add_ct", TensorProto.FLOAT, [1])
+    inp2_add = helper.make_tensor_value_info("inp2_add", TensorProto.FLOAT, shape)
+    inp2_add_ct = helper.make_tensor_value_info("inp2_add_ct", TensorProto.FLOAT, [1])
+    inp1_mul = helper.make_tensor_value_info("inp1_mul", TensorProto.FLOAT, shape)
+    inp1_mul_ct = helper.make_tensor_value_info("inp1_mul_ct", TensorProto.FLOAT, [1])
+    inp2_mul = helper.make_tensor_value_info("inp2_mul", TensorProto.FLOAT, shape)
+    inp2_mul_ct = helper.make_tensor_value_info("inp2_mul_ct", TensorProto.FLOAT, [1])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
+
+    add1_node = helper.make_node("Add", [inp1.name, inp1_add_ct.name], [inp1_add.name])
+    add2_node = helper.make_node("Add", [inp2.name, inp2_add_ct.name], [inp2_add.name])
+    mul1_node = helper.make_node(
+        "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]
+    )
+    mul2_node = helper.make_node(
+        "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]
+    )
+    eltwise_add_node = helper.make_node(
+        "Add", [inp1_mul.name, inp2_mul.name], [outp.name]
+    )
+    graph = helper.make_graph(
+        nodes=[add1_node, add2_node, mul1_node, mul2_node, eltwise_add_node],
+        name="graph",
+        inputs=[inp1, inp2],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="add-model")
+    model = ModelWrapper(model)
+
+    # set initializers for scalar add/mul nodes
+    model.set_initializer(add1_node.input[1], np.array([7.0]))
+    model.set_initializer(add2_node.input[1], np.array([8.0]))
+    model.set_initializer(mul1_node.input[1], np.array([3.0]))
+    model.set_initializer(mul2_node.input[1], np.array([3.0]))
+
+    return model
+
+
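+# expected result for the graph built by make_model:
+# (x1 + 7)*3 + (x2 + 8)*3 = 3*(x1 + x2) + 45 = 3.0 * ((x1 + x2) + 15.0)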
+# channels
+@pytest.mark.parametrize("ch", [64])
+# ifmdim
+@pytest.mark.parametrize("ifmdim", [-1, 7])
+def test_linear_past_eltwise_add(ch, ifmdim):
+    # generate test vectors of correct shape
+    if ifmdim == -1:
+        input_tensor_shape = (1, ch)
+    else:
+        input_tensor_shape = (1, ch, ifmdim, ifmdim)
+
+    model = make_model(input_tensor_shape)
+    model.save(export_onnx_path)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    x1 = np.random.randn(*input_tensor_shape).astype(np.float32)
+    x2 = np.random.randn(*input_tensor_shape).astype(np.float32)
+
+    # generate expected value from streamlined net
+    input_dict = {model.graph.input[0].name: x1, model.graph.input[1].name: x2}
+
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    produced_sum = output_dict[model.graph.output[0].name]
+    expected_sum = 3.0 * ((x1 + x2) + 15.0)
+    assert np.isclose(expected_sum, produced_sum, atol=1e-3).all()
+    assert len(model.get_nodes_by_op_type("Add")) == 3
+    assert len(model.get_nodes_by_op_type("Mul")) == 2
+
+    model = model.transform(MoveLinearPastEltwiseAdd())
+
+    # verify again, to check we didn't break anything
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    produced_sum = output_dict[model.graph.output[0].name]
+    assert np.isclose(expected_sum, produced_sum, atol=1e-3).all()
+    assert len(model.get_nodes_by_op_type("Add")) == 2
+    assert len(model.get_nodes_by_op_type("Mul")) == 1
+
+    os.remove(export_onnx_path)
+
+
+# channels
+@pytest.mark.parametrize("ch", [64, 1])
+# ifmdim
+@pytest.mark.parametrize("ifmdim", [-1, 7])
+def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim):
+    # generate test vectors of correct shape
+    if ifmdim == -1:
+        input_shape = (1, ch)
+    else:
+        input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
+
+    num_of_params = 6
+    value_info = []
+    for i in range(num_of_params):
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node("Add", ["top_in", "p0"], ["fork1"]),
+                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
+                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
+                helper.make_node("Add", ["t2", "t3"], ["t4"]),
+                helper.make_node("Mul", ["t4", "p3"], ["fork2"]),
+                helper.make_node("Add", ["fork2", "p4"], ["t5"]),
+                helper.make_node("Add", ["fork2", "p5"], ["t6"]),
+                helper.make_node("Add", ["t5", "t6"], ["top_out"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
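+    # graph topology: top_in -> Add(p0) -> fork1 -> {Mul(p1), Mul(p2)} -> Add
+    #   -> Mul(p3) -> fork2 -> {Add(p4), Add(p5)} -> Add -> top_out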
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    for i in range(num_of_params):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    # the transform can only move Muls past the eltwise Add if both fork
+    # branches are scaled by the same factor, so copy p1 into p2:
+    model.set_initializer("p2", model.get_initializer("p1"))
+
+    # Transform
+    new_model = model.transform(MoveLinearPastEltwiseAdd())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    # Test
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "Add"
+    assert new_model.graph.node[1].op_type == "Add"
+    assert new_model.graph.node[2].op_type == "Mul"
+    assert new_model.graph.node[3].op_type == "Mul"
+    assert new_model.graph.node[4].op_type == "Add"
+    assert new_model.graph.node[5].op_type == "Add"
+    assert len(new_model.graph.node) == 6
diff --git a/tests/util/test_data_packing.py b/tests/util/test_data_packing.py
index 28f1d56d0dbc5451ccad3d36b4b1d4c6bed4f63e..7b77c4be20c1f41c11b53a9b65b79441c9bbbe47 100644
--- a/tests/util/test_data_packing.py
+++ b/tests/util/test_data_packing.py
@@ -47,6 +47,7 @@ from finn.util.data_packing import (
 @pytest.mark.parametrize("dtype", [DataType.BINARY, DataType.INT2, DataType.INT32])
 @pytest.mark.parametrize("test_shape", [(1, 2, 4), (1, 1, 64), (2, 64)])
+@pytest.mark.vivado
 def test_npy2apintstream(test_shape, dtype):
     ndarray = cutil.gen_finn_dt_tensor(dtype, test_shape)
     test_dir = cutil.make_build_dir(prefix="test_npy2apintstream_")
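A note on the LabelSelect checks above: test_fpgadataflow_labelselect validates the (1, k) index output y against the input x via finn.util.test.soft_verify_topk rather than an exact index comparison, so that ties between equal input values are not flagged as failures. A minimal sketch of such a "soft" top-k check (the helper name soft_topk_matches below is illustrative; FINN's actual soft_verify_topk may differ in detail):

    import numpy as np

    def soft_topk_matches(invec, idxvec, k):
        # reference: the k largest input values, sorted ascending
        ref_vals = np.sort(np.flip(np.sort(invec.flatten()))[:k])
        # values selected by the produced indices, sorted ascending
        got_vals = np.sort(invec.flatten()[idxvec.flatten().astype(np.int64)])
        # accept any tie-breaking order that selects the same values
        return bool((ref_vals == got_vals).all())

Comparing selected values instead of raw indices means an implementation that picks index 3 instead of index 7 when both positions hold the same value still passes, which is the behavior the hardware LabelSelect_Batch is allowed to have.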