diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 41f6a88f5dd4c9b0822a74cf4a0e7b4663dce910..0d610ec66a5f433d156f4e8da976767ce6458aef 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -73,6 +73,8 @@ RUN mkdir -p $VIVADO_IP_CACHE
 WORKDIR /workspace/finn
 
 COPY finn_entrypoint.sh /usr/local/bin/
+COPY quicktest.sh /usr/local/bin/
 RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
+RUN chmod 755 /usr/local/bin/quicktest.sh
 ENTRYPOINT ["finn_entrypoint.sh"]
 CMD ["bash"]
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index b7cfc299a2999662672225aa5f8912653d189559..1c2cb19d14137b866b55417522fdebb8e0d7ad90 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -95,7 +95,9 @@ RUN echo "source \$VIVADO_PATH/settings64.sh" >> /home/$UNAME/.bashrc
 # copy entrypoint script
 USER root
 COPY docker/finn_entrypoint.sh /usr/local/bin/
+COPY docker/quicktest.sh /usr/local/bin/
 RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
+RUN chmod 755 /usr/local/bin/quicktest.sh
 USER $UNAME
 
 ENTRYPOINT ["finn_entrypoint.sh"]
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4f6a2d3e230de9fcbb947d794722294880a7730d
--- /dev/null
+++ b/docker/quicktest.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
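+# run a quick subset of the FINN test suite, skipping tests marked "vivado"
+# or "slow" (the markers are defined in setup.cfg)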
+cd "$FINN_ROOT"
+python setup.py test --addopts "-m 'not (vivado or slow)'"
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index e2b56b7f75a489f4e4f08bccdc3b889822f81838..95594bb67a2be3a4c3fbba488c75a704f623c136 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -49,11 +49,30 @@ The run-docker.sh script forwards ports 8888 for Jupyter and 8081 for Netron, an
 
 Running the test suite directly
 *******************************
+FINN comes with a set of tests to check for regressions. The full test suite
+(which takes several hours to run and requires a PYNQ board) can be executed
+as follows:
+
 ::
 
   sh run-docker.sh test
 
-FINN comes with a set of tests which can be launched using the command above. Note that some of the tests involve extra compilation and the entire test suite may take some time to complete.
+There is a quicker variant of the test suite that skips tests marked as
+requiring Vivado or as slow-running:
+
+::
+
+  sh run-docker.sh quicktest
+
+If you want to run individual tests, you can do this *inside the Docker container
+from the FINN root directory* as follows:
+
+::
+
+  python setup.py test --addopts "-k test_end2end_tfc_w1a2"
+
+Please see the pytest documentation for more information on selecting tests
+by mark or by name.
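+
+For example, to run only the tests that carry the ``vivado`` mark (assuming a
+working Vivado installation is available inside the container):
+
+::
+
+  python setup.py test --addopts "-m vivado"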
 
 Environment variables
 **********************
diff --git a/run-docker.sh b/run-docker.sh
index b7f844d314c5fb67e11e0933f42b3edfa4d96036..e1f17e728204217ff3caa6e486b2daae16d6d271 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -27,13 +27,27 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m' # No Color
+
+# green echo
+gecho () {
+  echo -e "${GREEN}$1${NC}"
+}
+
+# red echo
+recho () {
+  echo -e "${RED}$1${NC}"
+}
+
 if [ -z "$VIVADO_PATH" ];then
-        echo "For correct implementation please set an environment variable VIVADO_PATH that contains the path to your vivado installation directory"
-        exit 1
+        recho "Please set the VIVADO_PATH that contains the path to your Vivado installation directory."
+        recho "FINN functionality depending on Vivado or Vivado HLS will not be available."
 fi
 
 if [ -z "$PYNQ_IP" ];then
-        echo "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
+        recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
 fi
 
 DOCKER_GID=$(id -g)
@@ -74,22 +88,25 @@ VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
 mkdir -p $BUILD_LOCAL
 mkdir -p $VIVADO_IP_CACHE
 
-echo "Instance is named as $DOCKER_INST_NAME"
-echo "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
-echo "Mounting $VIVADO_PATH into $VIVADO_PATH"
-echo "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
-echo "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
-echo "Vivado IP cache dir is at $VIVADO_IP_CACHE"
-echo "Using default PYNQ board $PYNQ_BOARD"
+gecho "Instance is named as $DOCKER_INST_NAME"
+gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
+gecho "Mounting $VIVADO_PATH into $VIVADO_PATH"
+gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
+gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
+gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
+gecho "Using default PYNQ board $PYNQ_BOARD"
 
 if [ "$1" = "test" ]; then
-        echo "Running test suite"
+        gecho "Running test suite (all tests)"
         DOCKER_CMD="python setup.py test"
+elif [ "$1" = "quicktest" ]; then
+        gecho "Running test suite (non-Vivado, non-slow tests)"
+        DOCKER_CMD="quicktest.sh"
 elif [ "$1" = "notebook" ]; then
-        echo "Running Jupyter notebook server"
+        gecho "Running Jupyter notebook server"
         DOCKER_CMD="jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks"
 else
-        echo "Running container only"
+        gecho "Running container only"
         DOCKER_CMD="bash"
 fi
 
diff --git a/setup.cfg b/setup.cfg
index 5974cda20e37449a879f7528516895fb7cea4264..1d7dcf247636b486e35d6320669eae706c2b7a72 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -101,6 +101,9 @@ extras = True
 # in order to write a coverage file that can be read by Jenkins.
 addopts =
     --verbose
+markers =
+    slow: marks tests as slow (deselect with '-m "not slow"')
+    vivado: mark tests that require Vivado or Vivado HLS
 norecursedirs =
     dist
     build
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e6c63dc510aab5f6baff9cb6326a2d0476f67a9
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+
+import numpy as np
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class GlobalAccPool_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib AccPool_Batch function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "NumChannels": ("i", True, 0),
+            "PE": ("i", True, 0),
+            # FINN DataTypes for input
+            "inputDataType": ("s", True, ""),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        ch = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [ch])
+        return ishape
+
+    def get_folded_input_shape(self):
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        assert ch % pe == 0, "PE must divide NumChannels"
+        folds = int(ch / pe)
+        folded_ishape = tuple(vecs + [folds, pe])
+        return folded_ishape
+
+    def get_normal_output_shape(self):
+        ch = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        oshape = tuple([vecs[0]] + [ch])
+        return oshape
+
+    def get_folded_output_shape(self):
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        assert ch % pe == 0, "PE must divide NumChannels"
+        folds = int(ch / pe)
+        oshape = tuple([vecs[0]] + [folds, pe])
+        return oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten(),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required GlobalAccPool_Batch attributes do not exist."""
+            )
+
+        # verify that input data is 2D, i.e. numInputVectors is [N, H, W]
+        if len(self.get_nodeattr("numInputVectors")) != 3:
+            info_messages.append("""GlobalAccPool_Batch requires 2D data input.""")
+            raise Exception("GlobalAccPool_Batch requires 2D data input.")
+
+        return info_messages
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        # determine data type from image size and input type
+        idt = DataType[self.get_nodeattr("inputDataType")]
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        npixels = vecs[-1] * vecs[-2]
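+        # example: for a (1, 4, 4, C) input of INT2 values, npixels = 16 and
+        # the accumulator must cover 16 * (-2) = -32, so INT6 is the smallest
+        # datatype that fits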
+        if idt.signed():
+            extreme_value = npixels * idt.min()
+        else:
+            extreme_value = npixels * idt.max()
+        return DataType.get_smallest_possible(extreme_value)
+
+    def get_instream_width(self):
+        """Returns input stream width."""
+        ibits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = pe * ibits
+        return in_width
+
+    def get_outstream_width(self):
+        """Returns output stream width."""
+        obits = self.get_output_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        out_width = pe * obits
+        return out_width
+
+    def get_number_output_values(self):
+        return np.prod(self.get_folded_output_shape()[1:-1])
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert inp.shape == exp_ishape, """Input shape doesn't match expected shape."""
+        export_idt = self.get_input_datatype()
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim \
+            did not produce expected ofolded utput shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
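+        # finn-hlslib AccPool_Batch template arguments, in order: image
+        # dimension, number of channels, input HLS type, PE count, output
+        # (accumulator) HLS type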
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """AccPool_Batch<{}, {}, {}, {}, {}> (in0, out, 1);""".format(
+                self.get_normal_input_shape()[1],
+                self.get_nodeattr("NumChannels"),
+                self.get_input_datatype().get_hls_datatype_str(),
+                self.get_nodeattr("PE"),
+                self.get_output_datatype().get_hls_datatype_str(),
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out)""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
diff --git a/src/finn/custom_op/multithreshold.py b/src/finn/custom_op/multithreshold.py
index 37f8e0950b5fc352c8f9fe005884724f028879a0..bc0a454cdf847d124b12c940b029f51bf2d3e778 100644
--- a/src/finn/custom_op/multithreshold.py
+++ b/src/finn/custom_op/multithreshold.py
@@ -33,16 +33,6 @@ from finn.core.datatype import DataType
 from finn.custom_op import CustomOp
 
 
-def compare(x, y):
-    """Comparison helper function for multithresholding.
-
-    Gets two values and returns 1.0 if x>=y otherwise 0.0."""
-    if x >= y:
-        return 1.0
-    else:
-        return 0.0
-
-
 def multithreshold(v, thresholds, out_scale=None, out_bias=None):
     """Given a set of threshold values t={t_0, t_1 ... t_n} the successive
     thresholding maps any real number x to an integer in the interval [0, n],
@@ -76,8 +66,6 @@ def multithreshold(v, thresholds, out_scale=None, out_bias=None):
     num_act = thresholds.shape[1]
     # reshape inputs to enable channel-wise reading
     vr = v.reshape((v.shape[0], v.shape[1], -1))
-    # save the new shape size of the images
-    num_img_elem = vr.shape[2]
     # initiate output tensor
     ret = np.zeros_like(vr)
     # iterate over thresholds channel-wise
@@ -85,12 +73,10 @@ def multithreshold(v, thresholds, out_scale=None, out_bias=None):
         channel_thresh = thresholds[0] if is_global_threshold else thresholds[t]
         # iterate over batches
         for b in range(num_batch):
-            # iterate over image elements on which the thresholds will be applied
-            for elem in range(num_img_elem):
-                # iterate over the different thresholds for one channel
-                for a in range(num_act):
-                    # apply successive thresholding to every element
-                    ret[b][t][elem] += compare(vr[b][t][elem], channel_thresh[a])
+            # iterate over the different thresholds for one channel
+            for a in range(num_act):
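+                # vectorized over all image elements: comparing the whole
+                # channel slice against one threshold yields a 0/1 mask that
+                # is accumulated per threshold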
+                ret[b][t] += (vr[b][t] >= channel_thresh[a]).astype(int)
+
     if out_scale is None:
         out_scale = 1.0
     if out_bias is None:
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 0e532ae305e0bcc14ddf3b0242d95ff4bcec7dc7..514b0bf62285faaf32c7baa063bbab3622e280f0 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -43,6 +43,7 @@ from finn.custom_op.maxpoolnhwc import MaxPoolNHWC
 from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
+from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 
 # create a mapping of all known CustomOp names and classes
@@ -59,6 +60,7 @@ custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
 custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
+custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 
 
diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py
index a185f5392c4b5ec848cd463e02ebab4be9c56a46..a4c751b844a2796447930eb74adad6321454ac09 100644
--- a/src/finn/transformation/fpgadataflow/make_deployment.py
+++ b/src/finn/transformation/fpgadataflow/make_deployment.py
@@ -68,10 +68,10 @@ class DeployToPYNQ(Transformation):
         for file in os.listdir(vivado_pynq_proj):
             if file.endswith(".bit"):
                 bitfile = os.path.join(vivado_pynq_proj, file)
+                copy(bitfile, deployment_dir)
             elif file.endswith(".hwh"):
                 hwhfile = os.path.join(vivado_pynq_proj, file)
-        copy(bitfile, deployment_dir)
-        copy(hwhfile, deployment_dir)
+                copy(hwhfile, deployment_dir)
 
         # driver.py and python libraries
         pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir")
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index f1b98df7051711f2cd25479c8f7ebea1b7b94031..b0985bddd3306f81ad0e81f0fb582a0ae7fdaf3d 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -28,6 +28,9 @@
 
 from brevitas_examples import bnn_pynq
 import numpy as np
+import pytest
+import warnings
+from finn.core.modelwrapper import ModelWrapper
 
 # map of (wbits,abits) -> model
 example_map = {
@@ -71,3 +74,13 @@ def soft_verify_topk(invec, idxvec, k):
     soft_expected = invec.flatten()[np_topk.astype(np.int).flatten()]
     soft_produced = invec.flatten()[idxvec.astype(np.int).flatten()]
     return (soft_expected == soft_produced).all()
+
+
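+# Typical usage in a chained end2end test, where each step loads the previous
+# step's checkpoint, e.g.:
+#   model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_tidy.onnx")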
+def load_test_checkpoint_or_skip(filename):
+    "Try to load given .onnx and return ModelWrapper, else skip current test."
+    try:
+        model = ModelWrapper(filename)
+        return model
+    except FileNotFoundError:
+        warnings.warn(filename + " not found from previous test step, skipping")
+        pytest.skip(filename + " not found from previous test step, skipping")
diff --git a/tests/custom_op/test_multi_thresholding.py b/tests/custom_op/test_multithreshold.py
similarity index 61%
rename from tests/custom_op/test_multi_thresholding.py
rename to tests/custom_op/test_multithreshold.py
index 4f2b08675fdabb1bda49972c51892da92e1a0cdc..7e6ad4fe08517290dd22a2c74b2847d007b74b1f 100644
--- a/tests/custom_op/test_multi_thresholding.py
+++ b/tests/custom_op/test_multithreshold.py
@@ -27,11 +27,76 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
-
+import time
 from finn.custom_op.multithreshold import multithreshold
 
 
-def test_execute_multi_thresholding():
+def compare(x, y):
+    """Comparison helper function for multithresholding.
+
+    Gets two values and returns 1.0 if x>=y otherwise 0.0."""
+    if x >= y:
+        return 1.0
+    else:
+        return 0.0
+
+# naive implementation of thresholding for performance comparison
+def multithreshold_elementwise(v, thresholds, out_scale=None, out_bias=None):
+    """Given a set of threshold values t={t_0, t_1 ... t_n} the successive
+    thresholding maps any real number x to an integer in the interval [0, n],
+    where the returned integer is the number of thresholds x is greater than
+    or equal to.
+
+    The output tensor will be scaled by out_scale and biased by out_bias."""
+    # the inputs are expected to be in the shape (N,C,H,W) or (N, C)
+    # the MultiThreshold node supports a data_layout attribute that can be set
+    # to 'NHWC' to support (N,H,W,C) data layout mode for in-out as well
+    # N : Batch size
+    # C : Number of channels
+    # H : Height of the input images
+    # W : Width of the input images
+    #
+    # the thresholds are expected to be in the shape (C, B)
+    # C : Number of channels (must be the same value as C in input tensor
+    #     or 1 if all channels use the same threshold value)
+    # B : Desired activation steps => i.e. for 4-bit activation,
+    #     B=7 (2^(n)-1 and n=4)
+    # the output tensor will be scaled by out_scale and biased by out_bias
+    # assert threshold shape
+    is_global_threshold = thresholds.shape[0] == 1
+    assert (
+        v.shape[1] == thresholds.shape[0]
+    ) or is_global_threshold, """Threshold
+    shape incorrect"""
+    # save the required shape sizes for the loops (N, C and B)
+    num_batch = v.shape[0]
+    num_channel = v.shape[1]
+    num_act = thresholds.shape[1]
+    # reshape inputs to enable channel-wise reading
+    vr = v.reshape((v.shape[0], v.shape[1], -1))
+    # save the new shape size of the images
+    num_img_elem = vr.shape[2]
+    # initiate output tensor
+    ret = np.zeros_like(vr)
+    # iterate over thresholds channel-wise
+    for t in range(num_channel):
+        channel_thresh = thresholds[0] if is_global_threshold else thresholds[t]
+        # iterate over batches
+        for b in range(num_batch):
+            # iterate over image elements on which the thresholds will be applied
+            for elem in range(num_img_elem):
+                # iterate over the different thresholds for one channel
+                for a in range(num_act):
+                    # apply successive thresholding to every element
+                    ret[b][t][elem] += compare(vr[b][t][elem], channel_thresh[a])
+    if out_scale is None:
+        out_scale = 1.0
+    if out_bias is None:
+        out_bias = 0.0
+    return out_scale * ret.reshape(v.shape) + out_bias
+
+
+def test_multithreshold():
 
     inputs = np.ndarray(
         shape=(6, 3, 2, 2),
@@ -223,9 +288,35 @@ def test_execute_multi_thresholding():
     )
 
     results = multithreshold(inputs, thresholds)
-
     assert (results == outputs).all()
 
     results_scaled = multithreshold(inputs, thresholds, 2.0, -1.0)
     outputs_scaled = 2.0 * outputs - 1.0
     assert (results_scaled == outputs_scaled).all()
+
+    # performance and random test
+    np.random.seed(0)
+    inputs = np.random.random((1, 256, 64, 64))
+    thresholds = (np.array([[1, 2, 3, 4, 5, 6]]) - 0.5) / 6
+
+    before = time.time()
+    vec_results = multithreshold(inputs, thresholds)
+    after = time.time()
+    vector_runtime = after - before
+
+    before = time.time()
+    nonvec_results = multithreshold_elementwise(inputs, thresholds)
+    after = time.time()
+    non_vector_runtime = after - before
+
+    assert (vec_results == nonvec_results).all()
+
+    return vector_runtime, non_vector_runtime
+
+
+if __name__ == "__main__":
+    vector_runtime, non_vector_runtime = test_multithreshold()
+
+    print("Runtime non-vectorized: ", non_vector_runtime, "s")
+    print("Runtime vectorized: ", vector_runtime, "s")
+    print("Speed-up: ", non_vector_runtime / vector_runtime)
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index 7dd45cbc732a50f8f41c1932601308f0dfd77c20..e6d1fc4efd61c01654ee88638698215d23a82eb3 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -36,7 +36,6 @@ import onnx  # NOQA
 
 import pytest
 import pkg_resources as pk
-from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
 from finn.core.onnx_exec import execute_onnx
 from finn.transformation.double_to_single_float import DoubleToSingleFloat
@@ -69,7 +68,7 @@ from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.util.basic import pynq_part_map
-from finn.util.test import get_test_model_trained
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
@@ -91,7 +90,7 @@ def test_end2end_cnv_w1a1_export():
 
 
 def test_end2end_cnv_w1a1_import_and_tidy():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_export.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_export.onnx")
     model = model.transform(DoubleToSingleFloat())
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
@@ -101,7 +100,7 @@ def test_end2end_cnv_w1a1_import_and_tidy():
 
 
 def test_end2end_cnv_w1a1_streamline():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_tidy.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_tidy.onnx")
     model = model.transform(Streamline())
     model = model.transform(LowerConvsToMatMul())
     model = model.transform(MakeMaxPoolNHWC())
@@ -112,7 +111,9 @@ def test_end2end_cnv_w1a1_streamline():
 
 
 def test_end2end_cnv_w1a1_convert_to_hls_layers():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_streamlined.onnx"
+    )
     model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
     model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
     model = model.transform(to_hls.InferConvInpGen())
@@ -122,18 +123,22 @@ def test_end2end_cnv_w1a1_convert_to_hls_layers():
 
 
 def test_end2end_cnv_w1a1_create_dataflow_partition():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_hls_layers.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_hls_layers.onnx"
+    )
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx")
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     dataflow_model_filename = sdp_node.get_nodeattr("model")
-    dataflow_model = ModelWrapper(dataflow_model_filename)
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
     dataflow_model.save(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")
 
 
 def test_end2end_cnv_w1a1_fold_and_tlastmarker():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx"
+    )
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # each tuple is (PE, SIMD, in_fifo_depth) for a layer
     folding = [
@@ -167,23 +172,27 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker():
     model.save(build_dir + "/end2end_cnv_w1a1_folded.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_cnv_w1a1_gen_hls_ip():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_folded.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_folded.onnx")
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(AnnotateResources("hls"))
     model.save(build_dir + "/end2end_cnv_w1a1_ipgen.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_cnv_w1a1_ip_stitch():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipgen.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipgen.onnx")
     model = model.transform(ReplaceVerilogRelPaths())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
     model.save(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_cnv_w1a1_verify_dataflow_part():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
     x = np.zeros((1, 32, 32, 3), dtype=np.float32)
     inp_name = model.graph.input[0].name
     out_name = model.graph.output[0].name
@@ -212,9 +221,12 @@ def test_end2end_cnv_w1a1_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+@pytest.mark.vivado
 def test_end2end_cnv_w1a1_verify_all():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     # load one of the test vectors
@@ -228,22 +240,31 @@ def test_end2end_cnv_w1a1_verify_all():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     # produce results with cppsim
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
+    load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipgen_cppsim.onnx")
     sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w1a1_ipgen_cppsim.onnx")
     ret_cppsim = execute_onnx(parent_model, {iname: x}, True)
     y_cppsim = ret_cppsim[oname]
     # produce results with node-by-node rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_ipgen_nodebynode_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_cnv_w1a1_ipgen_nodebynode_rtlsim.onnx"
     )
     ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
     y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
     # produce results with whole-network (stitched ip) rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx"
     )
@@ -257,27 +278,34 @@ def test_end2end_cnv_w1a1_verify_all():
     assert np.argmax(y_golden) == 3
 
 
+@pytest.mark.vivado
 def test_end2end_cnv_w1a1_make_pynq_proj():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_ipstitch.onnx")
     model = model.transform(MakePYNQProject(test_pynq_board))
     model.save(build_dir + "/end2end_cnv_w1a1_pynq_project.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_cnv_w1a1_synth_pynq_project():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_pynq_project.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_pynq_project.onnx"
+    )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
     model.save(build_dir + "/end2end_cnv_w1a1_synth.onnx")
 
 
 def test_end2end_cnv_w1a1_make_driver():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_synth.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_synth.onnx")
     model = model.transform(MakePYNQDriver())
     model.save(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx")
 
 
 def test_end2end_cnv_w1a1_deploy_on_pynq():
-    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_pynq_driver.onnx"
+    )
     try:
         ip = os.environ["PYNQ_IP"]  # no fault for this one; skip if not defined
         if ip == "":
@@ -295,7 +323,9 @@ def test_end2end_cnv_w1a1_deploy_on_pynq():
 
 def test_end2end_cnv_w1a1_run_on_pynq():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_cnv_w1a1_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     # load one of the test vectors
@@ -309,7 +339,9 @@ def test_end2end_cnv_w1a1_run_on_pynq():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w1a1_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     try:
@@ -319,6 +351,7 @@ def test_end2end_cnv_w1a1_run_on_pynq():
         # produce results with cppsim
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
         sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx")
         sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx")
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
index 74cd46549f45b7512a03da450e011c4f2e80e16e..1ba149687bb80a0f977115bd380a09f70eef23f1 100644
--- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
+++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
@@ -40,7 +40,6 @@ import onnx.numpy_helper as nph
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
-from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
 from finn.core.throughput_test import throughput_test
 from finn.custom_op.registry import getCustomOp
@@ -71,7 +70,7 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.util.basic import pynq_part_map
-from finn.util.test import get_test_model_trained
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 
@@ -92,7 +91,7 @@ def test_end2end_tfc_w1a1_export():
 
 
 def test_end2end_tfc_w1a1_import_and_tidy():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_export.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_export.onnx")
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
@@ -102,13 +101,15 @@ def test_end2end_tfc_w1a1_import_and_tidy():
 
 
 def test_end2end_tfc_w1a1_streamline():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_tidy.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_tidy.onnx")
     model = model.transform(Streamline())
     model.save(build_dir + "/end2end_tfc_w1a1_streamlined.onnx")
 
 
 def test_end2end_tfc_w1a1_convert_to_hls_layers():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_streamlined.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_streamlined.onnx"
+    )
     model = model.transform(ConvertBipolarMatMulToXnorPopcount())
     model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
     model = model.transform(absorb.AbsorbMulIntoMultiThreshold())
@@ -118,18 +119,22 @@ def test_end2end_tfc_w1a1_convert_to_hls_layers():
 
 
 def test_end2end_tfc_w1a1_create_dataflow_partition():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_hls_layers.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_hls_layers.onnx"
+    )
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx")
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     dataflow_model_filename = sdp_node.get_nodeattr("model")
-    dataflow_model = ModelWrapper(dataflow_model_filename)
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
     dataflow_model.save(build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx")
 
 
 def test_end2end_tfc_w1a1_fold_and_tlastmarker():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_dataflow_model.onnx"
+    )
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
     config = [
@@ -153,23 +158,27 @@ def test_end2end_tfc_w1a1_fold_and_tlastmarker():
     model.save(build_dir + "/end2end_tfc_w1a1_folded.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_tfc_w1a1_gen_hls_ip():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_folded.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_folded.onnx")
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(AnnotateResources("hls"))
     model.save(build_dir + "/end2end_tfc_w1a1_ipgen.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a1_ip_stitch():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_ipgen.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipgen.onnx")
     model = model.transform(ReplaceVerilogRelPaths())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
     model.save(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a1_verify_dataflow_part():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx")
     x = np.zeros((1, 784), dtype=np.float32)
     inp_name = model.graph.input[0].name
     out_name = model.graph.output[0].name
@@ -196,9 +205,12 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a1_verify_all():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_tfc_w1a1_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
@@ -209,22 +221,31 @@ def test_end2end_tfc_w1a1_verify_all():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     # produce results with cppsim
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
+    load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipstitch_cppsim.onnx")
     sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a1_ipstitch_cppsim.onnx")
     ret_cppsim = execute_onnx(parent_model, {iname: x}, True)
     y_cppsim = ret_cppsim[oname]
     # produce results with node-by-node rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_ipstitch_nodebynode_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_tfc_w1a1_ipstitch_nodebynode_rtlsim.onnx"
     )
     ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
     y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
     # produce results with whole-network (stitched ip) rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx"
     )
@@ -235,27 +256,34 @@ def test_end2end_tfc_w1a1_verify_all():
     assert np.isclose(y_golden, y_whole_rtlsim).all()
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a1_make_pynq_proj():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_ipstitch.onnx")
     model = model.transform(MakePYNQProject(test_pynq_board))
     model.save(build_dir + "/end2end_tfc_w1a1_pynq_project.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_tfc_w1a1_synth_pynq_project():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_pynq_project.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_pynq_project.onnx"
+    )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
     model.save(build_dir + "/end2end_tfc_w1a1_synth.onnx")
 
 
 def test_end2end_tfc_w1a1_make_driver():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_synth.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_synth.onnx")
     model = model.transform(MakePYNQDriver())
     model.save(build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx")
 
 
 def test_end2end_tfc_w1a1_deploy_on_pynq():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_pynq_driver.onnx"
+    )
     try:
         ip = os.environ["PYNQ_IP"]  # no fault for this one; skip if not defined
         if ip == "":
@@ -273,7 +301,9 @@ def test_end2end_tfc_w1a1_deploy_on_pynq():
 
 def test_end2end_tfc_w1a1_run_on_pynq():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_tfc_w1a1_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
@@ -285,7 +315,9 @@ def test_end2end_tfc_w1a1_run_on_pynq():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a1_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     try:
@@ -295,11 +327,12 @@ def test_end2end_tfc_w1a1_run_on_pynq():
         # produce results with cppsim
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
         sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a1_pynq_deploy.onnx")
         sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a1_pynq_deploy.onnx")
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
         assert np.isclose(y, y_golden).all()
-        child_model = ModelWrapper(sdp_node.get_nodeattr("model"))
+        child_model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
         res = throughput_test(child_model)
         assert res is not None
 
diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py
index 5ee2942845c41f4c6705b4ee3ecee89154d9faa9..d4c005a86580fb36e735beb00717fcfdffff21e5 100644
--- a/tests/end2end/test_end2end_tfc_w1a2.py
+++ b/tests/end2end/test_end2end_tfc_w1a2.py
@@ -39,7 +39,6 @@ import onnx  # NOQA
 import onnx.numpy_helper as nph
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.fold_constants import FoldConstants
@@ -67,7 +66,7 @@ from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.util.basic import pynq_part_map
-from finn.util.test import get_test_model_trained
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 
@@ -88,7 +87,7 @@ def test_end2end_tfc_w1a2_export():
 
 
 def test_end2end_tfc_w1a2_import_and_tidy():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_export.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_export.onnx")
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
@@ -98,30 +97,36 @@ def test_end2end_tfc_w1a2_import_and_tidy():
 
 
 def test_end2end_tfc_w1a2_streamline():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_tidy.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_tidy.onnx")
     model = model.transform(Streamline())
     model.save(build_dir + "/end2end_tfc_w1a2_streamlined.onnx")
 
 
 def test_end2end_tfc_w1a2_convert_to_hls_layers():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_streamlined.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_streamlined.onnx"
+    )
     model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
     model.save(build_dir + "/end2end_tfc_w1a2_hls_layers.onnx")
 
 
 def test_end2end_tfc_w1a2_create_dataflow_partition():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_hls_layers.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_hls_layers.onnx"
+    )
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx")
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     dataflow_model_filename = sdp_node.get_nodeattr("model")
-    dataflow_model = ModelWrapper(dataflow_model_filename)
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
     dataflow_model.save(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx")
 
 
 def test_end2end_tfc_w1a2_fold_and_tlastmarker():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx"
+    )
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
     config = [
@@ -145,23 +150,27 @@ def test_end2end_tfc_w1a2_fold_and_tlastmarker():
     model.save(build_dir + "/end2end_tfc_w1a2_folded.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_tfc_w1a2_gen_hls_ip():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_folded.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_folded.onnx")
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(AnnotateResources("hls"))
     model.save(build_dir + "/end2end_tfc_w1a2_ipgen.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a2_ip_stitch():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_ipgen.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipgen.onnx")
     model = model.transform(ReplaceVerilogRelPaths())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
     model.save(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a2_verify_dataflow_part():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx")
     x = np.zeros((1, 784), dtype=np.float32)
     inp_name = model.graph.input[0].name
     out_name = model.graph.output[0].name
@@ -188,9 +197,12 @@ def test_end2end_tfc_w1a2_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a2_verify_all():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_tfc_w1a2_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
@@ -201,22 +213,31 @@ def test_end2end_tfc_w1a2_verify_all():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     # produce results with cppsim
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
+    load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipstitch_cppsim.onnx")
     sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a2_ipstitch_cppsim.onnx")
     ret_cppsim = execute_onnx(parent_model, {iname: x}, True)
     y_cppsim = ret_cppsim[oname]
     # produce results with node-by-node rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_ipstitch_nodebynode_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_tfc_w1a2_ipstitch_nodebynode_rtlsim.onnx"
     )
     ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
     y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
     # produce results with whole-network (stitched ip) rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_ipstitch_whole_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_tfc_w1a2_ipstitch_whole_rtlsim.onnx"
     )
@@ -227,27 +248,34 @@ def test_end2end_tfc_w1a2_verify_all():
     assert np.isclose(y_golden, y_whole_rtlsim).all()
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w1a2_make_pynq_proj():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx")
     model = model.transform(MakePYNQProject(test_pynq_board))
     model.save(build_dir + "/end2end_tfc_w1a2_pynq_project.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_tfc_w1a2_synth_pynq_project():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_pynq_project.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_pynq_project.onnx"
+    )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
     model.save(build_dir + "/end2end_tfc_w1a2_synth.onnx")
 
 
 def test_end2end_tfc_w1a2_make_driver():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_synth.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_synth.onnx")
     model = model.transform(MakePYNQDriver())
     model.save(build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx")
 
 
 def test_end2end_tfc_w1a2_deploy_on_pynq():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_pynq_driver.onnx"
+    )
     try:
         ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
         if ip == "":
@@ -265,7 +293,9 @@ def test_end2end_tfc_w1a2_deploy_on_pynq():
 
 def test_end2end_tfc_w1a2_run_on_pynq():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_tfc_w1a2_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
@@ -277,7 +307,9 @@ def test_end2end_tfc_w1a2_run_on_pynq():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     try:
@@ -287,6 +319,7 @@ def test_end2end_tfc_w1a2_run_on_pynq():
         # produce results with cppsim
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
         sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx")
         sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx")
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py
index 2477318efd1e02b0865dadb40bad1a74ac8ea0b4..19d3f86e046658c4080d71984df1cff74008adab 100644
--- a/tests/end2end/test_end2end_tfc_w2a2.py
+++ b/tests/end2end/test_end2end_tfc_w2a2.py
@@ -39,7 +39,6 @@ import onnx  # NOQA
 import onnx.numpy_helper as nph
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.fold_constants import FoldConstants
@@ -67,7 +66,7 @@ from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.util.basic import pynq_part_map
-from finn.util.test import get_test_model_trained
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 
@@ -88,7 +87,7 @@ def test_end2end_tfc_w2a2_export():
 
 
 def test_end2end_tfc_w2a2_import_and_tidy():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_export.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_export.onnx")
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
@@ -98,30 +97,36 @@ def test_end2end_tfc_w2a2_import_and_tidy():
 
 
 def test_end2end_tfc_w2a2_streamline():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_tidy.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_tidy.onnx")
     model = model.transform(Streamline())
     model.save(build_dir + "/end2end_tfc_w2a2_streamlined.onnx")
 
 
 def test_end2end_tfc_w2a2_convert_to_hls_layers():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_streamlined.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_streamlined.onnx"
+    )
     model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
     model.save(build_dir + "/end2end_tfc_w2a2_hls_layers.onnx")
 
 
 def test_end2end_tfc_w2a2_create_dataflow_partition():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_hls_layers.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_hls_layers.onnx"
+    )
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx")
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     dataflow_model_filename = sdp_node.get_nodeattr("model")
-    dataflow_model = ModelWrapper(dataflow_model_filename)
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
     dataflow_model.save(build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx")
 
 
 def test_end2end_tfc_w2a2_fold_and_tlastmarker():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_dataflow_model.onnx"
+    )
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
     config = [
@@ -145,23 +150,27 @@ def test_end2end_tfc_w2a2_fold_and_tlastmarker():
     model.save(build_dir + "/end2end_tfc_w2a2_folded.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_tfc_w2a2_gen_hls_ip():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_folded.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_folded.onnx")
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(AnnotateResources("hls"))
     model.save(build_dir + "/end2end_tfc_w2a2_ipgen.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w2a2_ip_stitch():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_ipgen.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipgen.onnx")
     model = model.transform(ReplaceVerilogRelPaths())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
     model.save(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx")
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w2a2_verify_dataflow_part():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx")
     x = np.zeros((1, 784), dtype=np.float32)
     inp_name = model.graph.input[0].name
     out_name = model.graph.output[0].name
@@ -188,9 +197,12 @@ def test_end2end_tfc_w2a2_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w2a2_verify_all():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_tfc_w2a2_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
@@ -201,22 +213,31 @@ def test_end2end_tfc_w2a2_verify_all():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     # produce results with cppsim
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
+    load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipstitch_cppsim.onnx")
     sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w2a2_ipstitch_cppsim.onnx")
     ret_cppsim = execute_onnx(parent_model, {iname: x}, True)
     y_cppsim = ret_cppsim[oname]
     # produce results with node-by-node rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_ipstitch_nodebynode_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_tfc_w2a2_ipstitch_nodebynode_rtlsim.onnx"
     )
     ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
     y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
     # produce results with whole-network (stitched ip) rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_ipstitch_whole_rtlsim.onnx"
+    )
     sdp_node.set_nodeattr(
         "model", build_dir + "/end2end_tfc_w2a2_ipstitch_whole_rtlsim.onnx"
     )
@@ -227,27 +248,34 @@ def test_end2end_tfc_w2a2_verify_all():
     assert np.isclose(y_golden, y_whole_rtlsim).all()
 
 
+@pytest.mark.vivado
 def test_end2end_tfc_w2a2_make_pynq_proj():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_ipstitch.onnx")
     model = model.transform(MakePYNQProject(test_pynq_board))
     model.save(build_dir + "/end2end_tfc_w2a2_pynq_project.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_end2end_tfc_w2a2_synth_pynq_project():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_pynq_project.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_pynq_project.onnx"
+    )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
     model.save(build_dir + "/end2end_tfc_w2a2_synth.onnx")
 
 
 def test_end2end_tfc_w2a2_make_driver():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_synth.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_synth.onnx")
     model = model.transform(MakePYNQDriver())
     model.save(build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx")
 
 
 def test_end2end_tfc_w2a2_deploy_on_pynq():
-    model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_pynq_driver.onnx"
+    )
     try:
         ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
         if ip == "":
@@ -265,7 +293,9 @@ def test_end2end_tfc_w2a2_deploy_on_pynq():
 
 def test_end2end_tfc_w2a2_run_on_pynq():
     # use the streamlined model as the "golden" model for right answers
-    golden = ModelWrapper(build_dir + "/end2end_tfc_w2a2_streamlined.onnx")
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_streamlined.onnx"
+    )
     iname = golden.graph.input[0].name
     oname = golden.graph.output[0].name
     raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
@@ -277,7 +307,9 @@ def test_end2end_tfc_w2a2_run_on_pynq():
     y_golden = ret_golden[oname]
     # set up parent+child graph to test
     # we'll use models from the previous step as the child model
-    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx")
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_tfc_w2a2_dataflow_parent.onnx"
+    )
     iname = parent_model.graph.input[0].name
     oname = parent_model.graph.output[0].name
     try:
@@ -287,6 +319,7 @@ def test_end2end_tfc_w2a2_run_on_pynq():
         # produce results with cppsim
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
         sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(build_dir + "/end2end_tfc_w2a2_pynq_deploy.onnx")
         sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w2a2_pynq_deploy.onnx")
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index 1228a9c79608a1c7eb44900ddb7df54ed900a3c2..24933759830535dfcec768d47a6020b4f3e2de35 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -29,13 +29,14 @@
 import os
 
 from onnx import TensorProto, helper
-
+import pytest
 import finn.util.basic as util
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 
 
+@pytest.mark.vivado
 def test_code_gen_trafo():
     idt = wdt = odt = DataType.BIPOLAR
     mw = 8
diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py
index 35eed02f4e71a96f9f4e8957c372f93e6cd7927c..65894e02e490f6931e5b03a9aa67b8f22e32583a 100644
--- a/tests/fpgadataflow/test_compilation_trafo.py
+++ b/tests/fpgadataflow/test_compilation_trafo.py
@@ -30,6 +30,7 @@ import os
 
 from onnx import TensorProto, helper
 
+import pytest
 import finn.util.basic as util
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
@@ -37,6 +38,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 
 
+@pytest.mark.vivado
 def test_compilation_trafo():
     idt = wdt = odt = DataType.BIPOLAR
     mw = 8
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index 220f8a7966a146f954a7fcb3f32058e231b83e23..e03090f0581eebf68cac7baffb6888a6992df68d 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -31,7 +31,7 @@ import pkg_resources as pk
 
 import brevitas.onnx as bo
 import numpy as np
-
+import pytest
 import finn.core.onnx_exec as oxe
 import finn.transformation.streamline.absorb as absorb
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
@@ -53,6 +53,7 @@ from finn.custom_op.registry import getCustomOp
 export_onnx_path_cnv = "test_output_cnv.onnx"
 
 
+@pytest.mark.vivado
 def test_convert_to_hls_layers_cnv_w1a1():
     cnv = get_test_model_trained("CNV", 1, 1)
     bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv)
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
index b7dea03797bc5de5e7517d0d8b816c438027008b..e261a3114853bf24bdb4c931c46ff92eea4150dd 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
@@ -34,7 +34,7 @@ import numpy as np
 import onnx
 import onnx.numpy_helper as nph
 import torch
-
+import pytest
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
@@ -56,6 +56,7 @@ export_onnx_path = "test_output_tfc.onnx"
 export_onnx_path_cnv = "test_output_cnv.onnx"
 
 
+@pytest.mark.vivado
 def test_convert_to_hls_layers_tfc_w1a1():
     tfc = get_test_model_trained("TFC", 1, 1)
     bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path)
@@ -125,6 +126,7 @@ def test_convert_to_hls_layers_tfc_w1a1():
     assert np.isclose(produced, expected, atol=1e-3).all()
 
 
+@pytest.mark.vivado
 def test_convert_to_hls_layers_tfc_w1a2():
     tfc = get_test_model_trained("TFC", 1, 2)
     bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path)
diff --git a/tests/fpgadataflow/test_create_dataflow_partition.py b/tests/fpgadataflow/test_create_dataflow_partition.py
index 77e0ddeebf6080e1840d6014978a4c9b4a10b5c1..c4f748051ff038371353574298580f3bf9e05e9f 100644
--- a/tests/fpgadataflow/test_create_dataflow_partition.py
+++ b/tests/fpgadataflow/test_create_dataflow_partition.py
@@ -29,7 +29,6 @@
 import os.path
 from pkgutil import get_data
 
-import pytest
 
 from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
@@ -38,11 +37,11 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 )
 from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
 from finn.util.basic import make_build_dir
+from finn.util.test import load_test_checkpoint_or_skip
 
 build_dir = make_build_dir("test_dataflow_partition_")
 
 
-@pytest.mark.dependency()
 def test_dataflow_partition_create():
     # load the onnx model
     raw_m = get_data(
@@ -57,9 +56,10 @@ def test_dataflow_partition_create():
     model.save(build_dir + "/test_dataflow_partition_create.onnx")
 
 
-@pytest.mark.dependency(depends=["test_dataflow_partition_create"])
 def test_dataflow_partition_tlastmarker():
-    model = ModelWrapper(build_dir + "/test_dataflow_partition_create.onnx")
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/test_dataflow_partition_create.onnx"
+    )
     model_path = getCustomOp(model.graph.node[2]).get_nodeattr("model")
     model = ModelWrapper(model_path)
     model = model.transform(InsertTLastMarker())
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 02a9acae5e0e90d2a8dfa7d4d4afb03aa11f4239..5051bf34dc690daf8b6186859d3717cc8e217eee 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -137,6 +137,8 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # input channel parallelism ("SIMD")
 @pytest.mark.parametrize("simd", [1, 2])
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd):
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index a230fb4201673e3bf0a31cf9ec82f21250fd9e40..1d83f7a23cd3bad757e772055d242799cf22b0da 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -55,6 +55,8 @@ def prepare_inputs(input_tensor, dt):
 @pytest.mark.parametrize("OUTWidth", [2, 4])
 # finn_dtype
 @pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype):
 
     # generate input data
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 416d96d5dbfa1125d878eb8339ae38f5d572d1ce..fc5cdb7745945bee99564ba9ab19423a66d8e035 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -149,6 +149,8 @@ def prepare_inputs(input_tensor, idt, wdt):
 @pytest.mark.parametrize("mw", [16])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [16])
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     if nf == -1:
         nf = mh
@@ -234,6 +236,8 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 @pytest.mark.parametrize("mw", [16])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [16])
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     if nf == -1:
         nf = mh
@@ -324,6 +328,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 @pytest.mark.parametrize("mw", [128])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [128])
+@pytest.mark.vivado
 def test_fpgadataflow_fclayer_large_depth_decoupled_mode(
     mem_mode, idt, wdt, act, nf, sf, mw, mh
 ):
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index 9158a0b0e72017b2468627e4f30fd3432c418d38..94090a47ad64fc377530e6e21d35661e1d92b5a6 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -76,6 +76,8 @@ def prepare_inputs(input_tensor, dt):
 @pytest.mark.parametrize("depth", [16])
 # finn_dtype
 @pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR])  # , DataType.INT2])
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
 
     # generate input data
diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46391daf629e97c24c2950aefad3cbc5055c345
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import numpy as np
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+
+
+def make_accpool_modelwrapper(ch, pe, idim, idt):
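+    """Build a single-node model wrapping a GlobalAccPool_Batch op that
+    reduces a (1, idim, idim, ch) input to a (1, 1, 1, ch) output."""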
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, idim, idim, ch])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, ch])
+
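+    # GlobalAccPool_Batch is a FINN custom op executed by the fpgadataflow backend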
+    accpool_node = helper.make_node(
+        "GlobalAccPool_Batch",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        NumChannels=ch,
+        PE=pe,
+        inputDataType=idt.name,
+        numInputVectors=[1, idim, idim],
+    )
+    graph = helper.make_graph(
+        nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="accpool-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+
+    return model
+
+
+def prepare_inputs(input_tensor, idt):
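+    # idt is unused here but kept to match the prepare_inputs signature used by other tests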
+    return {"inp": input_tensor}
+
+
+# data type
+@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT16])
+# channels
+@pytest.mark.parametrize("ch", [64])
+# folding
+@pytest.mark.parametrize("fold", [-1, 2, 1])
+# image dimension
+@pytest.mark.parametrize("imdim", [7])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode):
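+    # derive the PE parallelism from the folding factor (fold == -1 selects PE = 1)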
+    if fold == -1:
+        pe = 1
+    else:
+        pe = ch // fold
+    assert ch % pe == 0
+
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch))
+
+    model = make_accpool_modelwrapper(ch, pe, imdim, idt)
+
+    if exec_mode == "cppsim":
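+        # generate and compile C++ models for each node, then execute with cppsim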
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
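+        # synthesize HLS IP for each node and prepare a node-by-node rtlsim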
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # prepare input data and execute
+    input_dict = prepare_inputs(x, idt)
+    y = oxe.execute_onnx(model, input_dict)["outp"]
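+    # global accumulate pooling sums each channel over both spatial dimensions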
+    expected_y = np.sum(x, axis=(1, 2)).flatten()
+
+    assert (y == expected_y).all(), exec_mode + " failed"
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index 30b86d639ae52143320dfdfeb25488bae865b4d2..16100522aa94fd25d234efa1d03edfdc866ca1bb 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -52,6 +52,7 @@ import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor, pynq_part_map
 from finn.util.fpgadataflow import pyverilate_stitched_ip
+from finn.util.test import load_test_checkpoint_or_skip
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
@@ -198,13 +199,14 @@ def create_two_fc_model():
 
 # exec_mode of StreamingDataflowPartition
 # @pytest.mark.parametrize("exec_mode", ["remote_pynq"]) #, "rtlsim"])
+@pytest.mark.vivado
 def test_fpgadataflow_ipstitch_gen_model():  # exec_mode):
     model = create_one_fc_model()
     if model.graph.node[0].op_type == "StreamingDataflowPartition":
         sdp_node = getCustomOp(model.graph.node[0])
         assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
         assert os.path.isfile(sdp_node.get_nodeattr("model"))
-        model = ModelWrapper(sdp_node.get_nodeattr("model"))
+        model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
         model.set_metadata_prop("exec_mode", "remote_pynq")
     model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
@@ -215,8 +217,9 @@ def test_fpgadataflow_ipstitch_gen_model():  # exec_mode):
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx")
 
 
+@pytest.mark.vivado
 def test_fpgadataflow_ipstitch_do_stitch():
-    model = ModelWrapper(
+    model = load_test_checkpoint_or_skip(
         ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx"
     )
     model = model.transform(rvp.ReplaceVerilogRelPaths())
@@ -231,8 +234,11 @@ def test_fpgadataflow_ipstitch_do_stitch():
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx")
 
 
+@pytest.mark.vivado
 def test_fpgadataflow_ipstitch_rtlsim():
-    model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx")
+    model = load_test_checkpoint_or_skip(
+        ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx"
+    )
     model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
     sim = pyverilate_stitched_ip(model)
     exp_io = [
@@ -275,8 +281,11 @@ def test_fpgadataflow_ipstitch_rtlsim():
     assert (rtlsim_res == x).all()
 
 
+@pytest.mark.vivado
 def test_fpgadataflow_ipstitch_pynq_projgen():
-    model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx")
+    model = load_test_checkpoint_or_skip(
+        ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx"
+    )
     model = model.transform(MakePYNQProject(test_pynq_board))
     vivado_pynq_proj_dir = model.get_metadata_prop("vivado_pynq_proj")
     assert vivado_pynq_proj_dir is not None
@@ -284,8 +293,12 @@ def test_fpgadataflow_ipstitch_pynq_projgen():
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx")
 
 
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_fpgadataflow_ipstitch_pynq_synth():
-    model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx")
+    model = load_test_checkpoint_or_skip(
+        ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx"
+    )
     model = model.transform(SynthPYNQProject())
     bitfile = model.get_metadata_prop("vivado_pynq_bitfile")
     assert bitfile is not None
@@ -294,7 +307,9 @@ def test_fpgadataflow_ipstitch_pynq_synth():
 
 
 def test_fpgadataflow_ipstitch_pynq_driver():
-    model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx")
+    model = load_test_checkpoint_or_skip(
+        ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_synth.onnx"
+    )
     model = model.transform(MakePYNQDriver())
     driver_dir = model.get_metadata_prop("pynq_driver_dir")
     assert driver_dir is not None
@@ -303,13 +318,13 @@ def test_fpgadataflow_ipstitch_pynq_driver():
 
 
 def test_fpgadataflow_ipstitch_pynq_deployment_folder():
+    model = load_test_checkpoint_or_skip(
+        ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx"
+    )
     try:
         ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
         if ip == "":
             pytest.skip("PYNQ board IP address not specified")
-        model = ModelWrapper(
-            ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx"
-        )
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
         port = os.getenv("PYNQ_PORT", 22)
@@ -341,7 +356,7 @@ def test_fpgadataflow_ipstitch_remote_execution():
         ip = os.environ["PYNQ_IP"]  # NOQA
         if ip == "":
             pytest.skip("PYNQ board IP address not specified")
-        model = ModelWrapper(
+        model = load_test_checkpoint_or_skip(
             ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_deployment.onnx"
         )
         iname = "inp"
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index e324b8ac7d0d97df93d7207e81479e4d81830741..237e54aa17747c38fd76d8c1ede46744dbe21da6 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -90,6 +90,7 @@ def prepare_inputs(input_tensor, idt):
 @pytest.mark.parametrize("k", [1, 5])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
 def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode):
     if fold == -1:
         pe = 1
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index ac4ab33469c7720c3d7b9f30f5d13be888e1439d..bda66bebbd93d346eb0026b17cbaff9a7ca5df5e 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -121,6 +121,8 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("ifm_ch", [1, 2])  # , 2, 3, 4])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"])
+@pytest.mark.slow
+@pytest.mark.vivado
 def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
     stride = k
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
diff --git a/tests/transformation/test_infer_datatypes.py b/tests/transformation/test_infer_datatypes.py
index 77b6a94f8ed891a4fe761fe864a6e18d35e84382..e3db40289c4318894cf5ad41c2f67b3bff501db9 100644
--- a/tests/transformation/test_infer_datatypes.py
+++ b/tests/transformation/test_infer_datatypes.py
@@ -54,8 +54,8 @@ def test_infer_datatypes():
     assert model.get_tensor_datatype("MatMul_1_out0") == DataType.INT32
     assert model.get_tensor_datatype("MatMul_2_out0") == DataType.INT32
     assert model.get_tensor_datatype("MatMul_3_out0") == DataType.INT32
-    assert model.get_tensor_datatype("Sign_0_out0") == DataType.BIPOLAR
-    assert model.get_tensor_datatype("Sign_1_out0") == DataType.BIPOLAR
-    assert model.get_tensor_datatype("Sign_2_out0") == DataType.BIPOLAR
-    assert model.get_tensor_datatype("Sign_3_out0") == DataType.BIPOLAR
+    assert model.get_tensor_datatype("MultiThreshold_0_out0") == DataType.BIPOLAR
+    assert model.get_tensor_datatype("MultiThreshold_1_out0") == DataType.BIPOLAR
+    assert model.get_tensor_datatype("MultiThreshold_2_out0") == DataType.BIPOLAR
+    assert model.get_tensor_datatype("MultiThreshold_3_out0") == DataType.BIPOLAR
     os.remove(export_onnx_path)
diff --git a/tests/util/test_data_packing.py b/tests/util/test_data_packing.py
index 28f1d56d0dbc5451ccad3d36b4b1d4c6bed4f63e..7b77c4be20c1f41c11b53a9b65b79441c9bbbe47 100644
--- a/tests/util/test_data_packing.py
+++ b/tests/util/test_data_packing.py
@@ -47,6 +47,7 @@ from finn.util.data_packing import (
 
 @pytest.mark.parametrize("dtype", [DataType.BINARY, DataType.INT2, DataType.INT32])
 @pytest.mark.parametrize("test_shape", [(1, 2, 4), (1, 1, 64), (2, 64)])
+@pytest.mark.vivado
 def test_npy2apintstream(test_shape, dtype):
     ndarray = cutil.gen_finn_dt_tensor(dtype, test_shape)
     test_dir = cutil.make_build_dir(prefix="test_npy2apintstream_")