diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index 335dfec04e4abee41f914c5d912ce291a0d31a91..a533e4d36629f57f7c4a576570d75a1e051de5be 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -79,6 +79,12 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
+    # remove stale output file from local dir, if any
+    try:
+        os.remove("{}/output.npy".format(deployment_dir))
+    except FileNotFoundError:
+        pass
+    # copy generated output to local
     cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/output.npy {}".format(
         pynq_password,
         pynq_port,
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
index 39e71efa8c5ca3cef786b35a45b89d9231d1115d..8d3dabcf8af51327d5d951464c6d9b36e2f67497 100644
--- a/src/finn/core/throughput_test.py
+++ b/src/finn/core/throughput_test.py
@@ -33,7 +33,8 @@ import subprocess
 def throughput_test(model, batchsize=1000):
     """Runs the throughput test for the given model remotely on the pynq board.
     The metadata properties related to the pynq board have to be set.
-    Returns a dictionary with results of the throughput test"""
+    Returns a dictionary with the results of the throughput test, or None
+    if the metrics file could not be retrieved from the board."""
 
     pynq_ip = model.get_metadata_prop("pynq_ip")
     pynq_port = int(model.get_metadata_prop("pynq_port"))
@@ -62,6 +63,12 @@ def throughput_test(model, batchsize=1000):
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
 
+    # remove any stale local metrics file so a failed copy is detectable
+    try:
+        os.remove("{}/nw_metrics.txt".format(deployment_dir))
+    except FileNotFoundError:
+        pass
+
     cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
         pynq_password,
         pynq_port,
@@ -75,7 +82,9 @@ def throughput_test(model, batchsize=1000):
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
 
-    with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
-        res = eval(file.read())
-
-    return res
+    try:
+        with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
+            res = eval(file.read())
+        return res
+    except FileNotFoundError:
+        return None
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 7ad555ef7fd1f1a8708ced605020f67b5d04985b..ab9fd03251819aee72f74cc0c1fa17b99b1e05a4 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -91,7 +91,7 @@ cd %s
 
 pynq_driver_template = """
 import argparse
-
+import os
 from pynq import Overlay
 import numpy as np
 from pynq import allocate
@@ -207,6 +207,12 @@ if __name__ == "__main__":
     # for the remote execution the data from the input npy file has to be loaded,
     # packed and copied to the PYNQ buffer
     if exec_mode == "execute":
+        # remove old output file to prevent reusing old output
+        # in case execution fails
+        try:
+            os.remove(outputfile)
+        except FileNotFoundError:
+            pass
         # load desired input .npy file
         ibuf_normal = np.load(inputfile)
         ibuf_folded = finnDriver.fold_input(ibuf_normal)
@@ -217,10 +223,15 @@ if __name__ == "__main__":
 
     # for the throughput test the runtime of the network has to be measured
     if exec_mode == "throughput_test":
-        # measure runtime of network
-        start = time.time()
+        # remove old metrics file
+        try:
+            os.remove("nw_metrics.txt")
+        except FileNotFoundError:
+            pass
         # dictionary for results of throughput test
         res={}
+        # measure runtime of network
+        start = time.time()
 
     # execute accelerator
     finnDriver.execute()
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index bc413bf665e96be1d58a5de13b0744fd6a80f855..3880bb9591e27af5fe9d063dba2485d304e4db54 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -43,6 +43,13 @@ pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1"
 pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1"
 pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e"
 
+# native AXI HP port width (in bits) for PYNQ boards
+pynq_native_port_width = dict()
+pynq_native_port_width["Pynq-Z1"] = 64
+pynq_native_port_width["Pynq-Z2"] = 64
+pynq_native_port_width["Ultra96"] = 128
+pynq_native_port_width["ZCU104"] = 128
+
 
 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index e6d1fc4efd61c01654ee88638698215d23a82eb3..c3359dcc82650bf0e9e8a5bc5276f5ca770ee96c 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -76,7 +76,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
-target_clk_ns = 5
+target_clk_ns = 10
 mem_mode = "decoupled"
 
 
diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py
index c7be7ae33ef296ce673a4f11948d4a9998d67bed..66a93a190061e0142637be19bb2ea841d192745a 100644
--- a/tests/pynq/test_pynq_performance_end2end.py
+++ b/tests/pynq/test_pynq_performance_end2end.py
@@ -11,6 +11,7 @@ build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 
 
 @pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"])
+@pytest.mark.slow
 def test_pynq_performance_end2end(end2end_example):
     model = load_test_checkpoint_or_skip(
         build_dir + "/end2end_%s_pynq_deploy.onnx" % end2end_example
@@ -21,11 +22,18 @@ def test_pynq_performance_end2end(end2end_example):
         if ip == "" or board == "":
             pytest.skip("PYNQ board or IP address not specified")
         ret = dict()
-        bsize_range = [1, 10, 100, 1000, 10000]
-        for bsize in bsize_range:
+        # try a range of batch sizes, some may fail due to insufficient DMA
+        # buffers
+        bsize_range_in = [2 ** i for i in range(16)]
+        bsize_range = []
+        for bsize in bsize_range_in:
             res = throughput_test(model, bsize)
-            assert res is not None
-            ret[bsize] = res
+            if res is not None:
+                ret[bsize] = res
+                bsize_range.append(bsize)
+            else:
+                # assume the largest workable batch size was exceeded
+                break
 
         y = [ret[key]["runtime[ms]"] for key in bsize_range]
         lrret = linregress(bsize_range, y)
diff --git a/tests/pynq/test_pynq_performance_fifo.py b/tests/pynq/test_pynq_performance_fifo.py
index 7aaff9f6ed2af9d05bb4f3a805628ced9792dc35..1d4542473c4b58d3baa62f4123fd0f2f76954d95 100644
--- a/tests/pynq/test_pynq_performance_fifo.py
+++ b/tests/pynq/test_pynq_performance_fifo.py
@@ -17,7 +17,7 @@ from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
 import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.util.basic import pynq_part_map
+from finn.util.basic import pynq_part_map, pynq_native_port_width
 from finn.core.throughput_test import throughput_test
 from scipy.stats import linregress
 import warnings
@@ -60,8 +60,9 @@ def test_pynq_performance_fifo():
         board = os.environ["PYNQ_BOARD"]  # NOQA
         if ip == "" or board == "":
             pytest.skip("PYNQ board or IP address not specified")
-        shape = (1, 128)
-        folded_shape = (1, 1, 128)
+        fifo_width = pynq_native_port_width[board]
+        shape = (1, fifo_width)
+        folded_shape = (1, 1, fifo_width)
         depth = 16
         clk_ns = 10
         dtype = DataType.BIPOLAR
@@ -84,11 +85,18 @@ def test_pynq_performance_fifo():
         model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
 
         ret = dict()
-        bsize_range = [1, 10, 100, 1000, 10000, 100000]
-        for bsize in bsize_range:
+        # try a range of batch sizes, some may fail due to insufficient DMA
+        # buffers
+        bsize_range_in = [2 ** i for i in range(20)]
+        bsize_range = []
+        for bsize in bsize_range_in:
             res = throughput_test(model, bsize)
-            assert res is not None
-            ret[bsize] = res
+            if res is not None:
+                ret[bsize] = res
+                bsize_range.append(bsize)
+            else:
+                # assume the largest workable batch size was exceeded
+                break
 
         y = [ret[key]["runtime[ms]"] for key in bsize_range]
         lrret = linregress(bsize_range, y)