diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index 335dfec04e4abee41f914c5d912ce291a0d31a91..a533e4d36629f57f7c4a576570d75a1e051de5be 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -79,6 +79,12 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
+    # remove stale output file from local dir, if any
+    try:
+        os.remove("{}/output.npy".format(deployment_dir))
+    except FileNotFoundError:
+        pass
+    # copy generated output to local
     cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/output.npy {}".format(
         pynq_password,
         pynq_port,
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
index c82d540e29fc59b92a22bf011e823a9f8c076843..8d3dabcf8af51327d5d951464c6d9b36e2f67497 100644
--- a/src/finn/core/throughput_test.py
+++ b/src/finn/core/throughput_test.py
@@ -30,10 +30,11 @@ import os
 import subprocess
 
 
-def throughput_test(model):
+def throughput_test(model, batchsize=1000):
     """Runs the throughput test for the given model remotely on the pynq board.
     The metadata properties related to the pynq board have to be set.
-    Returns a dictionary with results of the throughput test"""
+    Returns a dictionary with the results of the throughput test, or None
+    if the test fails."""
 
     pynq_ip = model.get_metadata_prop("pynq_ip")
     pynq_port = int(model.get_metadata_prop("pynq_port"))
@@ -47,7 +48,8 @@ def throughput_test(model):
     cmd = (
         "sshpass -p {} ssh {}@{} -p {} "
         '"cd {}/{}; echo "{}" | '
-        'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=1000"'
+        'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=%d"'
+        % batchsize
     ).format(
         pynq_password,
         pynq_username,
@@ -61,6 +63,12 @@ def throughput_test(model):
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
 
+    # remove any pre-existing metrics file
+    try:
+        os.remove("{}/nw_metrics.txt".format(deployment_dir))
+    except FileNotFoundError:
+        pass
+
     cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
         pynq_password,
         pynq_port,
@@ -74,7 +82,9 @@ def throughput_test(model):
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
 
-    with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
-        res = eval(file.read())
-
-    return res
+    try:
+        with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
+            res = eval(file.read())
+        return res
+    except FileNotFoundError:
+        return None
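throughput_test now takes a batchsize argument and returns None when nw_metrics.txt never makes it back from the board. A minimal caller sketch, assuming a ModelWrapper whose PYNQ deployment metadata is already set (as after DeployToPYNQ); the result keys mirror what the generated driver writes into nw_metrics.txt:

    from finn.core.throughput_test import throughput_test

    res = throughput_test(model, batchsize=100)
    if res is None:
        # remote run or scp failed, so no metrics file was produced
        print("throughput test failed")
    else:
        print("runtime: %f ms" % res["runtime[ms]"])
        print("throughput: %f images/s" % res["throughput[images/s]"])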
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 049ede5064d252bd6391184c4227e5367a8c1e2b..18d3db18da089a5dda4dbb6d97180dd4a20613b5 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -107,6 +107,13 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
 
+        # clock settings for driver
+        clk_ns = float(model.get_metadata_prop("clk_ns"))
+        fclk_mhz = 1 / (clk_ns * 0.001)
+        # TODO change according to PYNQ board?
+        driver = driver.replace("$CLK_NAME$", "fclk0_mhz")
+        driver = driver.replace("$CLOCK_FREQ_MHZ$", str(fclk_mhz))
+
         with open(driver_py, "w") as f:
             f.write(driver)
         # copy all the dependencies into the driver folder
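The clock settings block converts the clk_ns metadata (target clock period in nanoseconds) into a frequency in MHz for the driver template: fclk_mhz = 1 / (clk_ns * 0.001), i.e. 1000 / clk_ns. A short sketch of the arithmetic, using the 10 ns target period that the tests below also use:

    # period (ns) to frequency (MHz): f = 1 / (T * 1e-3) = 1000 / T
    clk_ns = 10.0                      # target period used in the tests below
    fclk_mhz = 1 / (clk_ns * 0.001)    # -> 100.0 MHz
    assert abs(fclk_mhz - 1000.0 / clk_ns) < 1e-9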
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 55ecb57decd2ac4fa08331b5ebbcb7fd2f0cd5c6..ab9fd03251819aee72f74cc0c1fa17b99b1e05a4 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -91,7 +91,7 @@ cd %s
 
 pynq_driver_template = """
 import argparse
-
+import os
 from pynq import Overlay
 import numpy as np
 from pynq import allocate
@@ -101,6 +101,7 @@ from finn.util.data_packing import (
     packed_bytearray_to_finnpy
 )
 from finn.core.datatype import DataType
+from pynq.ps import Clocks
 
 class FINNAccelDriver():
     def __init__(self, N, bitfile):
@@ -118,8 +119,12 @@ class FINNAccelDriver():
         self.oshape_folded = $OUTPUT_SHAPE_FOLDED$
         self.ishape_packed = $INPUT_SHAPE_PACKED$   # datatype np.uint8
         self.oshape_packed = $OUTPUT_SHAPE_PACKED$  # datatype np.uint8
+        # clock frequency
+        self.fclk_mhz = $CLOCK_FREQ_MHZ$
         # load bitfile and set up accelerator
         self.ol = Overlay(bitfile)
+        # set the clock frequency as specified by the user during transformations
+        Clocks.$CLK_NAME$ = self.fclk_mhz
         self.dma = self.ol.axi_dma_0
         self.ctrl_regs = self.ol.resize_accel_0
         # neuron folding factor of output = iterations per sample
@@ -202,6 +207,12 @@ if __name__ == "__main__":
     # for the remote execution the data from the input npy file has to be loaded,
     # packed and copied to the PYNQ buffer
     if exec_mode == "execute":
+        # remove old output file to prevent reusing old output
+        # in case execution fails
+        try:
+            os.remove(outputfile)
+        except FileNotFoundError:
+            pass
         # load desired input .npy file
         ibuf_normal = np.load(inputfile)
         ibuf_folded = finnDriver.fold_input(ibuf_normal)
@@ -212,10 +223,15 @@ if __name__ == "__main__":
 
     # for the throughput test the runtime of the network has to be measured
     if exec_mode == "throughput_test":
-        # measure runtime of network
-        start = time.time()
+        # remove old metrics file
+        try:
+            os.remove("nw_metrics.txt")
+        except FileNotFoundError:
+            pass
         # dictionary for results of throughput test
         res={}
+        # measure runtime of network
+        start = time.time()
 
     # execute accelerator
     finnDriver.execute()
@@ -228,6 +244,8 @@ if __name__ == "__main__":
         res["throughput[images/s]"] = N / runtime
         res["DRAM_in_bandwidth[Mb/s]"] = np.prod(finnDriver.ishape_packed)*0.000001 / runtime
         res["DRAM_out_bandwidth[Mb/s]"] = np.prod(finnDriver.oshape_packed)*0.000001 / runtime
+        res["fclk[mhz]"] = Clocks.fclk0_mhz
+        res["N"] = N
         file = open("nw_metrics.txt", "w")
         file.write(str(res))
         file.close()
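The driver template now programs the PL clock through pynq.ps.Clocks before running the accelerator and reports the resulting frequency in the throughput metrics. A minimal sketch of that API, runnable only on the board itself with the pynq package installed (fclk0_mhz is the attribute the template currently fills in for $CLK_NAME$):

    from pynq.ps import Clocks

    # reading returns the current PL clock frequency in MHz;
    # assigning reprograms the clock, as the generated driver does in __init__
    print(Clocks.fclk0_mhz)
    Clocks.fclk0_mhz = 100.0
    print(Clocks.fclk0_mhz)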
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index bc413bf665e96be1d58a5de13b0744fd6a80f855..3880bb9591e27af5fe9d063dba2485d304e4db54 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -43,6 +43,13 @@ pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1"
 pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1"
 pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e"
 
+# native AXI HP port width (in bits) for PYNQ boards
+pynq_native_port_width = dict()
+pynq_native_port_width["Pynq-Z1"] = 64
+pynq_native_port_width["Pynq-Z2"] = 64
+pynq_native_port_width["Ultra96"] = 128
+pynq_native_port_width["ZCU104"] = 128
+
 
 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index e6d1fc4efd61c01654ee88638698215d23a82eb3..c3359dcc82650bf0e9e8a5bc5276f5ca770ee96c 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -76,7 +76,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
-target_clk_ns = 5
+target_clk_ns = 10
 mem_mode = "decoupled"
 
 
diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1.py
similarity index 98%
rename from tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
rename to tests/end2end/test_end2end_tfc_w1a1.py
index 1ba149687bb80a0f977115bd380a09f70eef23f1..15c1c41b006c6f87d79a0e7eb6a4458838de5fd2 100644
--- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -41,7 +41,6 @@ import onnx.numpy_helper as nph
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
 from finn.core.onnx_exec import execute_onnx
-from finn.core.throughput_test import throughput_test
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
@@ -332,9 +331,6 @@ def test_end2end_tfc_w1a1_run_on_pynq():
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
         assert np.isclose(y, y_golden).all()
-        child_model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
-        res = throughput_test(child_model)
-        assert res is not None
 
     except KeyError:
         pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py
new file mode 100644
index 0000000000000000000000000000000000000000..66a93a190061e0142637be19bb2ea841d192745a
--- /dev/null
+++ b/tests/pynq/test_pynq_performance_end2end.py
@@ -0,0 +1,65 @@
+import os
+
+import pytest
+import numpy as np
+from scipy.stats import linregress
+import warnings
+from finn.util.test import load_test_checkpoint_or_skip
+from finn.core.throughput_test import throughput_test
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+
+
+@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"])
+@pytest.mark.slow
+def test_pynq_performance_end2end(end2end_example):
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_%s_pynq_deploy.onnx" % end2end_example
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        board = os.environ["PYNQ_BOARD"]  # NOQA
+        if ip == "" or board == "":
+            pytest.skip("PYNQ board or IP address not specified")
+        ret = dict()
+        # try a range of batch sizes; some may fail due to insufficient DMA
+        # buffers
+        bsize_range_in = [2 ** i for i in range(16)]
+        bsize_range = []
+        for bsize in bsize_range_in:
+            res = throughput_test(model, bsize)
+            if res is not None:
+                ret[bsize] = res
+                bsize_range.append(bsize)
+            else:
+                # assume we reached largest possible N
+                break
+
+        y = [ret[key]["runtime[ms]"] for key in bsize_range]
+        lrret = linregress(bsize_range, y)
+        ret_str = ""
+        ret_str += "\n" + "%s Throughput Test Results" % end2end_example
+        ret_str += "\n" + "-----------------------------"
+        ret_str += "\n" + "From linear regression:"
+        ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept
+        ret_str += "\n" + "Time per sample: %f ms" % lrret.slope
+        ret_str += "\n" + "Raw data:"
+
+        ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]"
+        )
+        for k in bsize_range:
+            v = ret[k]
+            ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+                k,
+                np.round(v["runtime[ms]"], 4),
+                v["fclk[mhz]"],
+                np.round(v["throughput[images/s]"], 2),
+                np.round(v["DRAM_in_bandwidth[Mb/s]"], 2),
+                np.round(v["DRAM_out_bandwidth[Mb/s]"], 2),
+            )
+        ret_str += "\n" + "-----------------------------"
+        warnings.warn(ret_str)
+
+    except KeyError:
+        pytest.skip("PYNQ board or IP address not specified")
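The regression treats the measured runtime as approximately intercept + slope * N, so the intercept estimates the fixed invocation overhead and the slope the marginal time per sample. A sketch of how a steady-state throughput bound could be read off the same kind of fit, using hypothetical measurements rather than data from an actual board:

    from scipy.stats import linregress

    # hypothetical batch sizes and runtimes (ms), runtime ~= 1.1 + 0.1 * N
    bsizes = [1, 2, 4, 8, 16]
    runtimes_ms = [1.2, 1.3, 1.5, 1.9, 2.7]
    fit = linregress(bsizes, runtimes_ms)
    overhead_ms = fit.intercept        # fixed cost per invocation (~1.1 ms)
    per_sample_ms = fit.slope          # marginal cost per image (~0.1 ms)
    # for large N the overhead amortizes, so throughput approaches
    max_fps = 1000.0 / per_sample_ms   # ~10000 images/s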
diff --git a/tests/pynq/test_pynq_performance_fifo.py b/tests/pynq/test_pynq_performance_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4542473c4b58d3baa62f4123fd0f2f76954d95
--- /dev/null
+++ b/tests/pynq/test_pynq_performance_fifo.py
@@ -0,0 +1,128 @@
+import os
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import pynq_part_map, pynq_native_port_width
+from finn.core.throughput_test import throughput_test
+from scipy.stats import linregress
+import warnings
+
+
+def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape)
+
+    FIFO_node = helper.make_node(
+        "StreamingFIFO",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        depth=Depth,
+        folded_shape=fld_shape,
+        dataType=str(finn_dtype.name),
+    )
+
+    graph = helper.make_graph(
+        nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="fifo-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", finn_dtype)
+    model.set_tensor_datatype("outp", finn_dtype)
+
+    return model
+
+
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_pynq_performance_fifo():
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        board = os.environ["PYNQ_BOARD"]  # NOQA
+        if ip == "" or board == "":
+            pytest.skip("PYNQ board or IP address not specified")
+        fifo_width = pynq_native_port_width[board]
+        shape = (1, fifo_width)
+        folded_shape = (1, 1, fifo_width)
+        depth = 16
+        clk_ns = 10
+        dtype = DataType.BIPOLAR
+        fpga_part = pynq_part_map[board]
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+
+        model = make_single_fifo_modelwrapper(shape, depth, folded_shape, dtype)
+        model = model.transform(InsertTLastMarker())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP(fpga_part, clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(rvp.ReplaceVerilogRelPaths())
+        model = model.transform(CreateStitchedIP(fpga_part, clk_ns))
+        model = model.transform(MakePYNQProject(board))
+        model = model.transform(SynthPYNQProject())
+        model = model.transform(MakePYNQDriver())
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+
+        ret = dict()
+        # try a range of batch sizes; some may fail due to insufficient DMA
+        # buffers
+        bsize_range_in = [2 ** i for i in range(20)]
+        bsize_range = []
+        for bsize in bsize_range_in:
+            res = throughput_test(model, bsize)
+            if res is not None:
+                ret[bsize] = res
+                bsize_range.append(bsize)
+            else:
+                # assume we reached largest possible N
+                break
+
+        y = [ret[key]["runtime[ms]"] for key in bsize_range]
+        lrret = linregress(bsize_range, y)
+        ret_str = ""
+        ret_str += "\n" + "FIFO Throughput Test Results"
+        ret_str += "\n" + "-----------------------------"
+        ret_str += "\n" + "From linear regression:"
+        ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept
+        ret_str += "\n" + "Time per sample: %f ms" % lrret.slope
+        ret_str += "\n" + "Raw data:"
+
+        ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]"
+        )
+        for k in bsize_range:
+            v = ret[k]
+            ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+                k,
+                np.round(v["runtime[ms]"], 4),
+                v["fclk[mhz]"],
+                np.round(v["throughput[images/s]"], 2),
+                np.round(v["DRAM_in_bandwidth[Mb/s]"], 2),
+                np.round(v["DRAM_out_bandwidth[Mb/s]"], 2),
+            )
+        ret_str += "\n" + "-----------------------------"
+        warnings.warn(ret_str)
+
+    except KeyError:
+        pytest.skip("PYNQ board or IP address not specified")