diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
new file mode 100644
index 0000000000000000000000000000000000000000..cd59a629405c748187cdf478c0bdb0694c58c79f
--- /dev/null
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -0,0 +1,21 @@
+name: QuicktestPRAgainstDev
+
+on:
+  pull_request:
+    branches: [ dev ]
+  push:
+    branches: [ dev ]
+
+
+jobs:
+
+  test:
+    name: Run quicktest on PR branch
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: checkout
+        uses: actions/checkout@v2
+
+      - name: DockerRunQuicktest
+        run: sh run-docker.sh quicktest
diff --git a/AUTHORS.rst b/AUTHORS.rst
index e231e61d38991e11e2e43a7c9a3a78c50c878244..eb1e06e54b7eb6deedd3e7f8392bb3aa257e7dc6 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -6,3 +6,5 @@ Contributors
 * Jakoba Petri-Koenig (@auphelia)
 * Andrea Rigoni (@AndreaRigoni)
 * Hendrik Borras (@HenniOVP)
+* Lucian Petrica (@quetric)
+* Tobias Alonso (@Tobi-Alonso)
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 0d610ec66a5f433d156f4e8da976767ce6458aef..2668927602ebb8de5fdc3d7c25b20a0c8c4a2e55 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -47,7 +47,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
 RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
 # FINN hlslib
-RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 # PyVerilator
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 1c2cb19d14137b866b55417522fdebb8e0d7ad90..1200c7d5d15bbd62e15f19f84e70d5fe0b8aca28 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -76,7 +76,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
 RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
 # FINN hlslib
-RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 # PyVerilator
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 30dae7d2bd37516a887ba5ca20c1398af75905f3..ef6b65cedf12cd07df391c5045519d11f7ff6db0 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -15,7 +15,7 @@ gecho () {
 # the repos themselves are cloned in the Dockerfile
 BREVITAS_COMMIT=093de7d138c6715dbcaf82a9e1d530069327ad98
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=6b88db826bb023937506913a23d964775a7606af
+HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716
 PYVERILATOR_COMMIT=1d89cb0d4e0c97469cc6352c611f876ec13edfa6
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 
diff --git a/requirements.txt b/requirements.txt
index 2427f9490a3dd5a7ffe0e0a8cf2ad19af0934cdf..6b8e4d02c8ca1dcdbe607aabdccd27cec8056332 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ pre-commit
 pyverilator
 scipy
 sphinx
+toposort
 wget
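The new toposort requirement suggests FINN now topologically sorts graph nodes by their dependencies. A minimal sketch of the library's API, with an illustrative dependency dict that is not taken from FINN itself:

    from toposort import toposort_flatten

    # each key maps to the set of items it depends on
    deps = {"c": {"b"}, "b": {"a"}, "a": set()}
    print(toposort_flatten(deps, sort=True))  # -> ['a', 'b', 'c']
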
diff --git a/run-docker.sh b/run-docker.sh
index e1f17e728204217ff3caa6e486b2daae16d6d271..e07556716db335421f57a390f1e6a17168ac058b 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -96,6 +96,8 @@ gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
 gecho "Using default PYNQ board $PYNQ_BOARD"
 
+DOCKER_INTERACTIVE=""
+
 if [ "$1" = "test" ]; then
         gecho "Running test suite (all tests)"
         DOCKER_CMD="python setup.py test"
@@ -108,6 +110,7 @@ elif [ "$1" = "notebook" ]; then
 else
         gecho "Running container only"
         DOCKER_CMD="bash"
+        DOCKER_INTERACTIVE="-it"
 fi
 
 # Build the FINN Docker image
@@ -123,7 +126,7 @@ docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \
 # Launch container with current directory mounted
 # important to pass the --init flag here for correct Vivado operation, see:
 # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
-docker run -t --rm --name $DOCKER_INST_NAME -it --init \
+docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \
 --hostname $DOCKER_INST_NAME \
 -e "XILINX_VIVADO=$VIVADO_PATH" \
 -e "SHELL=/bin/bash" \
diff --git a/src/finn/core/modelwrapper.py b/src/finn/core/modelwrapper.py
index cdf99dc3bd8b698bec60d79ef6e34640ac3b740c..646add188c5d475cf37ccd33cf24d29d61754ae1 100644
--- a/src/finn/core/modelwrapper.py
+++ b/src/finn/core/modelwrapper.py
@@ -259,11 +259,10 @@ class ModelWrapper:
 
     def find_producer(self, tensor_name):
         """Finds and returns the node that produces the tensor with given name."""
-        ret = None
         for x in self._model_proto.graph.node:
             if tensor_name in x.output:
-                ret = x
-        return ret
+                return x
+        return None
 
     def find_upstream(self, tensor_name, finder_fxn):
         """Follow the producer chain upstream, calling finder_fxn on each upstream
@@ -333,6 +332,22 @@ class ModelWrapper:
         else:
             return None
 
+    def is_fork_node(self, node):
+        """Checks if the given node is a fork, that is, the node has multiple
+        direct successors."""
+        direct_successors = self.find_direct_successors(node)
+        is_fork = False if direct_successors is None else (len(direct_successors) > 1)
+        return is_fork
+
+    def is_join_node(self, node):
+        """Checks if the given node is a join, that is, the node has multiple
+        direct predecessors."""
+        direct_predecessors = self.find_direct_predecessors(node)
+        is_join = (
+            False if direct_predecessors is None else (len(direct_predecessors) > 1)
+        )
+        return is_join
+
     def get_all_tensor_names(self):
         """Returns a list of all (input, output and value_info) tensor names
         in the graph."""
@@ -494,3 +509,41 @@ class ModelWrapper:
             qa.tensor_name = tensor_name
             qa.quant_parameter_tensor_names.append(dt)
             qnt_annotations.append(qa)
+
+    def get_tensor_sparsity(self, tensor_name):
+        """Returns the sparsity of a given tensor as dictionary."""
+        graph = self._model_proto.graph
+        qnt_annotations = graph.quantization_annotation
+        ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name")
+        if ret is not None:
+            ret = util.get_by_name(
+                ret.quant_parameter_tensor_names, "tensor_sparsity", "key"
+            )
+            if ret is not None:
+                return eval(ret.value)
+        return None
+
+    def set_tensor_sparsity(self, tensor_name, sparsity_dict):
+        """Sets the sparsity annotation of a tensor with given name."""
+        graph = self._model_proto.graph
+        qnt_annotations = graph.quantization_annotation
+        ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name")
+        if ret is not None:
+            ret_ts = util.get_by_name(
+                ret.quant_parameter_tensor_names, "tensor_sparsity", "key"
+            )
+            if ret_ts is not None:
+                ret_ts.value = str(sparsity_dict)
+            else:
+                ts = onnx.StringStringEntryProto()
+                ts.key = "tensor_sparsity"
+                ts.value = str(sparsity_dict)
+                ret.quant_parameter_tensor_names.append(ts)
+        else:
+            qa = onnx.TensorAnnotation()
+            dt = onnx.StringStringEntryProto()
+            dt.key = "tensor_sparsity"
+            dt.value = str(sparsity_dict)
+            qa.tensor_name = tensor_name
+            qa.quant_parameter_tensor_names.append(dt)
+            qnt_annotations.append(qa)
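A quick usage sketch of the new ModelWrapper helpers added above (is_fork_node, is_join_node, get_tensor_sparsity, set_tensor_sparsity); the model file, node index, tensor name and sparsity dict below are illustrative only:

    from finn.core.modelwrapper import ModelWrapper

    model = ModelWrapper("model.onnx")  # assumed to exist
    node = model.graph.node[0]
    # fork = multiple direct successors, join = multiple direct predecessors
    print(model.is_fork_node(node), model.is_join_node(node))
    # sparsity annotations are stored as stringified dicts in the ONNX
    # quantization_annotation field and recovered with eval()
    model.set_tensor_sparsity("MatMul_0_param0", {"dw": {"kernel_shape": 3}})
    print(model.get_tensor_sparsity("MatMul_0_param0"))
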
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index 335dfec04e4abee41f914c5d912ce291a0d31a91..a533e4d36629f57f7c4a576570d75a1e051de5be 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -79,6 +79,12 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
+    # remove stale output file from local dir, if any
+    try:
+        os.remove("{}/output.npy".format(deployment_dir))
+    except FileNotFoundError:
+        pass
+    # copy generated output to local
     cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/output.npy {}".format(
         pynq_password,
         pynq_port,
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
index c82d540e29fc59b92a22bf011e823a9f8c076843..8d3dabcf8af51327d5d951464c6d9b36e2f67497 100644
--- a/src/finn/core/throughput_test.py
+++ b/src/finn/core/throughput_test.py
@@ -30,10 +30,11 @@ import os
 import subprocess
 
 
-def throughput_test(model):
+def throughput_test(model, batchsize=1000):
     """Runs the throughput test for the given model remotely on the pynq board.
     The metadata properties related to the pynq board have to be set.
-    Returns a dictionary with results of the throughput test"""
+    Returns a dictionary with results of the throughput test. Returns None
+    if the test fails."""
 
     pynq_ip = model.get_metadata_prop("pynq_ip")
     pynq_port = int(model.get_metadata_prop("pynq_port"))
@@ -47,7 +48,8 @@ def throughput_test(model):
     cmd = (
         "sshpass -p {} ssh {}@{} -p {} "
         '"cd {}/{}; echo "{}" | '
-        'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=1000"'
+        'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=%d"'
+        % batchsize
     ).format(
         pynq_password,
         pynq_username,
@@ -61,6 +63,12 @@ def throughput_test(model):
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
 
+    # remove any pre-existing metrics file
+    try:
+        os.remove("{}/nw_metrics.txt".format(deployment_dir))
+    except FileNotFoundError:
+        pass
+
     cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
         pynq_password,
         pynq_port,
@@ -74,7 +82,9 @@ def throughput_test(model):
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
 
-    with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
-        res = eval(file.read())
-
-    return res
+    try:
+        with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
+            res = eval(file.read())
+        return res
+    except FileNotFoundError:
+        return None
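A sketch of calling the updated throughput_test, assuming a model whose PYNQ deployment metadata (pynq_ip, pynq_port, pynq_username, pynq_password, pynq_target_dir, pynq_deployment_dir) is already set:

    from finn.core.throughput_test import throughput_test

    res = throughput_test(model, batchsize=10000)
    if res is None:
        print("throughput test failed: no nw_metrics.txt was retrieved")
    else:
        # res holds the metrics dict written by the PYNQ driver into
        # nw_metrics.txt (e.g. runtime and throughput entries)
        print(res)
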
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index a99d62fd18a958d37fef6b3e1939ef97e859b0b2..a688898f4a43b33fd3f07cda12144b84829e451f 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -40,6 +40,7 @@ from finn.util.basic import (
 from finn.util.fpgadataflow import (
     IPGenBuilder,
     pyverilate_get_liveness_threshold_cycles,
+    rtlsim_multi_io,
 )
 from . import templates
 
@@ -109,6 +110,31 @@ class HLSCustomOp(CustomOp):
         )
         return verilog_file
 
+    def get_all_verilog_paths(self):
+        "Return list of all folders containing Verilog code for this node."
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        assert (
+            code_gen_dir != ""
+        ), """Node attribute "code_gen_dir_ipgen" is
+        not set. Please run HLSSynthIP first."""
+        verilog_path = "{}/project_{}/sol1/impl/verilog/".format(
+            code_gen_dir, self.onnx_node.name
+        )
+        # default impl only returns the HLS verilog codegen dir
+        return [verilog_path]
+
+    def get_all_verilog_filenames(self):
+        "Return list of all Verilog files used for this node."
+
+        verilog_files = []
+        verilog_paths = self.get_all_verilog_paths()
+        for verilog_path in verilog_paths:
+            for f in os.listdir(verilog_path):
+                if f.endswith(".v"):
+                    verilog_files += [f]
+        return verilog_files
+
     def prepare_rtlsim(self):
         """Creates a Verilator emulation library for the RTL code generated
         for this node, sets the rtlsim_so attribute to its path and returns
@@ -116,24 +142,15 @@ class HLSCustomOp(CustomOp):
 
         if PyVerilator is None:
             raise ImportError("Installation of PyVerilator is required.")
-        # ensure that code is generated
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        assert (
-            code_gen_dir != ""
-        ), """Node attribute "code_gen_dir_ipgen" is
-        not set. Please run HLSSynthIP first."""
-        verilog_file = self.get_verilog_top_filename()
-        assert os.path.isfile(verilog_file), "Cannot find top-level Verilog file."
+        verilog_paths = self.get_all_verilog_paths()
+        verilog_files = self.get_all_verilog_filenames()
         # build the Verilator emu library
         sim = PyVerilator.build(
-            verilog_file,
+            verilog_files,
             build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
-            verilog_path=[
-                "{}/project_{}/sol1/impl/verilog/".format(
-                    code_gen_dir, self.onnx_node.name
-                )
-            ],
+            verilog_path=verilog_paths,
             trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
         )
         # save generated lib filename in attribute
         self.set_nodeattr("rtlsim_so", sim.lib._name)
@@ -202,6 +219,7 @@ class HLSCustomOp(CustomOp):
         self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
         self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
 
+
         template = self.ipgentcl_template
 
         for key in self.code_gen_dict:
@@ -217,7 +235,7 @@ class HLSCustomOp(CustomOp):
     def ipgen_extra_directives(self):
         "Return a list of extra tcl directives for HLS synthesis."
         return []
-
+        
     def ipgen_singlenode_code(self):
         """Builds the bash script for ip generation using the IPGenBuilder from
         finn.util.fpgadataflow."""
@@ -302,14 +320,24 @@ Found no codegen dir for this node, did you run the prepare_cppsim transformatio
             )
 
     def npy_to_dynamic_output(self, context):
-        """Reads the output from a .npy file and saves it at the right place in
-        the context dictionary."""
-        # TODO support multi-output nodes as needed
+        """Reads the output from an output.npy file generated from cppsim and
+        places its content into the context dictionary."""
         node = self.onnx_node
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         output = np.load("{}/output.npy".format(code_gen_dir))
         context[node.output[0]] = output
 
+    def npy_to_dynamic_outputs(self, context, npy_list):
+        """Reads the output from .npy files generated from cppsim and places
+        their content into the context dictionary.
+        npy_list is a list specifying which files to read, and its order must
+        match the order of node outputs."""
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        for i in range(len(npy_list)):
+            output = np.load("{}/{}".format(code_gen_dir, npy_list[i]))
+            context[node.output[i]] = output
+
     def exec_precompiled_singlenode_model(self):
         """Executes precompiled executable."""
         executable_path = self.get_nodeattr("executable_path")
@@ -405,6 +433,16 @@ compilation transformations?
             sim.stop_vcd_trace()
         return outputs
 
+    def rtlsim_multi_io(self, sim, io_dict):
+        "Run rtlsim for this node, supports multiple i/o streams."
+
+        trace_file = self.get_nodeattr("rtlsim_trace")
+        if trace_file == "default":
+            trace_file = self.onnx_node.name + ".vcd"
+        num_out_values = self.get_number_output_values()
+        total_cycle_count = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file)
+        self.set_nodeattr("sim_cycles", total_cycle_count)
+
     def execute_node(self, context, graph):
         """Executes single node using cppsim or rtlsim."""
         mode = self.get_nodeattr("exec_mode")
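The new rtlsim_multi_io wrapper expects an io_dict with one entry per input and output stream; a minimal sketch of its shape, based on how DuplicateStreams_Batch uses it further down (op_inst, sim, rtlsim_inp and the stream names are placeholders):

    io_dict = {
        "inputs": {"in0": rtlsim_inp},        # packed input words per stream
        "outputs": {"out0": [], "out1": []},  # filled in place during simulation
    }
    op_inst.rtlsim_multi_io(sim, io_dict)
    # afterwards io_dict["outputs"]["out0"] / ["out1"] hold the raw output words
    # and the node's "sim_cycles" attribute records the total cycle count
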
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index e4d106068d4d128c66b2ce5f3d6c925dfe414b90..3e40ad70208909551365c51324153859ccc79ceb 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -41,10 +41,19 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 # output 0 is the output tensor, shape NHWC:
 #     = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels)
 
+# note: the actual data layout produced by the hlslib kernels is different
+# for depthwise and non-depthwise ops.
+# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD)
+# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD)
+# see test_fpgadataflow_slidingwindow.py for an example of how to transform
+# between the two layouts
+
 
 class ConvolutionInputGenerator(HLSCustomOp):
-    """Class that corresponds to finn-hlslib ConvolutionInputGenerator
-    (sliding window) function."""
+    """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants. Depending on the combination of
+    attributes (e.g. depthwise or not, whether k % stride is 0) a different
+    variant will be picked for the actual HLS implementation."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
@@ -60,6 +69,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0),
             # FPGA resource type for ConvolutionInputGenerator input buffer
             # auto -- let Vivado HLS decide
             # block -- use BRAM
@@ -106,7 +116,6 @@ class ConvolutionInputGenerator(HLSCustomOp):
         pad = 0
         ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        assert k % stride == 0, "stride must divide kernel size k"
         wf = int((k * k * ifm_ch) // simd)
         folded_oshape = (1, ofm_dim, ofm_dim, wf, simd)
         return folded_oshape
@@ -305,12 +314,35 @@ class ConvolutionInputGenerator(HLSCustomOp):
 
     def docompute(self):
         node = self.onnx_node
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
-                OFMDim1, SIMD1, Stride1> (in0, out, numReps);""".format(
-                node.op_type
-            )
-        ]
+        ram_style = self.get_nodeattr("ram_style")
+        map_to_hls_ram_style = {
+            "auto": "ap_resource_dflt()",
+            "block": "ap_resource_bram()",
+            "distributed": "ap_resource_lutram()",
+            "ultra": "ap_resource_uram()",
+        }
+        hls_ram_style = map_to_hls_ram_style[ram_style]
+        hls_call = node.op_type
+        # check if non optimized ConvolutionInputGenerator is needed
+        k = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        if k % stride != 0:
+            hls_call += "_kernel_stride"
+
+        if self.get_nodeattr("depthwise") == 1:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}_dws<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
+                    OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
+                    OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -356,17 +388,3 @@ class ConvolutionInputGenerator(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
-
-    def ipgen_extra_directives(self):
-        # add directive to control input buffer memory resources
-        ram_style = self.get_nodeattr("ram_style")
-        map_to_hls_ram_style = {
-            "auto": "RAM_2P",
-            "block": "RAM_2P_BRAM",
-            "distributed": "RAM_2P_LUTRAM",
-            "ultra": "RAM_2P_URAM",
-        }
-        hls_ram_style = map_to_hls_ram_style[ram_style]
-        directive = "set_directive_resource -core %s " % hls_ram_style
-        directive += "ConvolutionInputGenerator inputBuf"
-        return [directive]
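A hedged numpy sketch of the depthwise vs. non-depthwise layout difference described in the comment block above (the actual conversion used in the tests lives in test_fpgadataflow_slidingwindow.py; sizes here are illustrative):

    import numpy as np

    ofm_dim, k, ifm_ch, simd = 4, 3, 8, 2
    # non-depthwise SWG layout: (1, OFMDim, OFMDim, K, K, IFMCh//SIMD, SIMD)
    non_dw = np.random.rand(1, ofm_dim, ofm_dim, k, k, ifm_ch // simd, simd)
    # move the channel-fold axis before the kernel axes to get the depthwise
    # layout: (1, OFMDim, OFMDim, IFMCh//SIMD, K, K, SIMD)
    dw = non_dw.transpose(0, 1, 2, 5, 3, 4, 6)
    assert dw.shape == (1, ofm_dim, ofm_dim, ifm_ch // simd, k, k, simd)
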
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..54051af5e0387081a23e1f8fa77ec9e363098830
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -0,0 +1,361 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+
+import numpy as np
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class DuplicateStreams_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib function of the same name."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "NumChannels": ("i", True, 0),
+            "PE": ("i", True, 0),
+            # FINN DataTypes for input
+            "inputDataType": ("s", True, ""),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        ch = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [ch])
+        return ishape
+
+    def get_folded_input_shape(self):
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        assert ch % pe == 0, "PE must divide NumChannels"
+        folds = int(ch / pe)
+        folded_ishape = tuple(vecs + [folds, pe])
+        return folded_ishape
+
+    def get_normal_output_shape(self):
+        return self.get_normal_input_shape()
+
+    def get_folded_output_shape(self):
+        return self.get_folded_input_shape()
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        split_input = np.concatenate((values, values), axis=0)
+        return helper.make_node(
+            "Split",
+            inputs=[split_input],
+            outputs=[self.onnx_node.output[0], self.onnx_node.output[1]],
+            value=helper.make_tensor(
+                name="const_tensor", data_type=TensorProto.FLOAT, axis=0
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required GlobalAccPool_Batch attributes do not exist."""
+            )
+
+        return info_messages
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_instream_width(self):
+        """Returns input stream width."""
+        ibits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = pe * ibits
+        return in_width
+
+    def get_outstream_width(self):
+        """Returns output stream width."""
+        obits = self.get_output_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        out_width = pe * obits
+        return out_width
+
+    def get_number_output_values(self):
+        return 2 * np.prod(self.get_folded_output_shape()[1:-1])
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert inp.shape == exp_ishape, """Input shape doesn't match expected shape."""
+        export_idt = self.get_input_datatype()
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_outputs(context, ["output0.npy", "output1.npy"])
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim \
+            did not produce expected folded output shape"
+            assert (
+                context[node.output[1]].shape == folded_oshape
+            ), "cppsim \
+            did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+            context[node.output[1]] = context[node.output[1]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_dict = {
+                "inputs": {"in0": rtlsim_inp},
+                "outputs": {"out0": [], "out1": []},
+            }
+            self.rtlsim_multi_io(sim, rtlsim_dict)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_shape = self.get_folded_output_shape()
+
+            out_npy_path = "{}/output0.npy".format(code_gen_dir)
+            rtlsim_output_to_npy(
+                rtlsim_dict["outputs"]["out0"],
+                out_npy_path,
+                odt,
+                out_shape,
+                packed_bits,
+                target_bits,
+            )
+            # load and reshape output 0
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+
+            out_npy_path = "{}/output1.npy".format(code_gen_dir)
+            rtlsim_output_to_npy(
+                rtlsim_dict["outputs"]["out1"],
+                out_npy_path,
+                odt,
+                out_shape,
+                packed_bits,
+                target_bits,
+            )
+            # load and reshape output 1
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[1]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output0 shape doesn't match expected shape."""
+        assert (
+            context[node.output[1]].shape == exp_oshape
+        ), """Output1 shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out0 ("out0");'.format(self.get_outstream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out1 ("out1");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format(
+                self.get_outstream_width(), self.get_number_output_values() // 2,
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output0.npy" % code_gen_dir
+        npy_out1 = "%s/output1.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out0, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] += [
+            'apintstream2npy<%s, %s, %d, %s>(out1, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out1,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out0,
+                hls::stream<ap_uint<{}>> &out1)""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out0")
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out1")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
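For reference, a DuplicateStreams_Batch node for a small test graph could be built roughly as follows; all tensor names and attribute values here are made up for illustration:

    from onnx import helper

    dup_node = helper.make_node(
        "DuplicateStreams_Batch",
        ["inp"],
        ["outp0", "outp1"],
        domain="finn",
        backend="fpgadataflow",
        NumChannels=64,
        PE=8,
        inputDataType="INT8",
        numInputVectors=[1, 4, 4],
    )
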
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa321dfa65d14b67fa218fb6a49f602ddab8d57e
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/fmpadding.py
@@ -0,0 +1,302 @@
+import os
+import numpy as np
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class FMPadding_Batch(HLSCustomOp):
+    """Corresponds to finn-hlslib FMPadding_Batch function.
+    Pads input image by given amount."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("i", True, 0),
+            # total padding (per dimension) to apply
+            "Padding": ("i", True, 2),
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # controls distribution of padded pixels
+            # in case of uneven padding -- see FMPadding fxn
+            # in hlslib
+            "PaddingStyle": ("i", False, 2),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_padded_odim(self):
+        "Return the padded spatial size of the output."
+
+        idim = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        return idim + pad
+
+    def get_normal_input_shape(self):
+        idim = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+
+        ishape = (1, idim, idim, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self):
+        odim = self.get_padded_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        oshape = (1, odim, odim, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self):
+        # even though there is no folding in the current hlslib op,
+        # insert a time multiplexing axis to remain compatible with the
+        # shapes produced by the rest of the dataflow pipeline
+        ret = list(self.get_normal_input_shape())
+        ret.insert(-1, 1)
+        return tuple(ret)
+
+    def get_folded_output_shape(self):
+        # even though there is no folding in the current hlslib op,
+        # insert a time multiplexing axis to remain compatible with the
+        # shapes produced by the rest of the dataflow pipeline
+        ret = list(self.get_normal_output_shape())
+        ret.insert(-1, 1)
+        return tuple(ret)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpect input shape for SameResize."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        exp_idtype = self.get_input_datatype()
+        assert dtype == exp_idtype, "Unexpected datatype for FMPadding_Batch"
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        # the hlslib op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
+        return ret
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        return ibits * num_ch
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        return obits * num_ch
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define ImgDim1 {}\n#define OutputDim1 {}\n
+            #define Padding1 {}\n#define NumChannels1 {}\n
+            #define PaddingStyle1 {}\n#define numReps {}\n""".format(
+                self.get_nodeattr("ImgDim"),
+                self.get_padded_odim(),
+                self.get_nodeattr("Padding"),
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PaddingStyle"),
+                self.get_nodeattr("numInputVectors"),
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        in_t = self.get_input_datatype().get_hls_datatype_str()
+        node = self.onnx_node
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,
+            {}, PaddingStyle1> (in0, out, numReps);""".format(
+                node.op_type, in_t
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ImgDim, ImgDim, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        # no reshaping for input since assuming no folding on input
+        # make copy before saving array
+        inp = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (1, OutputDim, OutputDim, NumChannels)."""
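The shape bookkeeping in FMPadding_Batch reduces to ImgDim + Padding; a tiny worked example with illustrative values:

    img_dim, padding, num_ch = 28, 2, 16
    odim = img_dim + padding                # get_padded_odim()
    ishape = (1, img_dim, img_dim, num_ch)  # get_normal_input_shape()
    oshape = (1, odim, odim, num_ch)        # get_normal_output_shape()
    print(ishape, "->", oshape)             # (1, 28, 28, 16) -> (1, 30, 30, 16)
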
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index f650442401b49f1ad0a602b6b2ad3e50fbb5e5c2..9b73ba1e100aa83fd19aa8799195c99891fca3fd 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -513,40 +513,44 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         elif mem_mode == "decoupled":
             """Saves weights in corresponding file format for cppsim or rtlsim"""
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
-            # and save as unflipped weight tensor to be able to differentiate between
-            # flipped an unflipped weight tensor (has to be flipped for cppsim)
-
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
 
-            # flip PE dimension and reverse SIMD flip for saving weights in .npy
-            weight_tensor_flipped = np.flip(weight_tensor_unflipped, axis=-2)
-            weight_tensor_flipped = np.flip(weight_tensor_flipped, axis=-1)
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
 
-            # reshape weight tensor (flipped and unflipped) to desired shape
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
-            # unflipped
-            weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd)
-            weight_tensor_unflipped = weight_tensor_unflipped.copy()
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
             # flipped
-            weight_tensor_flipped = weight_tensor_flipped.reshape(1, -1, pe * simd)
-            weight_tensor_flipped = weight_tensor_flipped.copy()
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
 
             """Saves weights into .npy file"""
-            np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped)
+            np.save(
+                os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped
+            )
 
             """Saves weights into .dat file"""
             # convert weight values into hexstring
             weight_width = self.get_weightstream_width()
             # pad to nearest 4 bits to get hex strings
             weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-            weight_tensor_unflipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_unflipped, export_wdt, weight_width_padded, prefix=""
+            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
             )
-            weight_stream_len = np.prod(weight_tensor_unflipped.shape)
+            weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
             factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
-            weight_stream = weight_tensor_unflipped.flatten()
+            weight_stream = weight_tensor_pe_flipped.flatten()
             pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
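A hedged numpy illustration of the decoupled-mode weight reordering above: transpose (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD), then flip the SIMD axis for the .npy export and the PE axis for the .dat export (the folding factors below are illustrative):

    import numpy as np

    pe, wmem, simd = 2, 3, 4
    w = np.arange(pe * wmem * simd).reshape(1, pe, wmem, simd)
    w_unflipped = np.transpose(w, (0, 2, 1, 3))       # (1, WMEM, PE, SIMD)
    w_simd_flipped = np.flip(w_unflipped, axis=-1)    # saved to weights.npy
    w_pe_flipped = np.flip(w_unflipped, axis=-2)      # packed into the .dat stream
    print(w_simd_flipped.reshape(1, -1, pe * simd).shape)  # (1, 3, 8)
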
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa33c70218fab16f106da45e296f0d59ae4ea606
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -0,0 +1,551 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from math import ceil
+import os
+
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    rtlsim_output_to_npy,
+)
+from . import templates
+
+# ONNX i/o tensor shape assumptions for Thresholding:
+# input 0 is the input tensor, shape (..., NumChannels)
+# input 1 is the threshold tensor, shape (NumChannels, n_thres)
+# output 0 is the output tensor, shape (..., NumChannels) - same as input
+# the ... here can be any shape (representing groups of vectors)
+
+
+class Thresholding_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hls Thresholding_Batch function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.decoupled_wrapper = templates.decoupled_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "PE": ("i", True, 0),
+            "NumChannels": ("i", True, 0),
+            # string defining memory type
+            "ram_style": ("s", False, "distributed"),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 0),
+            "outFIFODepth": ("i", False, 0),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM."""
+        mh = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        return mh // pe
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt_name = self.get_input_datatype().name
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        assert exp_idt_name == idt_name, "Bad input DataType for Thresholding layer"
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            self.get_nodeattr("outputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required Thresholding_Batch attributes do not exist."""
+            )
+
+        return info_messages
+
+    def bram_estimation(self):
+        """Calculates BRAM cost if resource set to BRAM"""
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+
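+        # worked example with hypothetical values: A=4 (input bits), P=16,
+        # tmem=64 in "block" ram_style -> ceil(4*16/16) * ceil(64/1024) = 4 BRAMs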
+        if style == "block" and tmem > 1:
+            return int(ceil(A * P / 16)) * int(ceil(tmem / 1024))
+        else:
+            return 0
+
+    def lut_estimation(self):
+        """Calculates LUT cost, taking memory resource type into account """
+        # TODO add in/out FIFO contributions
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+        # cost of comparators
+        comparator_cost = A * P
+        # cost of LUTRAM
+        if style == "distributed" and tmem > 1:
+            lutram_cost = P * A * int(ceil(tmem / 64))
+        else:
+            lutram_cost = 0
+        # total cost
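+        # e.g. hypothetical A=4, P=16, tmem=64 with "distributed" ram_style:
+        # comparators 4*16 = 64 LUTs, LUTRAM 16*4*ceil(64/64) = 64 LUTs -> 128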
+        return comparator_cost + lutram_cost
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
+        i_bits = self.get_input_datatype().bitwidth()
+        return i_bits * self.get_nodeattr("PE")
+
+    def get_outstream_width(self):
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
+
+    def get_folded_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        fold = ich // pe
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        folded_input_shape = tuple(vecs + [fold, pe])
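+        # e.g. hypothetical numInputVectors=[1, 32, 32], NumChannels=64, PE=16
+        # yields a folded shape of (1, 32, 32, 4, 16)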
+        return folded_input_shape
+
+    def get_folded_output_shape(self):
+        # same shape as input
+        return self.get_folded_input_shape()
+
+    def get_normal_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_input_shape = tuple(vecs + [ich])
+        return normal_input_shape
+
+    def get_normal_output_shape(self):
+        # same shape as input
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        # fill in TSrcI
+        ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0
+        * for unsigned inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
+        mh = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        tmem = mh // pe
+        assert mh % pe == 0, "Requirement NumChannels divisible by PE is violated."
+        assert (
+            orig_thres_matrix.ndim == 2
+        ), """Threshold matrix dimension is
+        not as expected (2)."""
+        n_thres_steps = orig_thres_matrix.shape[1]
+        if not self.get_input_datatype().signed():
+            # ensure all thresholds are nonnegative
+            assert (orig_thres_matrix >= 0).all()
+        # ensure all thresholds are integer
+        assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
+        ret = orig_thres_matrix
+        # ensure channels = mh, duplicating if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (mh, 1))
+        assert (
+            ret.shape[0] == mh
+        ), "Channels of threshold matrix are not as expected (mh)"
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+        assert (
+            ret.shape[2] == n_thres_steps
+        ), """Third dimension after distribution of the
+        rows between PEs is not as expected (n_thres_steps)"""
+        return ret.reshape(1, pe, tmem, n_thres_steps)
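+
+    # illustrative sketch of the transform above (hypothetical sizes): a (4, 2)
+    # threshold matrix with PE=2 is interleaved to shape (2, 2, 2) and returned
+    # as (1, pe, tmem, n_thres_steps) = (1, 2, 2, 2)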
+
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        # save thresholds in thresh.h
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+
+        threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+        tdt = DataType.INT32
+        thresholds_hls_code = numpy_to_hls_code(
+            threshold_tensor, tdt, "thresholds", False, True
+        )
+        # write thresholds into thresh.h
+        f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
+        tdt_hls = tdt.get_hls_datatype_str()
+        # use binary to export bipolar activations
+        export_odt = self.get_output_datatype()
+        if self.get_output_datatype() == DataType.BIPOLAR:
+            export_odt = DataType.BINARY
+        odt_hls = export_odt.get_hls_datatype_str()
+        f_thresh.write(
+            "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
+            = ".format(
+                self.calc_tmem(),
+                self.get_nodeattr("PE"),
+                threshold_tensor.shape[-1],
+                tdt_hls,
+                odt_hls,
+                export_odt.min(),
+                "std::less_equal<%s>" % tdt_hls,
+            )
+        )
+        f_thresh.write(thresholds_hls_code)
+        f_thresh.close()
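+
+    # for illustration, the generated declaration in thresh.h takes roughly
+    # this form (hypothetical parameter values, INT32 thresholds, UINT4 output):
+    #   static ThresholdsActivation<4, 16, 15, ap_int<32>, ap_uint<4>, 0,
+    #       std::less_equal<ap_int<32>>> threshs = { ... };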
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create an npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # and the second input holds the thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                if self.get_input_datatype() == DataType.BIPOLAR:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType.BINARY
+                else:
+                    export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for StreamingFCLayer")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType.BIPOLAR:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_folded_output_shape()
+            ), """Output shape is not as expected"""
+            # reshape output to have expected shape
+            oshape = self.get_normal_output_shape()
+            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+    # TODO check and add whatever missing
+    def defines(self, var):
+        numInputVectors = list(self.get_nodeattr("numInputVectors"))
+        numReps = numInputVectors[0]
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format(
+                self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps,
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        tmpl_args = self.get_template_param_values()
+        # TODO: why put some template parameters into defines and not others?
+        # should ImgDim be defined or just filled in here like we do now?
+        node = self.onnx_node
+        ishape = self.get_folded_input_shape()
+        if len(ishape) == 3:
+            imgdim = 1
+        elif len(ishape) == 5:
+            imgdim = ishape[1]
+        else:
+            raise Exception("""Unexpeted input shape""")
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<{}, NumChannels1, PE1, {}, {}>
+            (in0, out, threshs, numReps);""".format(
+                node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out
+                )""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+        # partition for parallel access along PE and N_THRES
+        # dimensions (dims 1 and 3)
+        self.code_gen_dict["$PRAGMAS$"].append(
+            (
+                "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                "complete dim=1"
+            )
+        )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            (
+                "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                "complete dim=3"
+            )
+        )
+        # set resource type
+        ram_style = self.get_nodeattr("ram_style")
+        pe = self.get_nodeattr("PE")
+        ich = self.get_nodeattr("NumChannels")
+        # if PE is less than NumChannels, assign memory cores according to
+        # ram_style; otherwise (PE == NumChannels) Vivado HLS unrolls into FFs
+        if pe < ich:
+            if ram_style == "distributed":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                        "core=ROM_2P_LUTRAM"
+                    )
+                )
+            elif ram_style == "block":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                        "core=ROM_2P_BRAM"
+                    )
+                )
+            else:
+                raise Exception(
+                    """Invalid value for attribute ram_style! Is currently set to: {}
+                has to be set to one of ("block", "distributed")""".format(
+                        ram_style
+                    )
+                )
diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py
index 16446c15d46ee7996162f864708f7fde6cfedaf3..82a6b140f7af1be4e5c0f429d077b99c7865383e 100644
--- a/src/finn/custom_op/im2col.py
+++ b/src/finn/custom_op/im2col.py
@@ -21,8 +21,6 @@ def get_im2col_indices_nchw(
     """Returns im2col indices."""
     # First figure out what the size of the output should be
     N, C, H, W = x_shape
-    assert (H + 2 * padding - field_height) % stride_y == 0
-    assert (W + 2 * padding - field_width) % stride_x == 0
     out_height = compute_conv_output_dim(H, field_height, stride_y, padding)
     out_width = compute_conv_output_dim(W, field_width, stride_x, padding)
 
@@ -70,6 +68,9 @@ def im2col_indices_nchw(
 # * ifm is the number of input channels
 # * k is the convolutional kernel size
 
+# note: for the innermost (dot product) dimension of k*k*ifm, we
+# assume an internal ordering (k, k, ifm)
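+# e.g. for k=2 and ifm=3 the innermost dimension of length k*k*ifm = 12
+# enumerates the ifm channels fastest, then steps through the k*k kernel positions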
+
 
 class Im2Col(CustomOp):
     def get_nodeattr_types(self):
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 2dbff507c7b84a430da0d45563af6ab99f1971e8..2dae826cf9712bef17d0053a0878c41ef51fec36 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,9 +44,12 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
+from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch
+from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 from finn.custom_op.quantavgpool2d import QuantAvgPool2d
+from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
 
 # create a mapping of all known CustomOp names and classes
 custom_op = {}
@@ -63,9 +66,12 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
+custom_op["FMPadding_Batch"] = FMPadding_Batch
+custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 custom_op["QuantAvgPool2d"] = QuantAvgPool2d
+custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
 
 
 def getCustomOp(node):
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index dbd98623c4cdf5baca9fa9c137debf8be0f70981..d421a5f3ef8ca980b399087de1482b2ae913da1b 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -26,13 +26,14 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from onnx import helper
+from onnx import helper, TensorProto
 
 from finn.core.datatype import DataType
 from finn.transformation import Transformation
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
+import finn.core.data_layout as DataLayout
 
 
 class InferConvInpGen(Transformation):
@@ -58,27 +59,61 @@ class InferConvInpGen(Transformation):
                 ifm_ch = i2c_in_shape[-1]
                 ifm_dim = i2c_in_shape[1]
                 ofm_dim = i2c_out_shape[1]
-                # if padding enabled, ensure pad_val supported by DataType
+
+                # default params for ConvolutionInputGenerator
+                ConvInpGen_node_idx = node_ind
+                ConvInpGen_input = i2c_input
+                ConvInpGen_idim = ifm_dim
+
                 if pad > 0:
+                    # if padding enabled, ensure pad_val supported by DataType
                     assert dt.allowed(pad_val), "Im2Col DataType must support pad_val"
+
+                    odim_padding = ifm_dim + 2 * pad
+
+                    padding_out = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, odim_padding, odim_padding, ifm_ch),
+                    )
+                    graph.value_info.append(padding_out)
+                    padding_out = padding_out.name
+                    model.set_tensor_datatype(padding_out, dt)
+
+                    ConvInpGen_node_idx += 1
+                    ConvInpGen_input = padding_out
+                    ConvInpGen_idim = odim_padding
+
+                    padding_node = helper.make_node(
+                        "FMPadding_Batch",
+                        [i2c_input],
+                        [padding_out],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        ImgDim=ifm_dim,
+                        Padding=2 * pad,
+                        NumChannels=ifm_ch,
+                        inputDataType=dt.name,
+                    )
+                    graph.node.insert(node_ind, padding_node)
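+                    # resulting chain (sketch): (input) -> FMPadding_Batch ->
+                    # (padded input) -> ConvolutionInputGenerator -> (im2col output)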
+
                 # create equivalent ConvolutionInputGenerator node
-                # TODO support padding
-                new_node = helper.make_node(
+                ConvInpGen_node = helper.make_node(
                     "ConvolutionInputGenerator",
-                    [i2c_input],
+                    [ConvInpGen_input],
                     [i2c_output],
                     domain="finn",
                     backend="fpgadataflow",
                     ConvKernelDim=k,
                     IFMChannels=ifm_ch,
-                    IFMDim=ifm_dim,
+                    IFMDim=ConvInpGen_idim,
                     OFMDim=ofm_dim,
                     SIMD=ifm_ch,
                     Stride=stride,
                     inputDataType=dt.name,
                     outputDataType=dt.name,
                 )
-                graph.node.insert(node_ind, new_node)
+                graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
@@ -398,3 +433,59 @@ class InferQuantizedStreamingFCLayer(Transformation):
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferThresholdingLayer(Transformation):
+    """Convert any MultiThreshold into a standalone thresholding HLS layer."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "MultiThreshold":
+                thl_input = node.input[0]
+                thl_threshold = node.input[1]
+                thl_output = node.output[0]
+                thl_in_shape = model.get_tensor_shape(thl_input)
+                idt = model.get_tensor_datatype(thl_input)
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # skip conversion if input is not NHWC or NC
+                thl_in_layout = model.get_tensor_layout(thl_input)
+                if thl_in_layout != DataLayout.NHWC and thl_in_layout != DataLayout.NC:
+                    continue
+
+                # now safe to assume number of channels is in last dimension
+                ifc = int(thl_in_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+                assert ifc % pe == 0, "Requirement IFC divisible by PE is violated."
+
+                odt = model.get_tensor_datatype(thl_output)
+                # create and insert new Thresholding_Batch node
+                new_node = helper.make_node(
+                    "Thresholding_Batch",
+                    [thl_input, thl_threshold],
+                    [thl_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=ifc,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    outputDataType=odt.name,
+                    numInputVectors=list(thl_in_shape[:-1]),
+                )
+                graph.node.insert(node_ind, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
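+
+
+# usage sketch: typically applied after streamlining together with the other
+# to-HLS conversions, e.g. model = model.transform(InferThresholdingLayer()),
+# turning standalone MultiThreshold nodes into Thresholding_Batch layers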
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 049ede5064d252bd6391184c4227e5367a8c1e2b..18d3db18da089a5dda4dbb6d97180dd4a20613b5 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -107,6 +107,13 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
 
+        # clock settings for driver
+        clk_ns = float(model.get_metadata_prop("clk_ns"))
+        fclk_mhz = 1 / (clk_ns * 0.001)
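+        # e.g. clk_ns = 10.0 corresponds to fclk_mhz = 100.0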
+        # TODO change according to PYNQ board?
+        driver = driver.replace("$CLK_NAME$", "fclk0_mhz")
+        driver = driver.replace("$CLOCK_FREQ_MHZ$", str(fclk_mhz))
+
         with open(driver_py, "w") as f:
             f.write(driver)
         # copy all the dependencies into the driver folder
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 55ecb57decd2ac4fa08331b5ebbcb7fd2f0cd5c6..ab9fd03251819aee72f74cc0c1fa17b99b1e05a4 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -91,7 +91,7 @@ cd %s
 
 pynq_driver_template = """
 import argparse
-
+import os
 from pynq import Overlay
 import numpy as np
 from pynq import allocate
@@ -101,6 +101,7 @@ from finn.util.data_packing import (
     packed_bytearray_to_finnpy
 )
 from finn.core.datatype import DataType
+from pynq.ps import Clocks
 
 class FINNAccelDriver():
     def __init__(self, N, bitfile):
@@ -118,8 +119,12 @@ class FINNAccelDriver():
         self.oshape_folded = $OUTPUT_SHAPE_FOLDED$
         self.ishape_packed = $INPUT_SHAPE_PACKED$   # datatype np.uint8
         self.oshape_packed = $OUTPUT_SHAPE_PACKED$  # datatype np.uint8
+        # clock frequency
+        self.fclk_mhz = $CLOCK_FREQ_MHZ$
         # load bitfile and set up accelerator
         self.ol = Overlay(bitfile)
+        # set the clock frequency as specified by user during transformations
+        Clocks.$CLK_NAME$ = self.fclk_mhz
         self.dma = self.ol.axi_dma_0
         self.ctrl_regs = self.ol.resize_accel_0
         # neuron folding factor of output = iterations per sample
@@ -202,6 +207,12 @@ if __name__ == "__main__":
     # for the remote execution the data from the input npy file has to be loaded,
     # packed and copied to the PYNQ buffer
     if exec_mode == "execute":
+        # remove old output file to prevent reusing old output
+        # in case execution fails
+        try:
+            os.remove(outputfile)
+        except FileNotFoundError:
+            pass
         # load desired input .npy file
         ibuf_normal = np.load(inputfile)
         ibuf_folded = finnDriver.fold_input(ibuf_normal)
@@ -212,10 +223,15 @@ if __name__ == "__main__":
 
     # for the throughput test the runtime of the network has to be measured
     if exec_mode == "throughput_test":
-        # measure runtime of network
-        start = time.time()
+        # remove old metrics file
+        try:
+            os.remove("nw_metrics.txt")
+        except FileNotFoundError:
+            pass
         # dictionary for results of throughput test
         res={}
+        # measure runtime of network
+        start = time.time()
 
     # execute accelerator
     finnDriver.execute()
@@ -228,6 +244,8 @@ if __name__ == "__main__":
         res["throughput[images/s]"] = N / runtime
         res["DRAM_in_bandwidth[Mb/s]"] = np.prod(finnDriver.ishape_packed)*0.000001 / runtime
         res["DRAM_out_bandwidth[Mb/s]"] = np.prod(finnDriver.oshape_packed)*0.000001 / runtime
+        res["fclk[mhz]"] = Clocks.fclk0_mhz
+        res["N"] = N
         file = open("nw_metrics.txt", "w")
         file.write(str(res))
         file.close()
diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py
index 53c73e1dc4fe0bfab53e3f126add992cb338c11d..488391740fc25f1f7caa657adc9ed55bdc2f9722 100644
--- a/src/finn/transformation/general.py
+++ b/src/finn/transformation/general.py
@@ -28,6 +28,7 @@
 
 import finn.util.basic as util
 from finn.transformation import Transformation
+from toposort import toposort_flatten
 
 
 class GiveUniqueNodeNames(Transformation):
@@ -81,6 +82,93 @@ class GiveReadableTensorNames(Transformation):
         return (model, False)
 
 
+class GiveUniqueParameterTensors(Transformation):
+    """Make every parameter tensor unique. The aim is to avoid affecting
+    other nodes apart from the one the system is currently operating on."""
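+
+    # usage sketch: applied like any other transformation, e.g.
+    #   model = model.transform(GiveUniqueParameterTensors())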
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        seen_parameters = []
+        for n in graph.node:
+            # copy inputs since they may be modified
+            node_inputs_list = [x for x in n.input]
+            for input_idx, node_input in enumerate(node_inputs_list):
+                # check if it's a parameter
+                input_init = model.get_initializer(node_input)
+                if input_init is None:
+                    # dynamic input
+                    continue
+
+                # check if repeated
+                if node_input not in seen_parameters:
+                    # first occurrence
+                    seen_parameters += [node_input]
+                    continue
+
+                new_param_name = model.make_new_valueinfo_name()
+
+                model.set_initializer(new_param_name, input_init)
+                model.set_tensor_datatype(
+                    new_param_name, model.get_tensor_datatype(node_input)
+                )
+
+                # point node input to new tensor
+                n.input[input_idx] = new_param_name
+                graph_modified = True
+
+        return (model, graph_modified)
+
+
+class SortGraph(Transformation):
+    """ Returns the model with its node list sorted topologically.
+    Any ONNX graph to be executed must have a topologically sorted node list, as dictated
+    by the ONNX standard.
+    """
+    
+    # Notes on SortGraph performance:
+    #         benchmark in  tests/transformation/test_sort_graph.py
+    # 
+    #         The algorithm doesn't move initializers so its performance should only depend on
+    #         the number of nodes
+    # 
+    #         Relative order of magnitudes for time per step:
+    #             - Gather graph structure:       base
+    #             - Sort nodes:                   0.1 of base
+    #             - Remove and insert in order :  0.001 of base
+    # 
+    #     Notes:
+    #         Remove nodes and insert them in order:
+    #           Probably this is faster than copying initializers and more robust in general
+
+    def apply(self, model):
+        # Gather graph structure
+        graph_dependencies = {}
+        # take a copy of the node list; it is needed again further down to
+        # remove the nodes from the graph before re-inserting them in order
+        node_list = [n for n in model.graph.node]
+        for node_idx, n in enumerate(node_list):
+            node_pred = model.find_direct_predecessors(n)
+            if node_pred is None:
+                # note: this also drops nodes that are floating around, i.e.
+                # have no predecessors and are not a dependency of any node
+                continue
+
+            node_dependencies = [node_list.index(pred) for pred in node_pred]
+            graph_dependencies[node_idx] = set(node_dependencies)
+
+        # Sort nodes
+        sorted_node_indexes = toposort_flatten(graph_dependencies)
+
+        # Remove nodes and insert them in order
+        # (removal is deferred to this point because find_direct_predecessors()
+        # above needs the nodes to still be part of the graph)
+        for n in node_list:
+            model.graph.node.remove(n)
+
+        for new_idx, sorted_idx in enumerate(sorted_node_indexes):
+            model.graph.node.insert(new_idx, node_list[sorted_idx])
+
+        return model, False
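+
+    # usage sketch: typically run after transformations that insert nodes out
+    # of topological order, e.g. model = model.transform(SortGraph())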
+
+
 class ConvertSubToAdd(Transformation):
     """Convert subtract-a-constant nodes to add-a-constant nodes."""
 
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 0d709297a9132b15b51435b7ab4b51ce55c7e9f3..dbcf97361017144174f9fbfca35a84361b5abd26 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -46,7 +46,11 @@ class AbsorbAddIntoMultiThreshold(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Add":
+            if (
+                n.op_type == "Add"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 consumer = model.find_consumer(n.output[0])
                 if consumer is not None and consumer.op_type == "MultiThreshold":
                     add_weight_name = n.input[1]
@@ -83,7 +87,11 @@ class AbsorbMulIntoMultiThreshold(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Mul":
+            if (
+                n.op_type == "Mul"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 mul_weight_name = n.input[1]
                 A = model.get_initializer(mul_weight_name)
                 assert A is not None, "Initializer for mul weights is not set."
diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py
index aa059747b602bc6b659bc8b53b1f18988bba1ef0..67824ad4f633983b93e3178d03118927a1ddd85b 100644
--- a/src/finn/transformation/streamline/collapse_repeated.py
+++ b/src/finn/transformation/streamline/collapse_repeated.py
@@ -48,9 +48,17 @@ class CollapseRepeatedOp(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == self.op_name:
+            if (
+                n.op_type == self.op_name
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 consumer = model.find_consumer(n.output[0])
-                if consumer is not None and consumer.op_type == self.op_name:
+                if (
+                    consumer is not None
+                    and consumer.op_type == self.op_name
+                    and not model.is_join_node(consumer)
+                ):
                     op0_param_name = n.input[1]
                     op1_param_name = consumer.input[1]
                     op0_param = model.get_initializer(op0_param_name)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 1886c785705161c3a13493de44dc3f3f86463f4f..0b6259a61d3eb67b7b38d4c6939019ce2893a875 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -36,8 +36,9 @@ from finn.util.basic import get_by_name
 
 
 class MoveAddPastMul(Transformation):
-    """Move add operations past multiply operations. The aim is to have them
-    next to each other such that they can be collapsed into a single add."""
+    """Move add operations past multiply operations on linear segments of the graph.
+    The aim is to have them next to each other such that they can be collapsed into
+    a single add."""
 
     def apply(self, model):
         graph = model.graph
@@ -45,9 +46,17 @@ class MoveAddPastMul(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Add":
+            if (
+                n.op_type == "Add"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 consumer = model.find_consumer(n.output[0])
-                if consumer is not None and consumer.op_type == "Mul":
+                if (
+                    consumer is not None
+                    and consumer.op_type == "Mul"
+                    and not model.is_join_node(consumer)
+                ):
                     # have: (x) -> add(,B) -> (x+B) -> mul(,A) -> (xA+BA)
                     # want: (x) -> mul(,A) -> (xA) -> add(,BA) -> (xA+BA)
                     # assume input 0 is from the previous layer, input 1 is the
@@ -63,12 +72,16 @@ class MoveAddPastMul(Transformation):
                     end_name = consumer.output[0]
                     # compute new param value for add
                     BA = B * A
+
                     # make and insert new nodes
                     new_mul = oh.make_node(
-                        "Mul", [start_name, mul_weight_name], [middle_name]
+                        "Mul",
+                        [start_name, mul_weight_name],
+                        [middle_name],
+                        name=consumer.name,
                     )
                     new_add = oh.make_node(
-                        "Add", [middle_name, add_weight_name], [end_name]
+                        "Add", [middle_name, add_weight_name], [end_name], name=n.name
                     )
                     graph.node.insert(node_ind, new_mul)
                     graph.node.insert(node_ind + 1, new_add)
@@ -78,6 +91,7 @@ class MoveAddPastMul(Transformation):
                     graph.node.remove(n)
                     graph.node.remove(consumer)
                     graph_modified = True
+
         model = model.transform(InferShapes())
         return (model, graph_modified)
 
@@ -92,9 +106,17 @@ class MoveScalarMulPastMatMul(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Mul":
+            if (
+                n.op_type == "Mul"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 consumer = model.find_consumer(n.output[0])
-                if consumer is not None and consumer.op_type == "MatMul":
+                if (
+                    consumer is not None
+                    and consumer.op_type == "MatMul"
+                    and not model.is_join_node(consumer)
+                ):
                     mul_weight_name = n.input[1]
                     matmul_weight_name = consumer.input[1]
                     A = model.get_initializer(mul_weight_name)
@@ -109,10 +131,16 @@ class MoveScalarMulPastMatMul(Transformation):
                         # if the mul is scalar, we can simply swap the order of ops
                         # make and insert new nodes
                         new_matmul = oh.make_node(
-                            "MatMul", [start_name, matmul_weight_name], [middle_name]
+                            "MatMul",
+                            [start_name, matmul_weight_name],
+                            [middle_name],
+                            name=consumer.name,
                         )
                         new_mul = oh.make_node(
-                            "Mul", [middle_name, mul_weight_name], [end_name]
+                            "Mul",
+                            [middle_name, mul_weight_name],
+                            [end_name],
+                            name=n.name,
                         )
                         graph.node.insert(node_ind, new_matmul)
                         graph.node.insert(node_ind + 1, new_mul)
@@ -135,9 +163,17 @@ class MoveScalarAddPastMatMul(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Add":
+            if (
+                n.op_type == "Add"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 consumer = model.find_consumer(n.output[0])
-                if consumer is not None and consumer.op_type == "MatMul":
+                if (
+                    consumer is not None
+                    and consumer.op_type == "MatMul"
+                    and not model.is_join_node(consumer)
+                ):
                     add_weight_name = n.input[1]
                     matmul_weight_name = consumer.input[1]
                     A = model.get_initializer(add_weight_name)
@@ -155,10 +191,16 @@ class MoveScalarAddPastMatMul(Transformation):
                         # update the add weight
                         model.set_initializer(add_weight_name, Anew)
                         new_matmul = oh.make_node(
-                            "MatMul", [start_name, matmul_weight_name], [middle_name]
+                            "MatMul",
+                            [start_name, matmul_weight_name],
+                            [middle_name],
+                            name=consumer.name,
                         )
                         new_add = oh.make_node(
-                            "Add", [middle_name, add_weight_name], [end_name]
+                            "Add",
+                            [middle_name, add_weight_name],
+                            [end_name],
+                            name=n.name,
                         )
                         graph.node.insert(node_ind, new_matmul)
                         graph.node.insert(node_ind + 1, new_add)
@@ -181,9 +223,17 @@ class MoveScalarAddPastConv(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Add":
+            if (
+                n.op_type == "Add"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 consumer = model.find_consumer(n.output[0])
-                if consumer is not None and consumer.op_type == "Conv":
+                if (
+                    consumer is not None
+                    and consumer.op_type == "Conv"
+                    and not model.is_join_node(consumer)
+                ):
                     conv_node = consumer
                     add_node = n
                     add_weight_name = n.input[1]
@@ -194,7 +244,12 @@ class MoveScalarAddPastConv(Transformation):
                     start_name = n.input[0]
                     end_name = consumer.output[0]
                     conv_out_shape = model.get_tensor_shape(end_name)
-                    if all(x == 1 for x in A.shape):
+
+                    using_padding = True
+                    pads = list(get_by_name(consumer.attribute, "pads").ints)
+                    if sum(pads) == 0:
+                        using_padding = False
+                    if all(x == 1 for x in A.shape) and not using_padding:
                         # create a tensor filled with the add constant, in
                         # the shape expected by the convolution
                         conv_in_const = np.zeros(conv_in_shape, dtype=np.float32)
@@ -206,7 +261,8 @@ class MoveScalarAddPastConv(Transformation):
                         execute_node(conv_node, exec_ctx, model.graph)
                         # retrieve the conv output
                         Anew = exec_ctx[end_name]
-                        # strip out repetition
+
+                        # strip out repetition if no padding
                         Anew = Anew[0, :, 0, 0].reshape(1, -1, 1, 1)
                         # update the add weight
                         model.set_initializer(add_weight_name, Anew)
@@ -224,6 +280,7 @@ class MoveScalarAddPastConv(Transformation):
                         graph.node.remove(add_node)
                         graph.node.insert(node_ind, add_node)
                         graph_modified = True
+
         model = model.transform(InferShapes())
         return (model, graph_modified)
 
@@ -238,9 +295,17 @@ class MoveScalarMulPastConv(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type == "Mul":
+            if (
+                n.op_type == "Mul"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
                 consumer = model.find_consumer(n.output[0])
-                if consumer is not None and consumer.op_type == "Conv":
+                if (
+                    consumer is not None
+                    and consumer.op_type == "Conv"
+                    and not model.is_join_node(consumer)
+                ):
                     mul_weight_name = n.input[1]
                     A = model.get_initializer(mul_weight_name)
                     assert A is not None, "Initializer for mul weights is not set."
@@ -379,3 +444,90 @@ class MakeMaxPoolNHWC(Transformation):
                         graph.node.insert(node_ind - 1, consumer)
                         graph_modified = True
         return (model, graph_modified)
+
+
+class MoveOpPastFork(Transformation):
+    """Move node operations past graph forks. Used when a node before a fork
+     can be merged with nodes in the branches
+    """
+
+    def __init__(self, op_name_list):
+        super().__init__()
+        self.ops_to_move = op_name_list
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        nodes = [n for n in graph.node]
+        node_ind = 0
+        for n in nodes:
+            node_ind += 1
+            if (
+                n.op_type in self.ops_to_move
+                and model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+
+                # Restrict this transform to operations with constant parameters
+                # (the parameter is assumed to be input 1)
+                op_init_param = model.get_initializer(n.input[1])
+                if op_init_param is None:
+                    continue
+
+                # skip the case where all branches are empty and go directly
+                # to the same consumer node
+                consumers = model.find_consumers(n.output[0])
+                unique_consumer = True
+                for consum_node in consumers[1:]:
+                    if consumers[0] != consum_node:
+                        unique_consumer = False
+                        break
+
+                if unique_consumer:
+                    continue
+
+                for consumer_node in consumers[1:]:
+                    # create new node
+                    new_param_name = model.make_new_valueinfo_name()
+                    new_output_tensor_name = model.make_new_valueinfo_name()
+                    new_node = oh.make_node(
+                        n.op_type,
+                        [n.input[0], new_param_name],
+                        [new_output_tensor_name],
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    node_ind += 1
+                    model.set_initializer(new_param_name, op_init_param)
+
+                    # change consumer input tensor
+                    graph.node.remove(consumer_node)
+                    for idx, consumer_input in enumerate(consumer_node.input):
+                        if consumer_input == n.output[0]:
+                            consumer_node.input[idx] = new_output_tensor_name
+                            break
+                    else:
+                        raise Exception(
+                            "Consumer should have the current node output as input"
+                        )
+
+                    graph.node.insert(node_ind, consumer_node)
+
+                graph_modified = True
+
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
+
+
+class MoveAddPastFork(MoveOpPastFork):
+    def __init__(self):
+        super().__init__(["Add"])
+
+
+class MoveMulPastFork(MoveOpPastFork):
+    def __init__(self):
+        super().__init__(["Mul"])
+
+
+class MoveLinearPastFork(MoveOpPastFork):
+    def __init__(self):
+        super().__init__(["Add", "Mul"])
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index bc413bf665e96be1d58a5de13b0744fd6a80f855..3880bb9591e27af5fe9d063dba2485d304e4db54 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -43,6 +43,13 @@ pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1"
 pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1"
 pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e"
 
+# native AXI HP port width (in bits) for PYNQ boards
+pynq_native_port_width = dict()
+pynq_native_port_width["Pynq-Z1"] = 64
+pynq_native_port_width["Pynq-Z2"] = 64
+pynq_native_port_width["Ultra96"] = 128
+pynq_native_port_width["ZCU104"] = 128
+
 
 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index 9a2708439c0fed1e25c0d955af21cd2e9e705446..d1669444e55cb0fddb2690e51849c4603d47d32c 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -83,14 +83,27 @@ def pyverilate_stitched_ip(model):
     def file_to_dir(x):
         return os.path.dirname(os.path.realpath(x))
 
+    def file_to_basename(x):
+        return os.path.basename(os.path.realpath(x))
+
     all_verilog_dirs = list(map(file_to_dir, all_verilog_srcs))
-    top_verilog = model.get_metadata_prop("wrapper_filename")
+    all_verilog_files = list(
+        set(
+            filter(
+                lambda x: x.endswith(".v"),
+                list(map(file_to_basename, all_verilog_srcs)),
+            )
+        )
+    )
+    top_module_name = model.get_metadata_prop("wrapper_filename")
+    # use splitext rather than strip(".v"), which would also strip leading
+    # "v"/"." characters from the module name
+    top_module_name = os.path.splitext(file_to_basename(top_module_name))[0]
     build_dir = make_build_dir("pyverilator_ipstitched_")
     sim = PyVerilator.build(
-        top_verilog,
+        all_verilog_files,
         verilog_path=all_verilog_dirs,
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
+        top_module_name=top_module_name,
     )
     return sim
 
@@ -114,3 +127,91 @@ def is_fpgadataflow_node(node):
                     is_node = True
 
     return is_node
+
+
+def rtlsim_multi_io(sim, io_dict, num_out_values, trace_file=""):
+    """Runs the pyverilator simulation by passing the input values to the simulation,
+    toggle the clock and observing the execution time. Function contains also an
+    observation loop that can abort the simulation if no output value is produced
+    after a set number of cycles. Can handle multiple i/o streams. See function
+    implementation for details on how the top-level signals should be named.
+
+    sim: the PyVerilator object for simulation
+    io_dict: a dict of dicts in the following format:
+            {"inputs" : {"in0" : <input_data>, "in1" : <input_data>},
+             "outputs" : {"out0" : [], "out1" : []} }
+            <input_data> is a list of Python arbitrary-precision ints indicating
+            what data to push into the simulation, and the output lists are
+            similarly filled when the simulation is complete
+    num_out_values: number of total values to be read from the simulation to
+                    finish the simulation and return.
+
+    returns: number of clock cycles elapsed for completion
+
+    """
+
+    if trace_file != "":
+        sim.start_vcd_trace(trace_file)
+
+    for outp in io_dict["outputs"]:
+        sim.io[outp + "_V_V_TREADY"] = 1
+
+    # observe if output is completely calculated
+    # total_cycle_count will contain the number of cycles the calculation ran
+    output_done = False
+    total_cycle_count = 0
+    output_count = 0
+    old_output_count = 0
+
+    # avoid infinite looping of simulation by aborting when there is no change in
+    # output values after 100 cycles
+    no_change_count = 0
+    liveness_threshold = pyverilate_get_liveness_threshold_cycles()
+
+    while not (output_done):
+        for inp in io_dict["inputs"]:
+            inputs = io_dict["inputs"][inp]
+            sim.io[inp + "_V_V_TVALID"] = 1 if len(inputs) > 0 else 0
+            sim.io[inp + "_V_V_TDATA"] = inputs[0] if len(inputs) > 0 else 0
+            if sim.io[inp + "_V_V_TREADY"] == 1 and sim.io[inp + "_V_V_TVALID"] == 1:
+                inputs = inputs[1:]
+            io_dict["inputs"][inp] = inputs
+
+        for outp in io_dict["outputs"]:
+            outputs = io_dict["outputs"][outp]
+            if sim.io[outp + "_V_V_TVALID"] == 1 and sim.io[outp + "_V_V_TREADY"] == 1:
+                outputs = outputs + [sim.io[outp + "_V_V_TDATA"]]
+                output_count += 1
+            io_dict["outputs"][outp] = outputs
+
+        sim.io.ap_clk = 1
+        sim.io.ap_clk = 0
+
+        total_cycle_count = total_cycle_count + 1
+
+        if output_count == old_output_count:
+            no_change_count = no_change_count + 1
+        else:
+            no_change_count = 0
+            old_output_count = output_count
+
+        # check if all expected output words received
+        if output_count == num_out_values:
+            output_done = True
+
+        # end sim on timeout
+        if no_change_count == liveness_threshold:
+            if trace_file != "":
+                sim.flush_vcd_trace()
+                sim.stop_vcd_trace()
+            raise Exception(
+                "Error in simulation! Takes too long to produce output. "
+                "Consider setting the LIVENESS_THRESHOLD env.var. to a "
+                "larger value."
+            )
+
+    if trace_file != "":
+        sim.flush_vcd_trace()
+        sim.stop_vcd_trace()
+
+    return total_cycle_count
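
As a minimal usage sketch for the helper above (the stream names "in0"/"out0" and the pre-built sim object are assumptions for illustration, not part of this patch):

    # sim: a PyVerilator object for a design whose streams follow the
    # <name>_V_V_{TDATA,TVALID,TREADY} convention used by rtlsim_multi_io
    io_dict = {
        "inputs": {"in0": [7, 12, 3]},   # words to drive onto in0_V_V_TDATA
        "outputs": {"out0": []},         # filled with words read from out0_V_V_TDATA
    }
    # assuming the design emits one output word per input word
    cycles = rtlsim_multi_io(sim, io_dict, num_out_values=3, trace_file="debug.vcd")
    # io_dict["outputs"]["out0"] now holds the produced words; cycles is the total latency
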
diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py
index d1da6934a5db07aabe41a9ca40b5de497b6460a1..5fa9b23bad5c5b67f65530c55f862f889c07b1ac 100644
--- a/tests/core/test_modelwrapper.py
+++ b/tests/core/test_modelwrapper.py
@@ -73,6 +73,11 @@ def test_modelwrapper():
     inp_layout = DataLayout.NCHW
     model.set_tensor_layout(inp_name, inp_layout)
     assert model.get_tensor_layout(inp_name) == inp_layout
+    inp_sparsity = model.get_tensor_sparsity(inp_name)
+    assert inp_sparsity is None
+    inp_sparsity = {"dw": {"kernel_shape": 3}}
+    model.set_tensor_sparsity(inp_name, inp_sparsity)
+    assert model.get_tensor_sparsity(inp_name) == inp_sparsity
     os.remove(export_onnx_path)
 
 
@@ -127,3 +132,45 @@ def test_modelwrapper_graph_order():
     assert model.get_node_index(Round_node) == 1
     assert model.get_node_index(Ceil_node) == 2
     assert model.get_node_index(Add_node) == 3
+
+
+def test_modelwrapper_detect_forks_n_joins():
+    # create small network with properties to be tested
+    Neg_node = onnx.helper.make_node("Neg", inputs=["in1"], outputs=["neg1"])
+    Round_node = onnx.helper.make_node("Round", inputs=["neg1"], outputs=["round1"])
+
+    Ceil_node = onnx.helper.make_node("Ceil", inputs=["neg1"], outputs=["ceil1"])
+    Add_node = onnx.helper.make_node(
+        "Add", inputs=["round1", "ceil1"], outputs=["out1"]
+    )
+
+    in1 = onnx.helper.make_tensor_value_info("in1", onnx.TensorProto.FLOAT, [4, 4])
+    out1 = onnx.helper.make_tensor_value_info("out1", onnx.TensorProto.FLOAT, [4, 4])
+
+    graph = onnx.helper.make_graph(
+        nodes=[Neg_node, Round_node, Ceil_node, Add_node],
+        name="simple_graph",
+        inputs=[in1],
+        outputs=[out1],
+        value_info=[
+            onnx.helper.make_tensor_value_info("neg1", onnx.TensorProto.FLOAT, [4, 4]),
+            onnx.helper.make_tensor_value_info(
+                "round1", onnx.TensorProto.FLOAT, [4, 4]
+            ),
+            onnx.helper.make_tensor_value_info("ceil1", onnx.TensorProto.FLOAT, [4, 4]),
+        ],
+    )
+
+    onnx_model = onnx.helper.make_model(graph, producer_name="simple-model")
+    model = ModelWrapper(onnx_model)
+
+    # test
+    assert model.is_fork_node(Neg_node)
+    assert not model.is_fork_node(Round_node)
+    assert not model.is_fork_node(Ceil_node)
+    assert not model.is_fork_node(Add_node)
+
+    assert not model.is_join_node(Neg_node)
+    assert not model.is_join_node(Round_node)
+    assert not model.is_join_node(Ceil_node)
+    assert model.is_join_node(Add_node)
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index e6d1fc4efd61c01654ee88638698215d23a82eb3..c3359dcc82650bf0e9e8a5bc5276f5ca770ee96c 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -76,7 +76,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
-target_clk_ns = 5
+target_clk_ns = 10
 mem_mode = "decoupled"
 
 
diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1.py
similarity index 98%
rename from tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
rename to tests/end2end/test_end2end_tfc_w1a1.py
index 1ba149687bb80a0f977115bd380a09f70eef23f1..15c1c41b006c6f87d79a0e7eb6a4458838de5fd2 100644
--- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -41,7 +41,6 @@ import onnx.numpy_helper as nph
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
 from finn.core.onnx_exec import execute_onnx
-from finn.core.throughput_test import throughput_test
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
@@ -332,9 +331,6 @@ def test_end2end_tfc_w1a1_run_on_pynq():
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
         assert np.isclose(y, y_golden).all()
-        child_model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
-        res = throughput_test(child_model)
-        assert res is not None
 
     except KeyError:
         pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee65326ec57fb7fa7fa0490a8980dbabb8efc13c
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -0,0 +1,106 @@
+from onnx import TensorProto, helper
+import numpy as np
+import pytest
+
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+@pytest.mark.parametrize("padding", [True, False])
+@pytest.mark.parametrize("kernel_size", [3, 5])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_convert_to_hls_conv_layer(padding, kernel_size):
+
+    assert (
+        kernel_size % 2 != 0
+    ), """test_convert_to_hls_conv_layer test only
+    supports odd kernel_size"""
+
+    np.random.seed(0)
+    idt = DataType.UINT4
+
+    in_feature_dim = 7
+    in_chn = 3
+
+    stages = 1  # just one convolution
+
+    out_feature_dim = (
+        in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages
+    )
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    if padding:
+        pad = kernel_size // 2
+        conv_config["pads"] = [pad, pad, pad, pad]
+    else:
+        conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+    value_info = [
+        helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
+    ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="conv_test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)
+            ],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("top_in", idt)
+    model.set_tensor_datatype("top_out", idt)
+    model.set_tensor_datatype("p1", DataType.UINT4)
+
+    model = model.transform(InferShapes())
+    model.set_initializer(
+        "p1", np.round(np.random.rand(*conv_param_shape).astype(np.float32) * 16)
+    )
+
+    model.set_tensor_datatype(model.graph.input[0].name, idt)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+
+    new_model = model.transform(LowerConvsToMatMul())
+    new_model = new_model.transform(to_hls.InferConvInpGen())
+
+    new_model = new_model.transform(PrepareCppSim())
+    new_model = new_model.transform(CompileCppSim())
+    new_model = new_model.transform(SetExecMode("cppsim"))
+
+    x = gen_finn_dt_tensor(idt, input_shape)
+    inp_dict = {model.graph.input[0].name: x}
+    assert oxe.compare_execution(model, new_model, inp_dict)
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index e03090f0581eebf68cac7baffb6888a6992df68d..48803c9614f53a3a149c6eaac4289d10086513a5 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -39,6 +39,7 @@ from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.streamline import Streamline
 from finn.util.test import get_test_model_trained
 from finn.transformation.double_to_single_float import DoubleToSingleFloat
@@ -54,7 +55,9 @@ export_onnx_path_cnv = "test_output_cnv.onnx"
 
 
 @pytest.mark.vivado
-def test_convert_to_hls_layers_cnv_w1a1():
+# Standalone or fused thresholding-based activation
+@pytest.mark.parametrize("fused_activation", [True, False])
+def test_convert_to_hls_layers_cnv_w1a1(fused_activation):
     cnv = get_test_model_trained("CNV", 1, 1)
     bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv)
     model = ModelWrapper(export_onnx_path_cnv)
@@ -69,6 +72,7 @@ def test_convert_to_hls_layers_cnv_w1a1():
     model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
     model = model.transform(ConvertBipolarMatMulToXnorPopcount())
     model = model.transform(Streamline())
+    model = model.transform(InferDataLayouts())
     # model.save("golden.onnx")
     # load one of the test vectors
     fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
@@ -80,6 +84,10 @@ def test_convert_to_hls_layers_cnv_w1a1():
     expected_ctx = oxe.execute_onnx(model, input_dict, True)
     expected = expected_ctx[model.graph.output[0].name]
 
+    # if we infer thresholding first, all MultiThresholds get converted to HLS
+    # subsequently, the FC inference will generate passthrough MVAUs
+    if not fused_activation:
+        model = model.transform(to_hls.InferThresholdingLayer())
     model = model.transform(to_hls.InferBinaryStreamingFCLayer())
     model = model.transform(to_hls.InferQuantizedStreamingFCLayer())
     for node in model.graph.node:
@@ -102,7 +110,12 @@ def test_convert_to_hls_layers_cnv_w1a1():
     model = model.transform(to_hls.InferStreamingMaxPool())
     # check topology status
     finn_nodes = model.get_finn_nodes()
-    assert len(finn_nodes) == 18
+    if fused_activation:
+        assert len(finn_nodes) == 18
+    else:
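+        # 8 standalone Thresholding_Batch nodes are created on top of the 18 nodes of the fused case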
+        assert len(finn_nodes) == 26
+        thr_nodes = model.get_nodes_by_op_type("Thresholding_Batch")
+        assert len(thr_nodes) == 8
     non_finn_nodes = model.get_non_finn_nodes()
     assert len(non_finn_nodes) == 4
     exp_non_finn_nodes = ["Transpose", "Reshape", "Mul", "Add"]
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 5051bf34dc690daf8b6186859d3717cc8e217eee..b5fc85caf274edc9e7afc52df962862fa8a99ba3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -78,7 +78,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
 
 
 def make_single_slidingwindow_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0
 ):
     odt = idt
     inp = helper.make_tensor_value_info(
@@ -102,6 +102,7 @@ def make_single_slidingwindow_modelwrapper(
         Stride=stride,
         inputDataType=idt.name,
         outputDataType=odt.name,
+        depthwise=dw,
     )
     graph = helper.make_graph(
         nodes=[SlidingWindow_node],
@@ -126,25 +127,29 @@ def prepare_inputs(input_tensor):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # kernel size
-@pytest.mark.parametrize("k", [2, 4])
+@pytest.mark.parametrize("k", [2, 3])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [4, 6, 8])
+@pytest.mark.parametrize("ifm_dim", [6, 8])
 # input channels
-@pytest.mark.parametrize("ifm_ch", [2, 4])  # , 2, 3, 4])
+@pytest.mark.parametrize("ifm_ch", [2, 4])
 # Stride
 @pytest.mark.parametrize("stride", [1, 2])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # input channel parallelism ("SIMD")
 @pytest.mark.parametrize("simd", [1, 2])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd):
+def test_fpgadataflow_slidingwindow(
+    idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw
+):
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
 
     x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
     model = make_single_slidingwindow_modelwrapper(
-        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw
     )
 
     if exec_mode == "cppsim":
@@ -168,6 +173,12 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode,
         k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
     )
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
-    # if idt == DataType.BIPOLAR:
-    #     y_expected = 2 * y_expected - 1
-    assert (y_produced == y_expected).all()
+    if dw == 0:
+        assert (y_produced == y_expected).all()
+    else:
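+        # in depthwise mode the generator emits each window in (ch/simd, k*k, simd)
+        # order instead of (k*k, ch), so rearrange the im2col reference accordingly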
+        y_expected = y_expected.reshape(
+            1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd
+        )
+        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+        y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k)
+        assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fb84be59333ef0e696204c9064fcf77e35b5d9b
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+
+
+def make_dupstreams_modelwrapper(ch, pe, idim, idt):
+    shape = [1, idim, idim, ch]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp0 = helper.make_tensor_value_info("outp0", TensorProto.FLOAT, shape)
+    outp1 = helper.make_tensor_value_info("outp1", TensorProto.FLOAT, shape)
+
+    dupstrm_node = helper.make_node(
+        "DuplicateStreams_Batch",
+        ["inp"],
+        ["outp0", "outp1"],
+        domain="finn",
+        backend="fpgadataflow",
+        NumChannels=ch,
+        PE=pe,
+        inputDataType=idt.name,
+        numInputVectors=[1, idim, idim],
+    )
+    graph = helper.make_graph(
+        nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=[outp0, outp1]
+    )
+
+    model = helper.make_model(graph, producer_name="dupstreams-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+
+    return model
+
+
+def prepare_inputs(input_tensor, idt):
+    return {"inp": input_tensor}
+
+
+# data type
+@pytest.mark.parametrize("idt", [DataType.INT4, DataType.UINT16])
+# channels
+@pytest.mark.parametrize("ch", [64])
+# folding factor: -1 selects maximum folding (PE=1)
+@pytest.mark.parametrize("fold", [-1, 2, 1])
+# image dimension
+@pytest.mark.parametrize("imdim", [7])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode):
+    if fold == -1:
+        pe = 1
+    else:
+        pe = ch // fold
+    assert ch % pe == 0
+
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch))
+
+    model = make_dupstreams_modelwrapper(ch, pe, imdim, idt)
+
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # prepare input data and execute
+    input_dict = prepare_inputs(x, idt)
+    output_dict = oxe.execute_onnx(model, input_dict)
+    y0 = output_dict["outp0"]
+    y1 = output_dict["outp1"]
+    expected_y = x
+
+    assert (y0 == expected_y).all(), exec_mode + " failed"
+    assert (y1 == expected_y).all(), exec_mode + " failed"
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d6390b2673e5d2c0e72748183ac04ed222d078e
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -0,0 +1,121 @@
+import pytest
+import os
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.core.onnx_exec as oxe
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+
+from finn.util.basic import pynq_part_map
+
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
+
+
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
+    assert pad_style == 2, "only pad_style == 2 supported in hlslib"
+    assert padding > 0, "Output dim should be greater than input dim"
+    odim = idim + padding
+
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, idim, idim, num_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, odim, odim, num_ch]
+    )
+
+    FMPadding = helper.make_node(
+        "FMPadding_Batch",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        ImgDim=idim,
+        Padding=padding,
+        NumChannels=num_ch,
+        inputDataType=str(idt.name),
+        PaddingStyle=pad_style,
+        numInputVectors=1,
+    )
+
+    graph = helper.make_graph(
+        nodes=[FMPadding], name="fmpadding_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="fmpadding-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+
+    return model
+
+
+# input image dimension
+@pytest.mark.parametrize("idim", [8, 16])
+# number of rows and number of cols to add
+@pytest.mark.parametrize("pad", [2, 3])
+# number of channels
+@pytest.mark.parametrize("num_ch", [1, 2])
+# PaddingStyle: selects behavior when (odim-idim)%2 != 0
+@pytest.mark.parametrize("pad_style", [2])
+# FINN input datatype
+@pytest.mark.parametrize("idt", [DataType.INT2, DataType.INT4])
+# execution mode
+@pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
+
+    # generate input data
+    x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
+    input_dict = {"inp": x}
+    odim = idim + pad
+
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style)
+    model = model.transform(InferShapes())
+    model = model.transform(SetExecMode(mode))
+    model = model.transform(GiveUniqueNodeNames())
+    if mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+    elif mode == "rtlsim":
+        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    expected_oshape = (1, odim, odim, num_ch)
+    assert y_produced.shape == expected_oshape
+
+    # calculate reference
+    # calculate correct pad according to parameters
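+    # e.g. pad_style=2 with pad=3 puts the extra row/column at the top/left: pad_up=pad_left=2, pad_down=pad_right=1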
+    if pad_style == 2:
+        if pad % 2 == 0:
+            pad_up = pad // 2
+            pad_left = pad // 2
+        else:
+            pad_up = pad // 2 + 1
+            pad_left = pad // 2 + 1
+    else:
+        pad_up = pad // 2
+        pad_left = pad // 2
+
+    pad_down = pad - pad_up
+    pad_right = pad - pad_left
+
+    y_expected = np.pad(
+        x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
+    )
+
+    assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
new file mode 100644
index 0000000000000000000000000000000000000000..50b990f13494f22e985406791445b406e9946147
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.multithreshold import multithreshold
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+
+
+def make_single_thresholding_modelwrapper(T, pe, idt, odt):
+    NumChannels = T.shape[0]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, NumChannels])
+
+    node_inp_list = ["inp", "thresh"]
+
+    Thresholding_node = helper.make_node(
+        "Thresholding_Batch",
+        node_inp_list,
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        NumChannels=NumChannels,
+        PE=pe,
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+    )
+    graph = helper.make_graph(
+        nodes=[Thresholding_node],
+        name="thresholding_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    model.set_tensor_datatype("thresh", idt)
+    model.set_initializer("thresh", T)
+    return model
+
+
+# activation (output) datatype
+@pytest.mark.parametrize("act", [DataType.INT4, DataType.BIPOLAR])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT16, DataType.UINT16])
+# folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1, 2, 1])
+# number of input features
+@pytest.mark.parametrize("ich", [16])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
+    if nf == -1:
+        nf = ich
+    pe = ich // nf
+    assert ich % pe == 0
+
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, ich))
+
+    odt = act
+    n_steps = act.get_num_possible_values() - 1
+    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
+    # provide non-decreasing thresholds
+    T = np.sort(T, axis=1)
+
+    model = make_single_thresholding_modelwrapper(T, pe, idt, odt)
+
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # package input data as dictionary
+    input_dict = {"inp": x}
+
+    y = multithreshold(x, T)
+    if act == DataType.BIPOLAR:
+        # binary to bipolar
+        y = 2 * y - 1
+    else:
+        # signed offset
+        y += act.min()
+
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+
+    y_produced = y_produced.reshape(y_expected.shape)
+
+    assert (y_produced == y_expected).all(), "cppsim failed"
+
+    if exec_mode == "rtlsim":
+        hls_synt_res_est = model.analysis(hls_synth_res_estimation)
+        assert "Thresholding_Batch_0" in hls_synt_res_est
diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py
new file mode 100644
index 0000000000000000000000000000000000000000..66a93a190061e0142637be19bb2ea841d192745a
--- /dev/null
+++ b/tests/pynq/test_pynq_performance_end2end.py
@@ -0,0 +1,65 @@
+import os
+
+import pytest
+import numpy as np
+from scipy.stats import linregress
+import warnings
+from finn.util.test import load_test_checkpoint_or_skip
+from finn.core.throughput_test import throughput_test
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+
+
+@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"])
+@pytest.mark.slow
+def test_pynq_performance_end2end(end2end_example):
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_%s_pynq_deploy.onnx" % end2end_example
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        board = os.environ["PYNQ_BOARD"]  # NOQA
+        if ip == "" or board == "":
+            pytest.skip("PYNQ board or IP address not specified")
+        ret = dict()
+        # try a range of batch sizes, some may fail due to insufficient DMA
+        # buffers
+        bsize_range_in = [2 ** i for i in range(16)]
+        bsize_range = []
+        for bsize in bsize_range_in:
+            res = throughput_test(model, bsize)
+            if res is not None:
+                ret[bsize] = res
+                bsize_range.append(bsize)
+            else:
+                # assume we reached largest possible N
+                break
+
+        y = [ret[key]["runtime[ms]"] for key in bsize_range]
+        lrret = linregress(bsize_range, y)
+        ret_str = ""
+        ret_str += "\n" + "%s Throughput Test Results" % end2end_example
+        ret_str += "\n" + "-----------------------------"
+        ret_str += "\n" + "From linear regression:"
+        ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept
+        ret_str += "\n" + "Time per sample: %f ms" % lrret.slope
+        ret_str += "\n" + "Raw data:"
+
+        ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]"
+        )
+        for k in bsize_range:
+            v = ret[k]
+            ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+                k,
+                np.round(v["runtime[ms]"], 4),
+                v["fclk[mhz]"],
+                np.round(v["throughput[images/s]"], 2),
+                np.round(v["DRAM_in_bandwidth[Mb/s]"], 2),
+                np.round(v["DRAM_out_bandwidth[Mb/s]"], 2),
+            )
+        ret_str += "\n" + "-----------------------------"
+        warnings.warn(ret_str)
+
+    except KeyError:
+        pytest.skip("PYNQ board or IP address not specified")
diff --git a/tests/pynq/test_pynq_performance_fifo.py b/tests/pynq/test_pynq_performance_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4542473c4b58d3baa62f4123fd0f2f76954d95
--- /dev/null
+++ b/tests/pynq/test_pynq_performance_fifo.py
@@ -0,0 +1,128 @@
+import os
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import pynq_part_map, pynq_native_port_width
+from finn.core.throughput_test import throughput_test
+from scipy.stats import linregress
+import warnings
+
+
+def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape)
+
+    FIFO_node = helper.make_node(
+        "StreamingFIFO",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        depth=Depth,
+        folded_shape=fld_shape,
+        dataType=str(finn_dtype.name),
+    )
+
+    graph = helper.make_graph(
+        nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="fifo-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", finn_dtype)
+    model.set_tensor_datatype("outp", finn_dtype)
+
+    return model
+
+
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_pynq_performance_fifo():
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        board = os.environ["PYNQ_BOARD"]  # NOQA
+        if ip == "" or board == "":
+            pytest.skip("PYNQ board or IP address not specified")
+        fifo_width = pynq_native_port_width[board]
+        shape = (1, fifo_width)
+        folded_shape = (1, 1, fifo_width)
+        depth = 16
+        clk_ns = 10
+        dtype = DataType.BIPOLAR
+        fpga_part = pynq_part_map[board]
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+
+        model = make_single_fifo_modelwrapper(shape, depth, folded_shape, dtype)
+        model = model.transform(InsertTLastMarker())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP(fpga_part, clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(rvp.ReplaceVerilogRelPaths())
+        model = model.transform(CreateStitchedIP(fpga_part, clk_ns))
+        model = model.transform(MakePYNQProject(board))
+        model = model.transform(SynthPYNQProject())
+        model = model.transform(MakePYNQDriver())
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+
+        ret = dict()
+        # try a range of batch sizes, some may fail due to insufficient DMA
+        # buffers
+        bsize_range_in = [2 ** i for i in range(20)]
+        bsize_range = []
+        for bsize in bsize_range_in:
+            res = throughput_test(model, bsize)
+            if res is not None:
+                ret[bsize] = res
+                bsize_range.append(bsize)
+            else:
+                # assume we reached largest possible N
+                break
+
+        y = [ret[key]["runtime[ms]"] for key in bsize_range]
+        lrret = linregress(bsize_range, y)
+        ret_str = ""
+        ret_str += "\n" + "FIFO Throughput Test Results"
+        ret_str += "\n" + "-----------------------------"
+        ret_str += "\n" + "From linear regression:"
+        ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept
+        ret_str += "\n" + "Time per sample: %f ms" % lrret.slope
+        ret_str += "\n" + "Raw data:"
+
+        ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]"
+        )
+        for k in bsize_range:
+            v = ret[k]
+            ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+                k,
+                np.round(v["runtime[ms]"], 4),
+                v["fclk[mhz]"],
+                np.round(v["throughput[images/s]"], 2),
+                np.round(v["DRAM_in_bandwidth[Mb/s]"], 2),
+                np.round(v["DRAM_out_bandwidth[Mb/s]"], 2),
+            )
+        ret_str += "\n" + "-----------------------------"
+        warnings.warn(ret_str)
+
+    except KeyError:
+        pytest.skip("PYNQ board or IP address not specified")
diff --git a/tests/transformation/test_collapse_repeated_op.py b/tests/transformation/test_collapse_repeated_op.py
index 01d932ece0be4b0beb7ad6094284ec3efb1e525e..b74d868f9b921c35ff9f596c811583f45f761374 100644
--- a/tests/transformation/test_collapse_repeated_op.py
+++ b/tests/transformation/test_collapse_repeated_op.py
@@ -34,6 +34,7 @@ import finn.core.onnx_exec as ox
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import CollapseRepeatedAdd, CollapseRepeatedMul
+import pytest
 
 
 def test_collapse_repeated_op():
@@ -67,3 +68,60 @@ def test_collapse_repeated_op():
     new_model = new_model.transform(CollapseRepeatedMul())
     inp_dict = {"top_in": np.asarray([-1.0, 1.0], dtype=np.float32)}
     assert ox.compare_execution(model, new_model, inp_dict)
+    assert len(new_model.graph.node) == 2
+    assert new_model.graph.node[0].op_type == "Add"
+    assert new_model.graph.node[1].op_type == "Mul"
+
+
+@pytest.mark.parametrize(
+    "test_args", [("Add", CollapseRepeatedAdd()), ("Mul", CollapseRepeatedMul())],
+)
+def test_collapse_repeated_only_if_linear(test_args):
+    scalar_op = test_args[0]
+    transf_fxn = test_args[1]
+
+    input_shape = [4, 4]
+    output_shape = input_shape
+
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, [1])]
+
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                oh.make_node(scalar_op, ["top_in", "p2"], ["t1"]),
+                oh.make_node(scalar_op, ["t1", "p1"], ["t2"]),
+                oh.make_node(scalar_op, ["t2", "p3"], ["t3"]),
+                oh.make_node(scalar_op, ["t2", "p4"], ["t4"]),
+                oh.make_node(scalar_op, ["t3", "t4"], ["t5"]),
+                oh.make_node(scalar_op, ["t5", "p5"], ["top_out"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    model.set_initializer("p1", *np.random.rand(1).astype(np.float32))
+    model.set_initializer("p2", *np.random.rand(1).astype(np.float32))
+    model.set_initializer("p3", *np.random.rand(1).astype(np.float32))
+    model.set_initializer("p4", *np.random.rand(1).astype(np.float32))
+    model.set_initializer("p5", *np.random.rand(1).astype(np.float32))
+
+    # Transform
+    new_model = model.transform(transf_fxn)
+
+    # Test
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+    assert ox.compare_execution(model, new_model, inp_dict)
+    assert len(new_model.graph.node) == 5
diff --git a/tests/transformation/test_general_transformation.py b/tests/transformation/test_general_transformation.py
index 33b6041a170f3c0de8f741ef3ecb28682f6429ea..153af378eb3e07d5824f114fd194730048fb4953 100644
--- a/tests/transformation/test_general_transformation.py
+++ b/tests/transformation/test_general_transformation.py
@@ -31,6 +31,12 @@ from pkgutil import get_data
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.general import GiveUniqueNodeNames
 
+import numpy as np
+import onnx
+import finn.core.onnx_exec as oxe
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.general import GiveUniqueParameterTensors
+
 
 def test_give_unique_node_names():
     raw_m = get_data("finn", "data/onnx/mnist-conv/model.onnx")
@@ -39,3 +45,76 @@ def test_give_unique_node_names():
     assert model.graph.node[0].name == "Reshape_0"
     assert model.graph.node[1].name == "Conv_0"
     assert model.graph.node[11].name == "Add_2"
+
+
+def test_give_unique_parameter_tensors():
+
+    # Create model
+    input_shape = [4, 4]
+    in1 = onnx.helper.make_tensor_value_info("in1", onnx.TensorProto.FLOAT, input_shape)
+    out1 = onnx.helper.make_tensor_value_info(
+        "out1", onnx.TensorProto.FLOAT, input_shape
+    )
+
+    graph_nodes = []
+    graph_nodes += [
+        onnx.helper.make_node("Add", inputs=["in1", "param1"], outputs=["t1"])
+    ]
+
+    graph_nodes += [
+        onnx.helper.make_node("Sum", inputs=["t1", "param1", "param1"], outputs=["t2"])
+    ]
+
+    graph_nodes += [
+        onnx.helper.make_node("Sum", inputs=["t2", "param2", "param1"], outputs=["t3"])
+    ]
+
+    graph_nodes += [
+        onnx.helper.make_node("Add", inputs=["t3", "param1"], outputs=["out1"])
+    ]
+
+    onnx_graph = onnx.helper.make_graph(
+        nodes=graph_nodes, name="simple_graph", inputs=[in1], outputs=[out1],
+    )
+
+    onnx_model = onnx.helper.make_model(onnx_graph, producer_name="simple-model")
+    model = ModelWrapper(onnx_model)
+
+    # Set param values
+    np.random.seed(0)
+    param1 = np.random.rand(*input_shape).astype(np.float32)
+    param2 = np.random.rand(*input_shape).astype(np.float32)
+    model.set_initializer("param1", param1)
+    model.set_initializer("param2", param2)
+    model = model.transform(InferShapes())
+
+    # Apply transformation
+    new_model = model.transform(GiveUniqueParameterTensors())
+    new_model = new_model.transform(InferShapes())
+
+    # Test
+    # check that the transformation does not change the model's behavior
+    input_tensor = np.random.rand(*input_shape).astype(np.float32)
+    input_dict = {"in1": input_tensor}
+
+    # run original
+    expected_context = oxe.execute_onnx(model, input_dict)
+    expected_output = expected_context[model.graph.output[0].name]
+
+    # run modified
+    produced_context = oxe.execute_onnx(new_model, input_dict)
+    produced_output = produced_context[new_model.graph.output[0].name]
+
+    assert np.isclose(
+        expected_output, produced_output, atol=1e-8
+    ).all(), " GiveUniqueParameterTensors() transform breaks the model"
+
+    # check that parameter tensors are no longer shared between nodes
+    param_set = set()
+    param_cnt = 0
+    for n in new_model.graph.node:
+        for i in range(1, len(n.input)):
+            param_set |= {n.input[i]}
+            param_cnt += 1
+
+    assert len(param_set) == param_cnt, "Some parameter tensors are still reused"
diff --git a/tests/transformation/test_move_add_past_mul.py b/tests/transformation/test_move_add_past_mul.py
index a0516d6fb2ff985fc112185ce99ad8facd841caf..163b9d310a5f12bd0b854f9aa46f53a549bf109e 100644
--- a/tests/transformation/test_move_add_past_mul.py
+++ b/tests/transformation/test_move_add_past_mul.py
@@ -60,6 +60,9 @@ def test_move_add_past_mul_single():
     new_model = model.transform(MoveAddPastMul())
     inp_dict = {"top_in": np.asarray([-1.0, 1.0], dtype=np.float32)}
     assert ox.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "Mul"
+    assert new_model.graph.node[1].op_type == "Add"
+    assert new_model.graph.node[0].output[0] == new_model.graph.node[1].input[0]
 
 
 def test_move_add_past_mul_multi():
@@ -92,3 +95,50 @@ def test_move_add_past_mul_multi():
     new_model = model.transform(MoveAddPastMul())
     inp_dict = {"top_in": np.asarray([-1.0, 1.0], dtype=np.float32)}
     assert ox.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "Mul"
+    assert new_model.graph.node[1].op_type == "Mul"
+    assert new_model.graph.node[2].op_type == "Add"
+    assert new_model.graph.node[3].op_type == "Add"
+    for i in range(len(new_model.graph.node) - 1):
+        assert new_model.graph.node[i].output[0] == new_model.graph.node[i + 1].input[0]
+
+
+def test_move_add_past_mul_only_if_linear():
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [2])
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
+
+    value_info = [oh.make_tensor_value_info("add1_param", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("mul1_param", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("mul2_param", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("mul3_param", TensorProto.FLOAT, [1])]
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                oh.make_node("Add", ["top_in", "add1_param"], ["t1"]),
+                oh.make_node("Mul", ["t1", "mul1_param"], ["fork"]),
+                oh.make_node("Mul", ["fork", "mul2_param"], ["t3"]),
+                oh.make_node("Add", ["t3", "fork"], ["t4"]),
+                oh.make_node("Mul", ["t4", "mul3_param"], ["top_out"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    model.set_initializer("add1_param", np.random.rand(2).astype(np.float32))
+    model.set_initializer("mul1_param", np.random.rand(2).astype(np.float32))
+    model.set_initializer("mul2_param", np.random.rand(2).astype(np.float32))
+    model.set_initializer("mul3_param", np.random.rand(2).astype(np.float32))
+    new_model = model.transform(MoveAddPastMul())
+    inp_dict = {"top_in": np.random.rand(2).astype(np.float32)}
+    assert ox.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "Mul"
+    assert new_model.graph.node[1].op_type == "Add"
+    assert new_model.graph.node[2].op_type == "Mul"
+    assert new_model.graph.node[3].op_type == "Add"
+    assert new_model.graph.node[4].op_type == "Mul"
diff --git a/tests/transformation/test_move_past_fork.py b/tests/transformation/test_move_past_fork.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3d37bd60c9e2580ca4499daafa8693f39fec810
--- /dev/null
+++ b/tests/transformation/test_move_past_fork.py
@@ -0,0 +1,79 @@
+from onnx import TensorProto, helper
+import numpy as np
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.streamline.reorder import MoveLinearPastFork
+from finn.transformation.infer_shapes import InferShapes
+
+import pytest
+
+
+@pytest.mark.parametrize("ch", [64, 1])
+# ifmdim
+@pytest.mark.parametrize("ifmdim", [-1, 7])
+def test_move_past_fork(ch, ifmdim):
+    # generate test vectors of correct shape
+    if ifmdim == -1:
+        input_shape = (1, ch)
+    else:
+        input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
+
+    num_of_params = 8
+    value_info = []
+    for i in range(num_of_params):
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    add_1_to_move = helper.make_node("Add", ["top_in", "p0"], ["fork1"])
+    mul_1_to_move = helper.make_node("Mul", ["t5", "p4"], ["fork2"])
+    add_2_to_move = helper.make_node("Add", ["fork2", "p5"], ["t6"])
+    mul_1_not_to_move = helper.make_node("Mul", ["t8", "p7"], ["fork3"])
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                # fork1
+                add_1_to_move,
+                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
+                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
+                helper.make_node("Add", ["t2", "t3"], ["t4"]),
+                helper.make_node("Add", ["t4", "p3"], ["t5"]),
+                # fork2
+                mul_1_to_move,
+                add_2_to_move,
+                helper.make_node("Add", ["fork2", "p6"], ["t7"]),
+                helper.make_node("Add", ["t6", "t7"], ["t8"]),
+                # fork3: branches are empty (the forked tensor feeds a join directly), nothing to move
+                mul_1_not_to_move,
+                helper.make_node("Add", ["fork3", "fork3"], ["top_out"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    for i in range(num_of_params):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    # Transform
+    new_model = model.transform(MoveLinearPastFork())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    # Test
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert not new_model.is_fork_node(add_1_to_move)
+    assert not new_model.is_fork_node(mul_1_to_move)
+    assert not new_model.is_fork_node(add_2_to_move)
+    assert new_model.is_fork_node(mul_1_not_to_move)
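+    # the three moved nodes are each duplicated into both fork branches: 11 - 3 + 6 = 14 nodes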
+    assert len(new_model.graph.node) == 14
diff --git a/tests/transformation/test_move_scalar_past_conv.py b/tests/transformation/test_move_scalar_past_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f50642d2b9d1583030630cb4927c2b86667e71a
--- /dev/null
+++ b/tests/transformation/test_move_scalar_past_conv.py
@@ -0,0 +1,166 @@
+import numpy as np
+import onnx.helper as oh
+import pytest
+from onnx import TensorProto
+
+import finn.core.onnx_exec as ox
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline import (
+    MoveScalarAddPastConv,
+    MoveScalarMulPastConv,
+)
+
+
+@pytest.mark.parametrize("padding", [False, True])
+@pytest.mark.parametrize(
+    "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+)
+def test_move_scalar_past_conv(test_args, padding):
+    scalar_op = test_args[0]
+    transf_fxn = test_args[1]
+
+    in_feature_dim = 7
+    in_chn = 3
+
+    stages = 2
+    kernel_size = 3
+
+    out_feature_dim = (
+        in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages
+    )
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    if padding:
+        conv_config["pads"] = [1, 1, 1, 1]
+    else:
+        conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)]
+    value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, conv_param_shape)]
+
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                oh.make_node(scalar_op, ["top_in", "p1"], ["t1"]),
+                oh.make_node("Conv", ["t1", "p2"], ["t2"], **conv_config),
+                oh.make_node("Conv", ["t2", "p3"], ["top_out"], **conv_config),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    model.set_initializer("p1", *np.random.rand(1).astype(np.float32))
+    model.set_initializer("p2", np.random.rand(*conv_param_shape).astype(np.float32))
+    model.set_initializer("p3", np.random.rand(*conv_param_shape).astype(np.float32))
+    new_model = model.transform(transf_fxn)
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    assert ox.compare_execution(model, new_model, inp_dict)
+    if scalar_op == "Add":
+        if padding:
+            assert new_model.graph.node[0].op_type == scalar_op
+            assert new_model.graph.node[1].op_type == "Conv"
+            assert new_model.graph.node[2].op_type == "Conv"
+        else:
+            assert new_model.graph.node[0].op_type == "Conv"
+            assert new_model.graph.node[1].op_type == scalar_op
+            assert new_model.graph.node[2].op_type == "Conv"
+    else:
+        assert new_model.graph.node[0].op_type == "Conv"
+        assert new_model.graph.node[1].op_type == "Conv"
+        assert new_model.graph.node[2].op_type == scalar_op
+
+
+@pytest.mark.parametrize(
+    "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+)
+def test_move_scalar_past_conv_only_if_linear(test_args):
+    scalar_op = test_args[0]
+    transf_fxn = test_args[1]
+
+    in_feature_dim = 7
+    in_chn = 1
+    padding = False
+    stages = 3
+    kernel_size = 3
+
+    out_feature_dim = (
+        in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages
+    )
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1])]
+    value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)]
+    value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, conv_param_shape)]
+    value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, conv_param_shape)]
+    value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, conv_param_shape)]
+
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                oh.make_node("Conv", ["top_in", "p2"], ["t1"], **conv_config),
+                oh.make_node(scalar_op, ["t1", "p1"], ["t2"]),
+                oh.make_node("Conv", ["t2", "p3"], ["t3"], **conv_config),
+                oh.make_node("Conv", ["t2", "p4"], ["t4"], **conv_config),
+                oh.make_node(scalar_op, ["t3", "t4"], ["t5"]),
+                oh.make_node("Conv", ["t5", "p5"], ["top_out"], **conv_config),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    model.set_initializer("p1", *np.random.rand(1).astype(np.float32))
+    model.set_initializer("p2", np.random.rand(*conv_param_shape).astype(np.float32))
+    model.set_initializer("p3", np.random.rand(*conv_param_shape).astype(np.float32))
+    model.set_initializer("p4", np.random.rand(*conv_param_shape).astype(np.float32))
+    model.set_initializer("p5", np.random.rand(*conv_param_shape).astype(np.float32))
+    new_model = model.transform(transf_fxn)
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    assert ox.compare_execution(model, new_model, inp_dict)
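+    # t2 feeds two Conv nodes (a fork), so the scalar op producing it must not
+    # be moved; the node order is expected to stay unchanged.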
+    assert new_model.graph.node[0].op_type == "Conv"
+    assert new_model.graph.node[1].op_type == scalar_op
+    assert new_model.graph.node[2].op_type == "Conv"
+    assert new_model.graph.node[3].op_type == "Conv"
+    assert new_model.graph.node[4].op_type == scalar_op
+    assert new_model.graph.node[5].op_type == "Conv"
diff --git a/tests/transformation/test_move_scalar_past_matmul.py b/tests/transformation/test_move_scalar_past_matmul.py
index 896527e82d8cfa869cb979d1102904c70703a14c..e432dbf4ec1a38551609e5914e2d44968a020908 100644
--- a/tests/transformation/test_move_scalar_past_matmul.py
+++ b/tests/transformation/test_move_scalar_past_matmul.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
+import pytest
 import onnx.helper as oh
 from onnx import TensorProto
 
@@ -99,3 +100,56 @@ def test_move_scalar_add_past_matmul():
     assert new_model.graph.node[0].op_type == "MatMul"
     assert new_model.graph.node[1].op_type == "Add"
     assert new_model.graph.node[0].output[0] == new_model.graph.node[1].input[0]
+
+
+@pytest.mark.parametrize(
+    "test_args",
+    [("Add", MoveScalarAddPastMatMul()), ("Mul", MoveScalarMulPastMatMul())],
+)
+def test_move_scalar_past_matmul_only_if_linear(test_args):
+    scalar_op = test_args[0]
+    transf_fxn = test_args[1]
+    input_shape = [1, 2]
+    matmul_shape = [2, 2]
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
+
+    p1 = oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1, 1])
+    p2 = oh.make_tensor_value_info("p2", TensorProto.FLOAT, matmul_shape)
+    p3 = oh.make_tensor_value_info("p3", TensorProto.FLOAT, matmul_shape)
+    p4 = oh.make_tensor_value_info("p4", TensorProto.FLOAT, matmul_shape)
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=[p1, p2, p3, p4],
+            nodes=[
+                oh.make_node(scalar_op, ["top_in", "p1"], ["t1"]),
+                oh.make_node("MatMul", ["t1", "p2"], ["fork"]),
+                oh.make_node("MatMul", ["fork", "p3"], ["t3"]),
+                oh.make_node(scalar_op, ["t3", "fork"], ["t4"]),
+                oh.make_node("MatMul", ["t4", "p4"], ["top_out"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    model.set_initializer("p1", np.random.rand(1, 1).astype(np.float32))
+    model.set_initializer("p2", np.random.rand(*matmul_shape).astype(np.float32))
+    model.set_initializer("p3", np.random.rand(*matmul_shape).astype(np.float32))
+    model.set_initializer("p4", np.random.rand(*matmul_shape).astype(np.float32))
+
+    # Transform
+    new_model = model.transform(transf_fxn)
+
+    # Test
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+    assert ox.compare_execution(model, new_model, inp_dict)
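+    # Only the scalar op at the graph input is moved past its MatMul; the
+    # second scalar op consumes the forking "fork" tensor and is left in place.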
+    assert new_model.graph.node[0].op_type == "MatMul"
+    assert new_model.graph.node[1].op_type == scalar_op
+    assert new_model.graph.node[2].op_type == "MatMul"
+    assert new_model.graph.node[3].op_type == scalar_op
+    assert new_model.graph.node[4].op_type == "MatMul"
diff --git a/tests/transformation/test_sort_graph.py b/tests/transformation/test_sort_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..05842504c13b144bb34e8084fb12b5086fa84115
--- /dev/null
+++ b/tests/transformation/test_sort_graph.py
@@ -0,0 +1,150 @@
+from onnx import TensorProto, helper
+import numpy as np
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.general import SortGraph
+from finn.transformation.infer_shapes import InferShapes
+import pytest
+import finn.analysis.topology as ta
+
+
+def make_randomly_sorted_linear_model(num_of_nodes, seed=None):
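+    # Build a linear chain of Add nodes t0 -> t1 -> ... -> t<num_of_nodes> and
+    # shuffle the node list so it is no longer in topological order.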
+    if seed is not None:
+        np.random.seed(seed)
+
+    ch = 2
+    ifmdim = 16
+    input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("t0", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info(
+        "t" + str(num_of_nodes), TensorProto.FLOAT, input_shape
+    )
+
+    value_info = []
+    nodes = []
+    for i in range(num_of_nodes):
+        nodes += [
+            helper.make_node("Add", ["t" + str(i), "p" + str(i)], ["t" + str(i + 1)])
+        ]
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    nodes = np.random.permutation(nodes)
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=nodes,
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    for i in range(num_of_nodes):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    return model
+
+
+@pytest.mark.parametrize("num_of_nodes", [64])
+def test_sort_linear_graph(num_of_nodes):
+    model = make_randomly_sorted_linear_model(num_of_nodes, seed=0)
+    new_model = model.transform(SortGraph())
+
+    # Test
+    ret = new_model.analysis(ta.nodes_topologically_sorted)
+    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
+
+
+def test_sort_nonlinear_graph():
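+    # Graph with forks and joins (fork1..fork3) whose node list is scrambled;
+    # SortGraph must recover a valid topological ordering.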
+    ch = 2
+    ifmdim = 16
+    input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
+
+    num_of_params = 8
+    value_info = []
+    for i in range(num_of_params):
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                # node list deliberately given out of topological order
+                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
+                helper.make_node("Add", ["t4", "p3"], ["t5"]),
+                helper.make_node("Add", ["t2", "t3"], ["t4"]),
+                helper.make_node("Add", ["t6", "t7"], ["t8"]),
+                helper.make_node("Add", ["fork3", "fork3"], ["top_out"]),
+                helper.make_node("Mul", ["t5", "p4"], ["fork2"]),
+                helper.make_node("Add", ["top_in", "p0"], ["fork1"]),
+                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
+                helper.make_node("Add", ["fork2", "p5"], ["t6"]),
+                helper.make_node("Add", ["fork2", "p6"], ["t7"]),
+                helper.make_node("Mul", ["t8", "p7"], ["fork3"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    for i in range(num_of_params):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    new_model = model.transform(SortGraph())
+
+    # Test
+    ret = new_model.analysis(ta.nodes_topologically_sorted)
+    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
+
+
+if __name__ == "__main__":
+    import time
+
+    sizes = [10, 50, 100, 500, 1000]
+    times = []
+    reps = 10
+
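+    # Rough timing sweep: sort freshly generated random linear models of each
+    # size and report the mean runtime per call as CSV.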
+    print("SortGraph performance test:")
+    print("Test sizes", sizes)
+    print("Repetitions per size:", reps)
+    for sz in sizes:
+        acc_time = 0
+        print(" Testing size ", sz)
+        for i in range(reps):
+            # Sorting an already-sorted model should take about the same time,
+            # but generating a fresh model per repetition is a more general measurement.
+            model = make_randomly_sorted_linear_model(sz)  # unseeded: new model each time
+            bef = time.time()
+            new_model = model.transform(SortGraph(), make_deepcopy=False)
+            acc_time += time.time() - bef
+
+        times += [acc_time / reps]
+
+    # print csv
+    print("\nnum_of_nodes,  seconds")
+    for sz, tm in zip(sizes, times):
+        print("{:12d}, {:6.4e}".format(sz, tm))
+
+    # plot
+    # import matplotlib.pyplot as plt
+    # plt.plot(sizes,times,"--o")
+    # plt.grid(True)