diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index bf1ad4f62d00a7658051be71b415e20bacfcafb1..c7c418482dbdfd6f4e6539a9eaf6ec4b9ff1e24b 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -86,10 +86,10 @@ RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg
 
 # git-based Python repo dependencies
 # these are installed in editable mode for easier co-development
-ARG FINN_BASE_COMMIT="ac0b86a63eb937b869bfa453a996a8a8b8506546"
+ARG FINN_BASE_COMMIT="d38426634f1cfbc5432a0e52c3f65c07fad12aa4"
 ARG FINN_EXP_COMMIT="f82c0d9868bb88ea045dfadb28508d327d287221"
 ARG BREVITAS_COMMIT="462f86cdc60f9915baf13afd1676fb21da44c2ee"
-ARG PYVERILATOR_COMMIT="e2ff74030de3992dcac54bf1b6aad2915946e8cb"
+ARG PYVERILATOR_COMMIT="0c3eb9343500fc1352a02c020a736c8c2db47e8e"
 ARG CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
 ARG HLSLIB_COMMIT="fbb07135b3d991602e8abe3f2c51212c11fd392b"
 ARG OMX_COMMIT="1dfc4aa2f2895632742cd5751520c6b472feb74e"
diff --git a/run-docker.sh b/run-docker.sh
index 19a66f9e95c57be8ef332a075ff69c000488fdce..a1147fcee55d345850da4c533dd9e88270d727a6 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -118,14 +118,14 @@ elif [ "$1" = "notebook" ]; then
   DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT "
 elif [ "$1" = "build_dataflow" ]; then
   BUILD_DATAFLOW_DIR=$(readlink -f "$2")
-  DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR"
+  DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR "
   DOCKER_INTERACTIVE="-it"
   #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build
   gecho "Running build_dataflow for folder $BUILD_DATAFLOW_DIR"
   DOCKER_CMD="build_dataflow $BUILD_DATAFLOW_DIR"
 elif [ "$1" = "build_custom" ]; then
   BUILD_CUSTOM_DIR=$(readlink -f "$2")
-  DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR"
+  DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR "
   DOCKER_INTERACTIVE="-it"
   #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build
   gecho "Running build_custom: $BUILD_CUSTOM_DIR/build.py"
@@ -141,7 +141,7 @@ if [ "$FINN_DOCKER_GPU" != 0 ];then
   if [ ! -z "$NVIDIA_VISIBLE_DEVICES" ];then
     DOCKER_EXTRA+="--runtime nvidia -e NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES "
   else
-    DOCKER_EXTRA+="--gpus all"
+    DOCKER_EXTRA+="--gpus all "
   fi
 fi
 
diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py
index 4aa1ad31e1ad73762ef46cc861b1a255ce57b926..c4664a5471984e1f88a70f1d9bb6ce674e38c782 100644
--- a/src/finn/builder/build_dataflow.py
+++ b/src/finn/builder/build_dataflow.py
@@ -62,7 +62,7 @@ class StreamToLogger(object):
         pass
 
 
-def resolve_build_steps(cfg: DataflowBuildConfig):
+def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True):
     steps = cfg.steps
     if steps is None:
         steps = default_build_dataflow_steps
@@ -76,19 +76,56 @@ def resolve_build_steps(cfg: DataflowBuildConfig):
             steps_as_fxns.append(transform_step)
         else:
             raise Exception("Could not resolve build step: " + str(transform_step))
+    if partial:
+        step_names = list(map(lambda x: x.__name__, steps_as_fxns))
+        if cfg.start_step is None:
+            start_ind = 0
+        else:
+            start_ind = step_names.index(cfg.start_step)
+        if cfg.stop_step is None:
+            stop_ind = len(step_names) - 1
+        else:
+            stop_ind = step_names.index(cfg.stop_step)
+        steps_as_fxns = steps_as_fxns[start_ind : (stop_ind + 1)]
+
     return steps_as_fxns
 
 
+def resolve_step_filename(
+    step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0
+):
+    step_names = list(
+        map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False))
+    )
+    assert step_name in step_names, "start_step %s not found" % step_name
+    step_no = step_names.index(step_name) + step_delta
+    assert step_no >= 0, "Invalid step+delta combination"
+    assert step_no < len(step_names), "Invalid step+delta combination"
+    filename = cfg.output_dir + "/intermediate_models/"
+    filename += "%s.onnx" % (step_names[step_no])
+    return filename
+
+
 def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
     """Best-effort build a dataflow accelerator using the given configuration.
 
     :param model_filename: ONNX model filename to build
     :param cfg: Build configuration
     """
-    model = ModelWrapper(model_filename)
+    # if start_step is specified, override the input model
+    if cfg.start_step is None:
+        print("Building dataflow accelerator from " + model_filename)
+        model = ModelWrapper(model_filename)
+    else:
+        intermediate_model_filename = resolve_step_filename(cfg.start_step, cfg, -1)
+        print(
+            "Building dataflow accelerator from intermediate checkpoint"
+            + intermediate_model_filename
+        )
+        model = ModelWrapper(intermediate_model_filename)
     assert type(model) is ModelWrapper
     finn_build_dir = os.environ["FINN_BUILD_DIR"]
-    print("Building dataflow accelerator from " + model_filename)
+
     print("Intermediate outputs will be generated in " + finn_build_dir)
     print("Final outputs will be generated in " + cfg.output_dir)
     print("Build log is at " + cfg.output_dir + "/build_dataflow.log")
@@ -132,7 +169,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
             sys.stdout = stdout_orig
             sys.stderr = stderr_orig
             time_per_step[step_name] = step_end - step_start
-            chkpt_name = "%d_%s.onnx" % (step_num, step_name)
+            chkpt_name = "%s.onnx" % (step_name)
             if cfg.save_intermediate_models:
                 intermediate_model_dir = cfg.output_dir + "/intermediate_models"
                 if not os.path.exists(intermediate_model_dir):
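
With step-number prefixes dropped from checkpoint names, partial builds can locate a checkpoint purely by step name. A small self-contained sketch of the start/stop slicing logic added to resolve_build_steps above (step names here are illustrative):

```python
def slice_steps(step_names, start_step=None, stop_step=None):
    # mirrors the partial-resolution logic in resolve_build_steps
    start = 0 if start_step is None else step_names.index(start_step)
    stop = len(step_names) - 1 if stop_step is None else step_names.index(stop_step)
    return step_names[start : stop + 1]

steps = ["step_tidy_up", "step_streamline", "step_convert_to_hls"]
# resuming from step_streamline would load step_tidy_up.onnx (delta -1 in
# resolve_step_filename) and then run the remaining steps
assert slice_steps(steps, start_step="step_streamline") == [
    "step_streamline",
    "step_convert_to_hls",
]
```
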
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 4a112699ec9bdf126f447fe2244eb01f6f4fa042..052a6c701a639929f9c2dff682c2a8777b679788 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -172,6 +172,13 @@ class DataflowBuildConfig:
     #: that will override the target_fps setting here.
     target_fps: Optional[int] = None
 
+    #: (Optional) Use two-pass relaxation for folding, only relevant if target_fps
+    #: is set. If enabled, parallelization will internally run a second time when
+    #: the target cycles from the first pass could not be achieved, using the
+    #: achievable target instead to obtain a balanced pipeline. Disabling this
+    #: can decrease latency, even though throughput won't increase.
+    folding_two_pass_relaxation: Optional[bool] = True
+
     #: (Optional) At which steps the generated intermediate output model
     #: will be verified. See documentation of VerificationStepType for
     #: available options.
@@ -185,6 +192,19 @@ class DataflowBuildConfig:
     #: verification. Only required if verify_steps is not empty.
     verify_expected_output_npy: Optional[str] = "expected_output.npy"
 
+    #: (Optional) Save full execution context for each of the verify_steps.
+    #: By default, only the top-level graph output is saved.
+    verify_save_full_context: Optional[bool] = False
+
+    #: (Optional) Save .vcd waveforms from rtlsim under the report directory.
+    #: By default, waveforms won't be saved.
+    verify_save_rtlsim_waveforms: Optional[bool] = False
+
+    #: (Optional) Run synthesis to generate a .dcp for the stitched-IP output product.
+    #: This can make it easier to treat it as a standalone artifact without requiring
+    #: the full list of layer IP build directories. By default, synthesis will not run.
+    stitched_ip_gen_dcp: Optional[bool] = False
+
     #: (Optional) Control the maximum width of the per-PE MVAU stream while
     #: exploring the parallelization attributes to reach target_fps
     #: Only relevant if target_fps is specified.
@@ -264,6 +284,13 @@ class DataflowBuildConfig:
     #: - functions are called with (model, DataflowBuildConfig) as args
     steps: Optional[List[Any]] = None
 
+    #: If given, start the build from this step, loading the intermediate model
+    #: generated by the previous step (save_intermediate_models must be enabled).
+    start_step: Optional[str] = None
+
+    #: If given, stop at this step.
+    stop_step: Optional[str] = None
+
     def _resolve_hls_clk_period(self):
         if self.hls_clk_period_ns is None:
             # use same clk for synth and hls if not explicitly specified
@@ -333,4 +360,7 @@ class DataflowBuildConfig:
                 + self.verify_expected_output_npy
             )
             verify_expected_output_npy = np.load(self.verify_expected_output_npy)
-            return (verify_input_npy, verify_expected_output_npy)
+            return (
+                verify_input_npy.astype(np.float32),
+                verify_expected_output_npy.astype(np.float32),
+            )
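
A hedged usage sketch for the new configuration fields; the output directory, board, model filename, and chosen step names below are placeholders, not values from this patch:

```python
from finn.builder.build_dataflow import build_dataflow_cfg
from finn.builder.build_dataflow_config import (
    DataflowBuildConfig,
    DataflowOutputType,
)

cfg = DataflowBuildConfig(
    output_dir="build_output",  # placeholder
    synth_clk_period_ns=10.0,
    board="Pynq-Z1",  # placeholder
    generate_outputs=[DataflowOutputType.STITCHED_IP],
    # resume from the checkpoint saved before this step; requires
    # save_intermediate_models on the earlier run
    start_step="step_create_stitched_ip",
    stop_step="step_create_stitched_ip",
    stitched_ip_gen_dcp=True,  # also synthesize a .dcp for the stitched IP
    folding_two_pass_relaxation=False,  # favor latency over throughput
    verify_save_rtlsim_waveforms=True,  # dump .vcd traces under report/
)
build_dataflow_cfg("model.onnx", cfg)  # placeholder model filename
```
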
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 5bdccebb58ccb6f4906a05dda58da2494366739f..b9c065ed2514cbbf9f92391ce496705aa3d4a822 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -30,7 +30,8 @@ import json
 import numpy as np
 import os
 from copy import deepcopy
-from shutil import copy, copytree
+from distutils.dir_util import copy_tree
+from shutil import copy
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
@@ -70,7 +71,6 @@ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
@@ -97,6 +97,7 @@ from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.util.config import extract_model_config_to_json
+from finn.util.pyverilator import pyverilate_get_liveness_threshold_cycles
 from finn.util.test import execute_parent
 
 
@@ -115,21 +116,77 @@ def verify_step(
         parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
         child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
         model.save(child_model_fn)
-        out_npy = execute_parent(parent_model_fn, child_model_fn, in_npy)
+        out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name
+        out_dict = execute_parent(
+            parent_model_fn, child_model_fn, in_npy, return_full_ctx=True
+        )
+        out_npy = out_dict[out_tensor_name]
     else:
         inp_tensor_name = model.graph.input[0].name
         out_tensor_name = model.graph.output[0].name
         inp_dict = {inp_tensor_name: in_npy}
-        out_dict = execute_onnx(model, inp_dict)
+        out_dict = execute_onnx(model, inp_dict, True)
         out_npy = out_dict[out_tensor_name]
     res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all()
     res_to_str = {True: "SUCCESS", False: "FAIL"}
     res_str = res_to_str[res]
-    verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (step_name, res_str)
-    np.save(verification_output_fn, out_npy)
+    if cfg.verify_save_full_context:
+        verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % (
+            step_name,
+            res_str,
+        )
+        np.savez(verification_output_fn, **out_dict)
+    else:
+        verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (
+            step_name,
+            res_str,
+        )
+        np.save(verification_output_fn, out_npy)
     print("Verification for %s : %s" % (step_name, res_str))
 
 
+def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
+    need_restitch = False
+    # rtlsim only supports certain impl_style for some nodes
+    # StreamingFIFO must have impl_style=rtl
+    for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
+        inst = getCustomOp(fifo_layer)
+        if inst.get_nodeattr("impl_style") != "rtl":
+            inst.set_nodeattr("impl_style", "rtl")
+            inst.set_nodeattr("code_gen_dir_ipgen", "")
+            inst.set_nodeattr("ipgen_path", "")
+            need_restitch = True
+    # StreamingDataWidthConverter must have impl_style=hls
+    for dwc_layer in verify_model.get_nodes_by_op_type(
+        "StreamingDataWidthConverter_Batch"
+    ):
+        inst = getCustomOp(dwc_layer)
+        if inst.get_nodeattr("impl_style") != "hls":
+            inst.set_nodeattr("impl_style", "hls")
+            inst.set_nodeattr("code_gen_dir_ipgen", "")
+            inst.set_nodeattr("ipgen_path", "")
+            need_restitch = True
+    # if we've made alterations to the model, need to do some re-prep
+    if need_restitch:
+        print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
+        verify_model = verify_model.transform(
+            PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
+        )
+        verify_model = verify_model.transform(HLSSynthIP())
+        verify_model = verify_model.transform(
+            CreateStitchedIP(
+                cfg._resolve_fpga_part(),
+                cfg.synth_clk_period_ns,
+                vitis=False,
+            )
+        )
+    # set top-level metadata prop so the stitched IP is executed via rtlsim
+    verify_model.set_metadata_prop("exec_mode", "rtlsim")
+    # TODO make configurable
+    # verify_model.set_metadata_prop("rtlsim_trace", "trace.vcd")
+    return verify_model
+
+
 def step_tidy_up(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Run the tidy-up step on given model. This includes shape and datatype
     inference, constant folding, and giving nodes and tensors better names.
@@ -164,6 +221,7 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(MakeMaxPoolNHWC())
         model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
         model = model.transform(MakeMaxPoolNHWC())
+        model = model.transform(absorb.AbsorbConsecutiveTransposes())
     model = model.transform(ConvertBipolarMatMulToXnorPopcount())
     model = model.transform(Streamline())
     # absorb final add-mul nodes into TopK
@@ -212,7 +270,12 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig
     nodes, which point to a separate ONNX file. Dataflow accelerator synthesis
     can only be performed on those HLSCustomOp sub-graphs."""
 
-    parent_model = model.transform(CreateDataflowPartition())
+    parent_model = model.transform(
+        CreateDataflowPartition(
+            partition_model_dir=cfg.output_dir
+            + "/intermediate_models/supported_op_partitions"
+        )
+    )
     sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")
     assert len(sdp_nodes) == 1, "Only a single StreamingDataflowPartition supported."
     sdp_node = sdp_nodes[0]
@@ -231,7 +294,11 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi
     target_cycles_per_frame = cfg._resolve_cycles_per_frame()
     if target_cycles_per_frame is not None:
         model = model.transform(
-            SetFolding(target_cycles_per_frame, mvau_wwidth_max=cfg.mvau_wwidth_max)
+            SetFolding(
+                target_cycles_per_frame,
+                mvau_wwidth_max=cfg.mvau_wwidth_max,
+                two_pass_relaxation=cfg.folding_two_pass_relaxation,
+            )
         )
     return model
 
@@ -380,25 +447,35 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig):
     if DataflowOutputType.STITCHED_IP in cfg.generate_outputs:
         stitched_ip_dir = cfg.output_dir + "/stitched_ip"
         model = model.transform(
-            CreateStitchedIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)
+            CreateStitchedIP(
+                cfg._resolve_fpga_part(),
+                cfg.synth_clk_period_ns,
+                vitis=cfg.stitched_ip_gen_dcp,
+            )
         )
         # TODO copy all ip sources into output dir? as zip?
-        copytree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir)
+        copy_tree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir)
         print("Vivado stitched IP written into " + stitched_ip_dir)
     if VerificationStepType.STITCHED_IP_RTLSIM in cfg._resolve_verification_steps():
         # prepare ip-stitched rtlsim
         verify_model = deepcopy(model)
-        # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that
-        for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
-            getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl")
-        # similarly for StreamingDataWidthConverter with impl_style=hls
-        for dwc_layer in verify_model.get_nodes_by_op_type(
-            "StreamingDataWidthConverter_Batch"
-        ):
-            getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls")
-        verify_model = verify_model.transform(PrepareRTLSim())
-        verify_model.set_metadata_prop("exec_mode", "rtlsim")
+        verify_model = prepare_for_stitched_ip_rtlsim(verify_model, cfg)
+        # use critical path estimate to set rtlsim liveness threshold
+        # (very conservative)
+        verify_model = verify_model.transform(AnnotateCycles())
+        estimate_network_performance = verify_model.analysis(dataflow_performance)
+        prev_liveness = pyverilate_get_liveness_threshold_cycles()
+        os.environ["LIVENESS_THRESHOLD"] = str(
+            int(estimate_network_performance["critical_path_cycles"])
+        )
+        if cfg.verify_save_rtlsim_waveforms:
+            report_dir = cfg.output_dir + "/report"
+            os.makedirs(report_dir, exist_ok=True)
+            verify_model.set_metadata_prop(
+                "rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir)
+            )
         verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True)
+        os.environ["LIVENESS_THRESHOLD"] = str(prev_liveness)
     return model
 
 
@@ -411,28 +488,20 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
         assert (
             DataflowOutputType.STITCHED_IP in cfg.generate_outputs
         ), "rtlsim_perf needs stitched IP"
+        report_dir = cfg.output_dir + "/report"
+        os.makedirs(report_dir, exist_ok=True)
         # prepare ip-stitched rtlsim
         rtlsim_model = deepcopy(model)
-        # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that
-        for fifo_layer in rtlsim_model.get_nodes_by_op_type("StreamingFIFO"):
-            getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl")
-        # similarly for StreamingDataWidthConverter with impl_style=hls
-        for dwc_layer in rtlsim_model.get_nodes_by_op_type(
-            "StreamingDataWidthConverter_Batch"
-        ):
-            getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls")
-        rtlsim_model = rtlsim_model.transform(PrepareRTLSim())
-        rtlsim_model.set_metadata_prop("exec_mode", "rtlsim")
+        rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg)
         # run with single input to get latency
+        if cfg.verify_save_rtlsim_waveforms:
+            rtlsim_model.set_metadata_prop(
+                "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, 1)
+            )
+        rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"]))
         rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, 1)
-        rtlsim_latency = rtlsim_perf_dict["cycles"]
-        # run with num inputs equal to layers to fill the whole pipeline
-        # to get the steady-state throughput
-        rtlsim_bs = len(rtlsim_model.graph.node)
-        rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
-        rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
-        report_dir = cfg.output_dir + "/report"
-        os.makedirs(report_dir, exist_ok=True)
+        rtlsim_latency_bs1 = rtlsim_perf_dict["cycles"]
+        rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_bs1
         with open(report_dir + "/rtlsim_performance.json", "w") as f:
             json.dump(rtlsim_perf_dict, f, indent=2)
 
@@ -446,7 +515,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig):
     if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs:
         driver_dir = cfg.output_dir + "/driver"
         model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform()))
-        copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir)
+        copy_tree(model.get_metadata_prop("pynq_driver_dir"), driver_dir)
         print("PYNQ Python driver written into " + driver_dir)
     return model
 
@@ -487,9 +556,15 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig):
         os.makedirs(bitfile_dir, exist_ok=True)
         report_dir = cfg.output_dir + "/report"
         os.makedirs(report_dir, exist_ok=True)
+        partition_model_dir = cfg.output_dir + "/intermediate_models/kernel_partitions"
         if cfg.shell_flow_type == ShellFlowType.VIVADO_ZYNQ:
             model = model.transform(
-                ZynqBuild(cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug)
+                ZynqBuild(
+                    cfg.board,
+                    cfg.synth_clk_period_ns,
+                    cfg.enable_hw_debug,
+                    partition_model_dir=partition_model_dir,
+                )
             )
             copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.bit")
             copy(model.get_metadata_prop("hw_handoff"), bitfile_dir + "/finn-accel.hwh")
@@ -513,6 +588,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig):
                     strategy=cfg._resolve_vitis_opt_strategy(),
                     enable_debug=cfg.enable_hw_debug,
                     floorplan_file=cfg.vitis_floorplan_file,
+                    partition_model_dir=partition_model_dir,
                 )
             )
             copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.xclbin")
@@ -535,8 +611,8 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
         bitfile_dir = cfg.output_dir + "/bitfile"
         driver_dir = cfg.output_dir + "/driver"
         os.makedirs(deploy_dir, exist_ok=True)
-        copytree(bitfile_dir, deploy_dir + "/bitfile")
-        copytree(driver_dir, deploy_dir + "/driver")
+        copy_tree(bitfile_dir, deploy_dir + "/bitfile")
+        copy_tree(driver_dir, deploy_dir + "/driver")
     return model
 
 
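
When verify_save_full_context is enabled, verification results are written as .npz archives containing the whole execution context rather than a single output tensor; a hedged sketch of inspecting one (the path is a placeholder following the verify_%s_%s naming used in verify_step):

```python
import numpy as np

ctx = np.load("build_output/verify/verify_stitched_ip_rtlsim_SUCCESS.npz")
for tensor_name in ctx.files:
    # every intermediate tensor from the execution context is available
    print(tensor_name, ctx[tensor_name].shape)
```
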
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 320b947d0dd99b564da5775dfc8624993af57de2..a552fd419fb3e88c61bfd9229c24e0b71d470b87 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -52,6 +52,7 @@ from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
+from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch
 from finn.custom_op.fpgadataflow.vector_vector_activate_batch import (
     Vector_Vector_Activate_Batch,
 )
@@ -79,3 +80,4 @@ custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch
 custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
 custom_op["IODMA"] = IODMA
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
+custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index b1dc02131e45b0a04acb25723e09847ee858ebdc..073d6620ac3c2a4f62ac544e74ecf21b6e36d58f 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -526,15 +526,18 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         # should ImgDim be defined or just filled in here like we do now?
         ishape = self.get_folded_input_shape()
         if len(ishape) == 3:
-            imgdim = 1
+            imgdim_h = 1
+            imgdim_w = 1
         elif len(ishape) == 5:
-            imgdim = ishape[1]
+            imgdim_h = ishape[1]
+            imgdim_w = ishape[2]
         else:
             raise Exception("""Unexpeted input shape""")
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}>
+            """Thresholding_Batch<{}, {}, NumChannels1, PE1, {}, {}>
             (in0, out, threshs, numReps);""".format(
-                imgdim,
+                imgdim_h,
+                imgdim_w,
                 tmpl_args["TSrcI"],
                 tmpl_args["TDstI"],
             )
diff --git a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
index 53446ff1f2aba30e69bf188c1673c738440567fb..cf065cf156abed591e579b3f257e8f442eb3a976 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
@@ -47,6 +47,7 @@ class StreamingDataflowPartition(CustomOp):
             "partition_id": ("i", False, 0),
             "device_id": ("i", False, 0),
             "mem_port": ("s", False, ""),
+            "instance_name": ("s", False, ""),
         }
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 5cf5da1b6d378d3177d0a40017733bd6672c9574..96594d441345332bbe5873570156e07cacbb385d 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -105,6 +105,16 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
+            # FPGA resource type for threshold memories (if noActivation is False)
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            "ram_style_thresholds": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed"},
+            ),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -663,7 +673,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
         # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0:
+        if ret[0][0] == 0 and n_thres_steps == 1:
             ret = np.copy(ret)
             ret[0][0] = 1
             warnings.warn(
@@ -1212,6 +1222,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def pragmas(self):
         mem_mode = self.get_nodeattr("mem_mode")
+        ram_style_thresholds = self.get_nodeattr("ram_style_thresholds")
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
         in_fifo_depth = self.get_nodeattr("inFIFODepth")
@@ -1270,6 +1281,28 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     "complete dim=3"
                 )
             )
+            # add resource pragma for thresholds if set
+            if ram_style_thresholds == "distributed":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                        "core=ROM_2P_LUTRAM"
+                    )
+                )
+            elif ram_style_thresholds == "block":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                        "core=ROM_2P_BRAM"
+                    )
+                )
+            elif ram_style_thresholds == "auto":
+                # no pragma needed
+                pass
+            else:
+                raise Exception(
+                    "Unrecognized ram_style_thresholds value:" + ram_style_thresholds
+                )
 
     def code_generation_ipi(self):
         cmd = []
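
A hedged sketch of setting the new ram_style_thresholds attribute from a custom build step; model is assumed to be a ModelWrapper whose layers have already been converted to HLS ops:

```python
from finn.custom_op.registry import getCustomOp

# steer threshold memories into LUTRAM for all FC layers with activations;
# valid choices mirror the attribute definition: auto, block, distributed
for node in model.get_nodes_by_op_type("StreamingFCLayer_Batch"):
    inst = getCustomOp(node)
    if inst.get_nodeattr("noActivation") == 0:
        inst.set_nodeattr("ram_style_thresholds", "distributed")
```
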
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index f6a200a47e9c50999a144229c362ddd70c4b22ec..7fb7634dc22e1d00569e7bb755bf120d6de4f808 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -336,7 +336,7 @@ class Thresholding_Batch(HLSCustomOp):
         ).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
         # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0:
+        if ret[0][0] == 0 and n_thres_steps == 1:
             ret = np.copy(ret)
             ret[0][0] = 1
             warnings.warn(
diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8aa09b1c0754b68e37d01551afe90811f22e7cd
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/upsampler.py
@@ -0,0 +1,296 @@
+import numpy as np
+import os
+import warnings
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class UpsampleNearestNeighbour_Batch(HLSCustomOp):
+    """
+    Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function.
+    Upsampling is done with the Nearest Neighbour algorithm.
+    The layer expects square feature maps for the input and output.
+    """
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # Size of the output feature map
+            "OFMDim": ("i", True, 0),
+            # Size of the input feature map
+            "IFMDim": ("i", True, 0),
+            # Amount of channels of the input feature map
+            "NumChannels": ("i", True, 0),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # Batch size
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_exp_cycles(self):
+        OFMDim = self.get_nodeattr("OFMDim")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = OFMDim * OFMDim * batch_size
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self):
+        IFMDim = self.get_nodeattr("IFMDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        ishape = (batch, IFMDim, IFMDim, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self):
+        OFMDim = self.get_nodeattr("OFMDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        oshape = (batch, OFMDim, OFMDim, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self):
+        normal_ishape = list(self.get_normal_input_shape())
+        return tuple(normal_ishape)
+
+    def get_folded_output_shape(self):
+        normal_oshape = list(self.get_normal_output_shape())
+        return tuple(normal_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert (
+            ishape == exp_ishape
+        ), "Unexpect input shape for UpsampleNearestNeighbour_Batch."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        return ret
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        ifm_ch = self.get_nodeattr("NumChannels")
+        return ibits * ifm_ch
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        ifm_ch = self.get_nodeattr("NumChannels")
+        return obits * ifm_ch
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("NumChannels")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+        ibits = self.get_input_datatype().bitwidth()
+        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+        idim = self.get_nodeattr("IFMDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+        odim = self.get_nodeattr("OFMDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+
+        batch_size = self.get_nodeattr("numInputVectors")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels,
+            ap_uint<Input_precision> > (in0, out, numReps);"""
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (numInputVectors, IFMDim, IFMDim, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (numInputVectors, OFMDim, OFMDim, NumChannels)."""
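
A minimal sketch of instantiating the new op in an ONNX graph; attribute values are made up, and the domain string is an assumption based on FINN's other fpgadataflow ops:

```python
from onnx import helper

upsample_node = helper.make_node(
    "UpsampleNearestNeighbour_Batch",
    ["inp"],
    ["outp"],
    domain="finn.custom_op.fpgadataflow",  # assumed registration domain
    backend="fpgadataflow",
    OFMDim=32,  # 16x16 input upsampled to 32x32
    IFMDim=16,
    NumChannels=64,
    inputDataType="UINT8",
    numInputVectors=1,  # batch size
)
# per get_exp_cycles() above: OFMDim * OFMDim * batch = 32 * 32 * 1 cycles
```
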
diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py
index 4dd5a080e10e4a0ab5bd14381186e19144f6edb3..df1ab15c7892d66a668d82040b0da93366942cb7 100644
--- a/src/finn/qnn-data/templates/driver/driver_base.py
+++ b/src/finn/qnn-data/templates/driver/driver_base.py
@@ -85,24 +85,27 @@ class FINNExampleOverlay(Overlay):
         self.platform = platform
         self.batch_size = batch_size
         self.fclk_mhz = fclk_mhz
-        if self.platform == "alveo":
-            if "input_dma_name" in io_shape_dict.keys():
-                self.idma = getattr(self, io_shape_dict["input_dma_name"])
-            else:
-                self.idma = self.idma0
-            self.odma = self.odma0
-            self.odma_handle = None
-        elif self.platform == "zynq-iodma":
-            if "input_dma_name" in io_shape_dict.keys():
-                self.idma = getattr(self, io_shape_dict["input_dma_name"])
-            else:
-                self.idma = self.idma0
-            self.odma = self.odma0
+        self.idma = []
+        self.odma = []
+        self.odma_handle = []
+        if "input_dma_name" in io_shape_dict.keys():
+            for idma_name in io_shape_dict["input_dma_name"]:
+                self.idma.append(getattr(self, idma_name))
+        else:
+            self.idma = [self.idma0]
+        if "output_dma_name" in io_shape_dict.keys():
+            for odma_name in io_shape_dict["output_dma_name"]:
+                self.odma.append(getattr(self, odma_name))
+                if self.platform == "alveo":
+                    self.odma_handle.append(None)
+        else:
+            self.odma = [self.odma0]
+            if self.platform == "alveo":
+                self.odma_handle.append(None)
+        if self.platform == "zynq-iodma":
             # set the clock frequency as specified by user during transformations
             if self.fclk_mhz > 0:
                 Clocks.fclk0_mhz = self.fclk_mhz
-        else:
-            raise ValueError("Supported platforms are zynq-iodma alveo")
         # load any external + runtime weights
         self.load_external_weights()
         self.load_runtime_weights()
@@ -204,50 +207,50 @@ class FINNExampleOverlay(Overlay):
             # run accelerator to flush any stale weights from weight streamer FIFOs
             self.execute_on_buffers()
 
-    @property
-    def idt(self):
-        return self._io_shape_dict["idt"]
+    def idt(self, ind=0):
+        return self._io_shape_dict["idt"][ind]
 
-    @property
-    def odt(self):
-        return self._io_shape_dict["odt"]
+    def odt(self, ind=0):
+        return self._io_shape_dict["odt"][ind]
 
-    @property
-    def ishape_normal(self):
-        ret = list(self._io_shape_dict["ishape_normal"])
+    def ishape_normal(self, ind=0):
+        ret = list(self._io_shape_dict["ishape_normal"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def oshape_normal(self):
-        ret = list(self._io_shape_dict["oshape_normal"])
+    def oshape_normal(self, ind=0):
+        ret = list(self._io_shape_dict["oshape_normal"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def ishape_folded(self):
-        ret = list(self._io_shape_dict["ishape_folded"])
+    def ishape_folded(self, ind=0):
+        ret = list(self._io_shape_dict["ishape_folded"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def oshape_folded(self):
-        ret = list(self._io_shape_dict["oshape_folded"])
+    def oshape_folded(self, ind=0):
+        ret = list(self._io_shape_dict["oshape_folded"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def ishape_packed(self):
-        ret = list(self._io_shape_dict["ishape_packed"])
+    def ishape_packed(self, ind=0):
+        ret = list(self._io_shape_dict["ishape_packed"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def oshape_packed(self):
-        ret = list(self._io_shape_dict["oshape_packed"])
+    def oshape_packed(self, ind=0):
+        ret = list(self._io_shape_dict["oshape_packed"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
+    @property
+    def num_inputs(self):
+        return self._io_shape_dict["num_inputs"]
+
+    @property
+    def num_outputs(self):
+        return self._io_shape_dict["num_outputs"]
+
     @property
     def batch_size(self):
         return self._batch_size
@@ -261,68 +264,72 @@ class FINNExampleOverlay(Overlay):
             self.ibuf_packed_device = None
         if self.obuf_packed_device is not None:
             self.obuf_packed_device = None
-        if self.platform == "alveo":
-            self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
-            self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
-        else:
-            self.ibuf_packed_device = allocate(
-                shape=self.ishape_packed, dtype=np.uint8, cacheable=True
+        cacheable = {"alveo": False, "zynq-iodma": True}[self.platform]
+        self.ibuf_packed_device = []
+        self.obuf_packed_device = []
+        self.obuf_packed = []
+        for i in range(self.num_inputs):
+            new_packed_ibuf = allocate(
+                shape=self.ishape_packed(i), dtype=np.uint8, cacheable=cacheable
             )
-            self.obuf_packed_device = allocate(
-                shape=self.oshape_packed, dtype=np.uint8, cacheable=True
+            self.ibuf_packed_device.append(new_packed_ibuf)
+        for o in range(self.num_outputs):
+            new_packed_obuf = allocate(
+                shape=self.oshape_packed(o), dtype=np.uint8, cacheable=cacheable
             )
-        self.obuf_packed = np.empty_like(self.obuf_packed_device)
+            self.obuf_packed_device.append(new_packed_obuf)
+            self.obuf_packed.append(np.empty_like(new_packed_obuf))
 
-    def fold_input(self, ibuf_normal):
+    def fold_input(self, ibuf_normal, ind=0):
         """Reshapes input in desired shape.
         Gets input data (ibuf_normal), checks if data is in expected normal shape.
         Returns folded input."""
         # ensure that shape is as expected
-        assert ibuf_normal.shape == self.ishape_normal
+        assert ibuf_normal.shape == self.ishape_normal(ind)
         # convert to folded form
-        ibuf_folded = ibuf_normal.reshape(self.ishape_folded)
+        ibuf_folded = ibuf_normal.reshape(self.ishape_folded(ind))
         return ibuf_folded
 
-    def pack_input(self, ibuf_folded):
+    def pack_input(self, ibuf_folded, ind=0):
         """Packs folded input and reverses both SIMD dim and endianness.
         Gets input data in folded shape and returns packed input data."""
         ibuf_packed = finnpy_to_packed_bytearray(
             ibuf_folded,
-            self.idt,
+            self.idt(ind),
             reverse_endian=True,
             reverse_inner=True,
             fast_mode=True,
         )
         return ibuf_packed
 
-    def unpack_output(self, obuf_packed):
+    def unpack_output(self, obuf_packed, ind=0):
         """Unpacks the packed output buffer from accelerator.
         Gets packed output and returns output data in folded shape."""
         obuf_folded = packed_bytearray_to_finnpy(
             obuf_packed,
-            self.odt,
-            self.oshape_folded,
+            self.odt(ind),
+            self.oshape_folded(ind),
             reverse_endian=True,
             reverse_inner=True,
             fast_mode=True,
         )
         return obuf_folded
 
-    def unfold_output(self, obuf_folded):
+    def unfold_output(self, obuf_folded, ind=0):
         """Unfolds output data to normal shape.
         Gets folded output data and returns output data in normal shape."""
-        obuf_normal = obuf_folded.reshape(self.oshape_normal)
+        obuf_normal = obuf_folded.reshape(self.oshape_normal(ind))
         return obuf_normal
 
-    def copy_input_data_to_device(self, data):
+    def copy_input_data_to_device(self, data, ind=0):
         """Copies given input data to PYNQ buffer."""
-        np.copyto(self.ibuf_packed_device, data)
-        self.ibuf_packed_device.flush()
+        np.copyto(self.ibuf_packed_device[ind], data)
+        self.ibuf_packed_device[ind].flush()
 
-    def copy_output_data_from_device(self, data):
+    def copy_output_data_from_device(self, data, ind=0):
         """Copies PYNQ output buffer from device."""
-        self.obuf_packed_device.invalidate()
-        np.copyto(data, self.obuf_packed_device)
+        self.obuf_packed_device[ind].invalidate()
+        np.copyto(data, self.obuf_packed_device[ind])
 
     def execute_on_buffers(self, asynch=False, batch_size=None):
         """Executes accelerator by setting up the DMA(s) on pre-allocated buffers.
@@ -338,24 +345,36 @@ class FINNExampleOverlay(Overlay):
             batch_size = self.batch_size
         assert batch_size <= self.batch_size, "Specified batch_size is too large."
         if self.platform == "zynq-iodma":
-            assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle"
+            for o in range(self.num_outputs):
+                assert (
+                    self.odma[o].read(0x00) & 0x4 != 0
+                ), "Output DMA %d is not idle" % (o)
             # manually launch IODMAs since signatures are missing
             for iwdma, iwbuf, iwdma_name in self.external_weights:
                 iwdma.write(0x10, iwbuf.device_address)
                 iwdma.write(0x1C, batch_size)
                 iwdma.write(0x00, 1)
-            self.idma.write(0x10, self.ibuf_packed_device.device_address)
-            self.idma.write(0x1C, batch_size)
-            self.odma.write(0x10, self.obuf_packed_device.device_address)
-            self.odma.write(0x1C, batch_size)
-            self.idma.write(0x00, 1)
-            self.odma.write(0x00, 1)
+            for o in range(self.num_outputs):
+                self.odma[o].write(0x10, self.obuf_packed_device[o].device_address)
+                self.odma[o].write(0x1C, batch_size)
+                self.odma[o].write(0x00, 1)
+            for i in range(self.num_inputs):
+                self.idma[i].write(0x10, self.ibuf_packed_device[i].device_address)
+                self.idma[i].write(0x1C, batch_size)
+                self.idma[i].write(0x00, 1)
         elif self.platform == "alveo":
-            assert self.odma_handle is None, "Output DMA is already running"
-            self.idma.start(self.ibuf_packed_device, batch_size)
+            for o in range(self.num_outputs):
+                assert self.odma_handle[o] is None, (
+                    "Output DMA %d is already running" % o
+                )
+            for i in range(self.num_inputs):
+                self.idma[i].start(self.ibuf_packed_device[i], batch_size)
             for iwdma, iwbuf, iwdma_name in self.external_weights:
                 iwdma.start(iwbuf, batch_size)
-            self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
+            for o in range(self.num_outputs):
+                self.odma_handle[o] = self.odma[o].start(
+                    self.obuf_packed_device[o], batch_size
+                )
         else:
             raise Exception("Unrecognized platform: %s" % self.platform)
         # blocking behavior depends on asynch parameter
@@ -363,31 +382,48 @@ class FINNExampleOverlay(Overlay):
             self.wait_until_finished()
 
     def wait_until_finished(self):
-        "Block until the output DMA has finished writing."
+        "Block until all output DMAs have finished writing."
         if self.platform == "zynq-iodma":
             # check if output IODMA is finished via register reads
-            status = self.odma.read(0x00)
-            while status & 0x2 == 0:
-                status = self.odma.read(0x00)
+            for o in range(self.num_outputs):
+                status = self.odma[o].read(0x00)
+                while status & 0x2 == 0:
+                    status = self.odma[o].read(0x00)
         elif self.platform == "alveo":
-            assert self.odma_handle is not None, "No odma_handle to wait on"
-            self.odma_handle.wait()
-            self.odma_handle = None
+            assert all(
+                [x is not None for x in self.odma_handle]
+            ), "No odma_handle to wait on"
+            for o in range(self.num_outputs):
+                self.odma_handle[o].wait()
+                self.odma_handle[o] = None
         else:
             raise Exception("Unrecognized platform: %s" % self.platform)
 
     def execute(self, input_npy):
-        """Given input numpy array, first perform necessary packing and copying
-        to device buffers, execute on accelerator, then unpack output and return
-        output numpy array from accelerator."""
-        ibuf_folded = self.fold_input(input_npy)
-        ibuf_packed = self.pack_input(ibuf_folded)
-        self.copy_input_data_to_device(ibuf_packed)
+        """Given a single or a list of input numpy array, first perform necessary
+        packing and copying to device buffers, execute on accelerator, then unpack
+        output and return output numpy array from accelerator."""
+        # if single input, convert to list to normalize how we process the input
+        if not isinstance(input_npy, list):
+            input_npy = [input_npy]
+        assert self.num_inputs == len(
+            input_npy
+        ), "Not all accelerator inputs are specified."
+        for i in range(self.num_inputs):
+            ibuf_folded = self.fold_input(input_npy[i], ind=i)
+            ibuf_packed = self.pack_input(ibuf_folded, ind=i)
+            self.copy_input_data_to_device(ibuf_packed, ind=i)
         self.execute_on_buffers()
-        self.copy_output_data_from_device(self.obuf_packed)
-        obuf_folded = self.unpack_output(self.obuf_packed)
-        obuf_normal = self.unfold_output(obuf_folded)
-        return obuf_normal
+        outputs = []
+        for o in range(self.num_outputs):
+            self.copy_output_data_from_device(self.obuf_packed[o], ind=o)
+            obuf_folded = self.unpack_output(self.obuf_packed[o], ind=o)
+            obuf_normal = self.unfold_output(obuf_folded, ind=o)
+            outputs.append(obuf_normal)
+        if self.num_outputs == 1:
+            return outputs[0]
+        else:
+            return outputs
 
     def throughput_test(self):
         """Run accelerator with empty inputs to measure throughput and other metrics.
@@ -400,12 +436,14 @@ class FINNExampleOverlay(Overlay):
         runtime = end - start
         res["runtime[ms]"] = runtime * 1000
         res["throughput[images/s]"] = self.batch_size / runtime
-        res["DRAM_in_bandwidth[Mb/s]"] = (
-            np.prod(self.ishape_packed) * 0.000001 / runtime
-        )
-        res["DRAM_out_bandwidth[Mb/s]"] = (
-            np.prod(self.oshape_packed) * 0.000001 / runtime
-        )
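+        # sum packed tensor sizes over all input/output streams for the
+        # DRAM bandwidth figures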
+        total_in = 0
+        for i in range(self.num_inputs):
+            total_in += np.prod(self.ishape_packed(i))
+        res["DRAM_in_bandwidth[Mb/s]"] = total_in * 0.000001 / runtime
+        total_out = 0
+        for o in range(self.num_outputs):
+            total_out += np.prod(self.oshape_packed(o))
+        res["DRAM_out_bandwidth[Mb/s]"] = total_out * 0.000001 / runtime
         for iwdma, iwbuf, iwdma_name in self.external_weights:
             res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = (
                 self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime
@@ -416,11 +454,11 @@ class FINNExampleOverlay(Overlay):
             res["fclk[mhz]"] = self.clock_dict["clock0"]["frequency"]
         res["batch_size"] = self.batch_size
         # also benchmark driver-related overheads
-        input_npy = gen_finn_dt_tensor(self.idt, self.ishape_normal)
+        input_npy = gen_finn_dt_tensor(self.idt(), self.ishape_normal())
         # provide as int8/uint8 to support fast packing path where possible
-        if self.idt == DataType.UINT8:
+        if self.idt() == DataType.UINT8:
             input_npy = input_npy.astype(np.uint8)
-        elif self.idt == DataType.INT8:
+        elif self.idt() == DataType.INT8:
             input_npy = input_npy.astype(np.int8)
         start = time.time()
         ibuf_folded = self.fold_input(input_npy)
@@ -441,13 +479,13 @@ class FINNExampleOverlay(Overlay):
         res["copy_input_data_to_device[ms]"] = runtime * 1000
 
         start = time.time()
-        self.copy_output_data_from_device(self.obuf_packed)
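+        # driver overhead benchmarks below use only the first output stream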
+        self.copy_output_data_from_device(self.obuf_packed[0])
         end = time.time()
         runtime = end - start
         res["copy_output_data_from_device[ms]"] = runtime * 1000
 
         start = time.time()
-        obuf_folded = self.unpack_output(self.obuf_packed)
+        obuf_folded = self.unpack_output(self.obuf_packed[0])
         end = time.time()
         runtime = end - start
         res["unpack_output[ms]"] = runtime * 1000
diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py
index 001744cba2b59f6d1a0a67fca3e2ad9668a519c0..1b29d4342c830ae896e580f602e810ee25ed234d 100644
--- a/src/finn/qnn-data/templates/driver/validate.py
+++ b/src/finn/qnn-data/templates/driver/validate.py
@@ -94,11 +94,11 @@ if __name__ == "__main__":
     test_labels = test_labels.reshape(n_batches, bsize)
 
     for i in range(n_batches):
-        ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
+        ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device[0].shape)
         exp = test_labels[i]
         driver.copy_input_data_to_device(ibuf_normal)
         driver.execute_on_buffers()
-        obuf_normal = np.empty_like(driver.obuf_packed_device)
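+        # the validation flow assumes a single-input/single-output
+        # accelerator, hence index 0 on the device buffers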
+        obuf_normal = np.empty_like(driver.obuf_packed_device[0])
         driver.copy_output_data_from_device(obuf_normal)
         ret = np.bincount(obuf_normal.flatten() == exp.flatten())
         nok += ret[0]
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index c749d645dfbf9996c3eea430a0099cb5f12ee60a..3cb193055f3a455d95f7735ab38b2601809dbabd 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -61,7 +61,9 @@ class InferConvInpGen(Transformation):
                 i2c_out_shape = model.get_tensor_shape(i2c_output)
                 dt = model.get_tensor_datatype(i2c_input)
                 if not dt.is_integer():
-                    warnings.warn("Input is not int. Can't infer ConvInpGen")
+                    warnings.warn(
+                        "%s : Input is not int. Can't infer ConvInpGen." % n.name
+                    )
                     continue
                 i2c_inst = getCustomOp(n)
                 stride_h, stride_w = i2c_inst.get_nodeattr("stride")
@@ -89,9 +91,10 @@ class InferConvInpGen(Transformation):
                     # if padding enabled, ensure pad_val supported by DataType
                     # assert dt.allowed(pad_val),"""FMPadding_Batch DataType
                     # must support pad_val"""
-                    assert (
-                        pad_val == 0
-                    ), "FMPadding_Batch doesn't currently support pad_val!= 0"
+                    assert pad_val == 0, (
+                        "%s : FMPadding_Batch doesn't currently support pad_val!= 0"
+                        % n.name
+                    )
 
                     odim_padding_h = ifm_dim_h + pad_h
                     odim_padding_w = ifm_dim_w + pad_w
@@ -121,6 +124,7 @@ class InferConvInpGen(Transformation):
                         NumChannels=ifm_ch,
                         inputDataType=dt.name,
                         SIMD=ifm_ch,
+                        name="FMPadding_Batch_" + n.name,
                     )
                     graph.node.insert(node_ind, padding_node)
 
@@ -134,11 +138,15 @@ class InferConvInpGen(Transformation):
                 )
 
                 if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
-                    assert (
-                        is_square_image
-                    ), "DownSampler currently only supports square input images."
-                    assert is_equal_stride, """DownSampler currently only supports equal stride value
+                    assert is_square_image, (
+                        "%s : DownSampler currently only supports square input images."
+                        % n.name
+                    )
+                    assert is_equal_stride, (
+                        """%s : DownSampler currently only supports equal stride value
                         along different axes."""
+                        % n.name
+                    )
                     ConvInpGen_idim = ConvInpGen_idim_h
                     stride = stride_h
                     # create DownSampler node
@@ -153,6 +161,7 @@ class InferConvInpGen(Transformation):
                         SIMD=ifm_ch,
                         Stride=stride,
                         inputDataType=dt.name,
+                        name="DownSampler_" + n.name,
                     )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 else:
@@ -160,12 +169,16 @@ class InferConvInpGen(Transformation):
                     if (
                         is_square_image and is_square_kernel
                     ):  # square images and square kernels
-                        assert is_equal_stride, """Non-equal strides along different axes is not supported
+                        assert is_equal_stride, (
+                            """%s: Non-equal strides along different axes is not supported
                             for (non-)square convolutions"""
-                        assert (
-                            dilation_h == 1 and dilation_w == 1
-                        ), """Dilation value != 1 is not supported
+                            % n.name
+                        )
+                        assert dilation_h == 1 and dilation_w == 1, (
+                            """%s: Dilation value != 1 is not supported
                             for square convolutions"""
+                            % n.name
+                        )
                         ConvInpGen_node = helper.make_node(
                             "ConvolutionInputGenerator",
                             [ConvInpGen_input],
@@ -182,16 +195,19 @@ class InferConvInpGen(Transformation):
                             inputDataType=dt.name,
                             outputDataType=dt.name,
                             depthwise=depthwise,
+                            name="ConvolutionInputGenerator_" + n.name,
                         )
                     else:  # non-square images and/or kernels
-                        assert (
-                            is_1d_convolution
-                        ), "ConvultionInputGenerator1D works only for 1D convolutions"
+                        assert is_1d_convolution, (
+                            "%s: ConvolutionInputGenerator1D works only for 1D convs"
+                            % n.name
+                        )
                         if dilation_h > 1 or dilation_w > 1:
-                            assert (
-                                stride_h == 1 and stride_w == 1
-                            ), """Stride value of greater than 1 is not supported for convolutions
+                            assert stride_h == 1 and stride_w == 1, (
+                                """%s: Stride value of greater than 1 is not supported for convolutions
                                 with dilation value greater than 1"""
+                                % n.name
+                            )
                         ConvInpGen_node = helper.make_node(
                             "ConvolutionInputGenerator1D",
                             [ConvInpGen_input],
@@ -208,6 +224,7 @@ class InferConvInpGen(Transformation):
                             inputDataType=dt.name,
                             outputDataType=dt.name,
                             depthwise=depthwise,
+                            name="ConvolutionInputGenerator1D_" + n.name,
                         )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
@@ -219,6 +236,102 @@ class InferConvInpGen(Transformation):
         return (model, graph_modified)
 
 
+class InferUpsample(Transformation):
+    """
+    Convert Upsample and Resize nodes to UpsampleNearestNeighbour_Batch nodes.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                # Extract mode and scales and input shape
+                mode = get_by_name(n.attribute, "mode").s.decode("ascii")
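+                # Upsample carries scales as its second input; ONNX Resize
+                # carries them as its third input (the second is roi)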
+                if n.op_type == "Upsample":
+                    scales = model.get_initializer(n.input[1])
+                else:
+                    scales = model.get_initializer(n.input[2])
+                in_shape = model.get_tensor_shape(n.input[0])
+
+                dt = model.get_tensor_datatype(n.input[0])
+                if not dt.is_integer():
+                    warnings.warn(
+                        "%s: Input not int. Can't infer UpsampleNearestNeighbour."
+                        % n.name
+                    )
+                    continue
+
+                if model.get_tensor_layout(n.input[0]) != DataLayout.NHWC:
+                    warnings.warn(
+                        "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour."
+                        % n.name
+                    )
+                    continue
+
+                # Check that the parameters are okay
+                assert mode == "nearest", (
+                    "%s: Upsampling is only supported for the mode nearest." % n.name
+                )
+                assert len(in_shape) == 4, (
+                    "%s: Upsampling is only supported for 4D inputs." % n.name
+                )
+                assert scales.shape == (4,), (
+                    "%s: Upsampling is only supported for 4D scales." % n.name
+                )
+                assert (scales >= 1).all(), (
+                    n.name + ": Upsampling is only supported for scales "
+                    "which are larger or equal 1 in all dimensions."
+                )
+
+                # Assumes nhwc layout for scales and input
+                assert scales[1] == scales[2], (
+                    "%s: Upsampling is only supported for quadratic scales." % n.name
+                )
+                assert scales[0] == scales[3] == 1, (
+                    n.name + ": Upsampling is only supported for scales with "
+                    "the first and last dimensions being 1."
+                )
+                spatial_scale = scales[1]
+                assert spatial_scale == int(spatial_scale), (
+                    "%s: Upsampling is only supported for integer scales." % n.name
+                )
+
+                assert in_shape[1] == in_shape[2], (
+                    "%s: Upsampling is only supported for quadratic input shapes."
+                    % n.name
+                )
+
+                # Extract information for HLS node
+                IFMDim = in_shape[1]
+                OFMDim = int(round(in_shape[1] * spatial_scale))
+                NumChannels = in_shape[-1]
+                numInputVectors = in_shape[0]
+                inputDataType = dt.name
+
+                # Insert the HLSCustomOp node
+                Upsample_HLS_node = helper.make_node(
+                    "UpsampleNearestNeighbour_Batch",
+                    [n.input[0]],
+                    [n.output[0]],
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    OFMDim=OFMDim,
+                    IFMDim=IFMDim,
+                    NumChannels=NumChannels,
+                    inputDataType=inputDataType,
+                    numInputVectors=numInputVectors,
+                    name="UpsampleNearestNeighbour_Batch_" + n.name,
+                )
+
+                # insert the new node
+                graph.node.insert(node_ind, Upsample_HLS_node)
+                # remove the old node
+                graph.node.remove(n)
+                graph_modified = True
+        return (model, graph_modified)
+
+
 class InferStreamingMaxPool(Transformation):
     """Convert MaxPoolNHWC layers to StreamingMaxPool layers."""
 
@@ -251,6 +364,7 @@ class InferStreamingMaxPool(Transformation):
                         NumChannels=ifm_ch,
                         ImgDim=(ifm_dim_h, ifm_dim_w),
                         dataType=dt.name,
+                        name="StreamingMaxPool_Batch_" + n.name,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old nodes
@@ -273,7 +387,7 @@ class InferPool_Batch(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type in ["MaxPool", "QuantAvgPool2d"]:
+            if n.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]:
                 # extract pool parameters
 
                 if n.op_type == "MaxPool":
@@ -286,6 +400,15 @@ class InferPool_Batch(Transformation):
                     k = inst.get_nodeattr("kernel")
                     stride = inst.get_nodeattr("stride")
                     dlayout = inst.get_nodeattr("data_layout")
+                elif n.op_type == "MaxPoolNHWC":
+                    inst = getCustomOp(n)
+                    k_shape = inst.get_nodeattr("kernel_shape")
+                    strides = inst.get_nodeattr("strides")
+                    assert k_shape[0] == k_shape[1], "only square kernels supported"
+                    assert strides[0] == strides[1], "only equal strides supported"
+                    k = k_shape[0]
+                    stride = strides[0]
+                    dlayout = "NHWC"
                 try:
                     pad = get_by_name(n.attribute, "pads").ints[-1]
                 except AttributeError:
@@ -302,7 +425,8 @@ class InferPool_Batch(Transformation):
                     continue
                 elif k == stride:
                     warnings.warn(
-                        """Inferring Pool_Batch node for k == stride.
+                        n.name
+                        + """: Inferring Pool_Batch node for k == stride.
                         This case can be optimized.
                         For example, for MaxPool run InferStreamingMaxPool before
                         InferPool_Batch """
@@ -363,7 +487,7 @@ class InferPool_Batch(Transformation):
                 accum_bits = 0
                 pool_size_param = k
                 pad_value = 0
-                if n.op_type == "MaxPool":
+                if n.op_type in ["MaxPool", "MaxPoolNHWC"]:
                     pool_fxn = "MaxPool"
                     odt = idt
                     pad_value = idt.min()
@@ -393,6 +517,7 @@ class InferPool_Batch(Transformation):
                     pad_value=pad_value,
                     depthwise=1,
                     input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                    name="Im2Col_" + n.name,
                 )
 
                 # Warning PE has to be equal to ifm_ch until Im2Col is replaced by
@@ -415,6 +540,7 @@ class InferPool_Batch(Transformation):
                     AccumBits=accum_bits,
                     Size=pool_size_param,
                     BatchSize=1,
+                    name="Pool_Batch_" + n.name,
                 )
 
                 if dlayout == "NCHW":
@@ -463,14 +589,16 @@ class InferBinaryStreamingFCLayer(Transformation):
                 mm_output = n.output[0]
                 mm_in_shape = model.get_tensor_shape(mm_input)
                 mm_out_shape = model.get_tensor_shape(mm_output)
-                assert (
-                    model.get_tensor_datatype(mm_input) == DataType.BINARY
-                ), """First
+                assert model.get_tensor_datatype(mm_input) == DataType.BINARY, (
+                    n.name
+                    + """: First
                 input for xnorpopcount is not set to FINN DataType BINARY."""
-                assert (
-                    model.get_tensor_datatype(mm_weight) == DataType.BINARY
-                ), """Second
+                )
+                assert model.get_tensor_datatype(mm_weight) == DataType.BINARY, (
+                    n.name
+                    + """: Second
                 input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
+                )
                 idt = DataType.BINARY
                 wdt = DataType.BINARY
                 mm_output = n.output[0]
@@ -484,13 +612,12 @@ class InferBinaryStreamingFCLayer(Transformation):
                 # create node with no parallelization first
                 pe = 1
                 simd = 1
-                assert mh % pe == 0, "Requirement MH divisable by PE is violated."
-                assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
                 wmem = mw * mh // (pe * simd)
-                assert (
-                    mw * mh == wmem * pe * simd
-                ), """Requirement (MW * MH) divisiable by
+                assert mw * mh == wmem * pe * simd, (
+                    n.name
+                    + """: Requirement (MW * MH) divisiable by
                 (WMEM * PE * SIMD) is violated."""
+                )
                 # see if we have any following thresholds
                 consumer = model.find_consumer(mm_output)
                 if consumer is not None and consumer.op_type == "MultiThreshold":
@@ -500,10 +627,11 @@ class InferBinaryStreamingFCLayer(Transformation):
                     mt_out_shape = model.get_tensor_shape(mt_output)
                     mt_thres = consumer.input[1]
                     T = model.get_initializer(mt_thres)
-                    assert (
-                        T.shape[0] == 1 or T.shape[0] == mh
-                    ), """First dimension of
+                    assert T.shape[0] == 1 or T.shape[0] == mh, (
+                        consumer.name
+                        + """: First dimension of
                     thresholds neither 1 nor MH."""
+                    )
                     odt = model.get_tensor_datatype(mt_output)
                     if odt.bitwidth() == 1:
                         # covers both bipolar and binary
@@ -531,6 +659,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                         noActivation=0,
                         numInputVectors=list(mm_in_shape[:-1]),
                         mem_mode=self.mem_mode,
+                        name=n.name,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old nodes
@@ -561,6 +690,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                         noActivation=1,
                         numInputVectors=list(mm_in_shape[:-1]),
                         mem_mode=self.mem_mode,
+                        name=n.name,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old node
@@ -608,15 +738,12 @@ class InferQuantizedStreamingFCLayer(Transformation):
                     # create node with no parallelization first
                     pe = 1
                     simd = 1
-                    assert mh % pe == 0, "Requirement MH divisable by PE is violated."
-                    assert (
-                        mw % simd == 0
-                    ), "Requirement MW divisable by SIMD is violated."
                     wmem = mw * mh // (pe * simd)
-                    assert (
-                        mw * mh == wmem * pe * simd
-                    ), """Requirement (MW * MH) divisible by
+                    assert mw * mh == wmem * pe * simd, (
+                        n.name
+                        + """: Requirement (MW * MH) divisible by
                     (WMEM * PE * SIMD) is violated."""
+                    )
                     # see if we have any following thresholds
                     consumer = model.find_consumer(mm_output)
                     if consumer is not None and consumer.op_type == "MultiThreshold":
@@ -626,27 +753,30 @@ class InferQuantizedStreamingFCLayer(Transformation):
                         mt_out_shape = model.get_tensor_shape(mt_output)
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
-                        assert (
-                            T.shape[0] == 1 or T.shape[0] == mh
-                        ), """First dimension of
+                        assert T.shape[0] == 1 or T.shape[0] == mh, (
+                            consumer.name
+                            + """: First dimension of
                         thresholds neither 1 nor MH."""
+                        )
                         odt = model.get_tensor_datatype(mt_output)
                         scale = getCustomOp(consumer).get_nodeattr("out_scale")
                         actval = getCustomOp(consumer).get_nodeattr("out_bias")
-                        assert (
-                            int(actval) == actval
-                        ), "out_bias must be integer for HLS conversion."
+                        assert int(actval) == actval, (
+                            consumer.name
+                            + ": out_bias must be integer for HLS conversion."
+                        )
                         actval = int(actval)
                         odt_is_bipolar = odt == DataType.BIPOLAR
                         bipolar_ok = (
                             odt_is_bipolar and (scale == 2.0) and (actval == -1)
                         )
-                        assert (
-                            scale == 1.0 or bipolar_ok
-                        ), "out_scale = 1.0 or bipolar output needed for conversion."
-                        assert (not odt.signed()) or (
-                            actval < 0
-                        ), "Signed output requres actval < 0"
+                        assert scale == 1.0 or bipolar_ok, (
+                            consumer.name
+                            + ": out_scale=1 or bipolar output needed for conversion."
+                        )
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requres actval < 0"
+                        )
                         model.set_tensor_shape(mm_input, mm_in_shape)
                         model.set_tensor_shape(mt_output, mt_out_shape)
                         if bipolar_ok:
@@ -672,6 +802,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             noActivation=0,
                             numInputVectors=list(mm_in_shape[:-1]),
                             mem_mode=self.mem_mode,
+                            name="StreamingFCLayer_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old nodes
@@ -702,6 +833,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             noActivation=1,
                             numInputVectors=list(mm_in_shape[:-1]),
                             mem_mode=self.mem_mode,
+                            name="StreamingFCLayer_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old node
@@ -736,7 +868,8 @@ class InferVVAU(Transformation):
                     k_h, k_w = sparsity["dw"]["kernel_shape"]
                 except KeyError:
                     raise Exception(
-                        """Sparsity doesn't indicate that MatMul
+                        n.name
+                        + """: sparsity annotation doesn't indicate that MatMul
                         belongs to a depthwise convolution."""
                     )
 
@@ -772,9 +905,6 @@ class InferVVAU(Transformation):
                     model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w))
                     # create node with pe=channels as default
                     pe = channels
-                    assert (
-                        channels % pe == 0
-                    ), "Requirement Channels divisable by PE is violated."
                     # see if we have any following thresholds
                     consumer = model.find_consumer(mm_output)
                     if consumer is not None and consumer.op_type == "MultiThreshold":
@@ -783,23 +913,26 @@ class InferVVAU(Transformation):
                         mt_out_shape = model.get_tensor_shape(mt_output)
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
-                        assert (
-                            T.shape[0] == 1 or T.shape[0] == channels
-                        ), """First dimension of
+                        assert T.shape[0] == 1 or T.shape[0] == channels, (
+                            consumer.name
+                            + """: First dimension of
                         thresholds neither 1 nor Channels."""
+                        )
                         odt = model.get_tensor_datatype(mt_output)
                         scale = getCustomOp(consumer).get_nodeattr("out_scale")
-                        assert (
-                            scale == 1.0
-                        ), "out_scale must be equal to 1.0 for HLS conversion."
+                        assert scale == 1.0, (
+                            consumer.name
+                            + ": out_scale must be equal to 1.0 for HLS conversion."
+                        )
                         actval = getCustomOp(consumer).get_nodeattr("out_bias")
-                        assert (
-                            int(actval) == actval
-                        ), "out_bias must be integer for HLS conversion."
+                        assert int(actval) == actval, (
+                            consumer.name
+                            + ": out_bias must be integer for HLS conversion."
+                        )
                         actval = int(actval)
-                        assert (not odt.signed()) or (
-                            actval < 0
-                        ), "Signed output requres actval < 0"
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requres actval < 0"
+                        )
                         model.set_tensor_shape(mm_input, mm_in_shape)
                         model.set_tensor_shape(mt_output, mt_out_shape)
                         # create and insert new Vector_Vector_Activate_Batch node
@@ -819,6 +952,7 @@ class InferVVAU(Transformation):
                             outputDataType=odt.name,
                             ActVal=actval,
                             noActivation=0,
+                            name="Vector_Vector_Activate_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old nodes
@@ -847,6 +981,7 @@ class InferVVAU(Transformation):
                             outputDataType=odt.name,
                             ActVal=0,
                             noActivation=1,
+                            name="Vector_Vector_Activate_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old node
@@ -904,21 +1039,22 @@ class InferThresholdingLayer(Transformation):
                 ifc = int(thl_in_shape[-1])
                 # create node with no parallelization first
                 pe = 1
-                assert ifc % pe == 0, "Requirement IFC divisable by PE is violated."
 
                 odt = model.get_tensor_datatype(thl_output)
                 scale = getCustomOp(node).get_nodeattr("out_scale")
-                assert (
-                    scale == 1.0
-                ), "MultiThreshold out_scale must be equal to 1.0 for HLS conversion."
+                assert scale == 1.0, (
+                    node.name
+                    + ": MultiThreshold out_scale must be 1 for HLS conversion."
+                )
                 actval = getCustomOp(node).get_nodeattr("out_bias")
-                assert (
-                    int(actval) == actval
-                ), "MultiThreshold out_bias must be integer for HLS conversion."
+                assert int(actval) == actval, (
+                    node.name
+                    + ": MultiThreshold out_bias must be integer for HLS conversion."
+                )
                 actval = int(actval)
-                assert (not odt.signed()) or (
-                    actval < 0
-                ), "Signed output requres actval < 0"
+                assert (not odt.signed()) or (actval < 0), (
+                    node.name + ": Signed output requres actval < 0"
+                )
                 # create and insert new Thresholding_Batch node
                 new_node = helper.make_node(
                     "Thresholding_Batch",
@@ -935,6 +1071,7 @@ class InferThresholdingLayer(Transformation):
                     numInputVectors=list(thl_in_shape[:-1]),
                     ActVal=actval,
                     mem_mode=self.mem_mode,
+                    name="Thresholding_Batch_" + node.name,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -1008,9 +1145,6 @@ class InferAddStreamsLayer(Transformation):
                 num_channels = int(in0_shape[-1])
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_channels % pe == 0
-                ), "Requirement Channels divisable by PE is violated."
 
                 # create and insert new StreamingFCLayer node
                 new_node = helper.make_node(
@@ -1023,6 +1157,7 @@ class InferAddStreamsLayer(Transformation):
                     PE=pe,
                     inputDataType=idt.name,
                     numInputVectors=in0_shape[:-1],
+                    name="AddStreams_Batch_" + node.name,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -1069,9 +1204,6 @@ class InferDuplicateStreamsLayer(Transformation):
 
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_ch % pe == 0
-                ), "Requirement channels divisable by PE is violated."
 
                 dup_node = helper.make_node(
                     "DuplicateStreams_Batch",
@@ -1083,6 +1215,7 @@ class InferDuplicateStreamsLayer(Transformation):
                     PE=pe,
                     inputDataType=dt.name,
                     numInputVectors=vecs,
+                    name="DuplicateStreams_Batch_" + node.name,
                 )
 
                 graph.node.insert(node_ind, dup_node)
@@ -1251,6 +1384,7 @@ class InferChannelwiseLinearLayer(Transformation):
                     paramDataType=pdt.name,
                     outputDataType=odt.name,
                     numInputVectors=list(ll_in_shape[:-1]),
+                    name="ChannelwiseOp_Batch_" + node.name,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -1293,9 +1427,6 @@ class InferLabelSelectLayer(Transformation):
                 num_inp_vecs = list(fc_in_shape[:-1])
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_labels % pe == 0
-                ), "Requirement Labels divisable by PE is violated."
 
                 k = model.get_initializer(k_input)[0]
 
@@ -1311,6 +1442,7 @@ class InferLabelSelectLayer(Transformation):
                     K=k,
                     inputDataType=idt.name,
                     numInputVectors=num_inp_vecs,
+                    name="LabelSelect_Batch_" + node.name,
                 )
                 graph.node.insert(node_ind, new_node)
                 # remove old node
@@ -1364,9 +1496,6 @@ class InferGlobalAccPoolLayer(Transformation):
                 vecs = in0_shape[:-1]
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_ch % pe == 0
-                ), "Requirement Labels divisable by PE is violated."
 
                 # create an additional tensor of the same shape and layout as result
                 out_shape = model.get_tensor_shape(result)
@@ -1387,6 +1516,7 @@ class InferGlobalAccPoolLayer(Transformation):
                     PE=pe,
                     inputDataType=idt.name,
                     numInputVectors=vecs,
+                    name="GlobalAccPool_Batch_" + node.name,
                 )
 
                 mul_value = helper.make_tensor_value_info(
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index 0aba60f9b6f08210c40f305694495b77f517f323..9b2577bc2b863e1075fc3252412ff1001b955cda 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -26,11 +26,11 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import copy
-from onnx import helper
-
+from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
+from finn.transformation.create_generic_partitions import PartitionFromLambda
+from finn.transformation.fpgadataflow.externalize_params import ExternalizeParams
 from finn.util.basic import get_by_name, make_build_dir
 
 
@@ -41,120 +41,76 @@ class CreateDataflowPartition(Transformation):
     that indicates the filename for the second graph that only contains
     dataflow nodes. No action is taken if there are no dataflow nodes."""
 
-    def __init__(self):
+    def __init__(self, partition_model_dir=None):
         super().__init__()
+        if partition_model_dir is None:
+            self.partition_model_dir = make_build_dir("dataflow_partition_")
+        else:
+            self.partition_model_dir = partition_model_dir
 
     def apply(self, model):
-        target_partition_id = 0
-        # we currently assume that all dataflow nodes belonging to the same partition
-        # are connected to each other and there is a single input/output to/from each.
-        # NOTE: all dataflow nodes with no partition_id set are moved to partition 0
-        # TODO: check the assumption and/or improve this.
-        while True:
-            all_nodes = list(model.graph.node)
-            df_nodes = filter(
-                lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
-            )
-            df_nodes = filter(
-                lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
-                == "fpgadataflow"
-                and (
-                    get_by_name(x.attribute, "partition_id") is None
-                    or get_by_name(x.attribute, "partition_id").i == target_partition_id
-                )
-                and x.op_type != "StreamingDataflowPartition",
-                df_nodes,
-            )
-            df_nodes = list(df_nodes)
-            non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
-            non_df_nodes = list(non_df_nodes)
-
-            if len(df_nodes) == 0:
-                # no changes if no dataflow nodes are present
-                break
-            else:
-                # partition the model into two models
-                df_model = copy.deepcopy(model)
-                non_df_model = model
-                # remove all non-dataflow nodes from the dataflow model
-                for node_to_remove in non_df_nodes:
-                    df_model.graph.node.remove(node_to_remove)
-                # identify the entry and exit points for the dataflow part
-                df_in = df_model.graph.node[0].input[0]
-                df_out = df_model.graph.node[-1].output[0]
-                df_in_vi = df_model.get_tensor_valueinfo(df_in)
-                df_out_vi = df_model.get_tensor_valueinfo(df_out)
-                # set df graph in/out to be df_in/df_out
-                df_model.graph.input.remove(df_model.graph.input[0])
-                df_model.graph.input.insert(0, df_in_vi)
-                df_model.graph.output.remove(df_model.graph.output[0])
-                df_model.graph.output.insert(0, df_out_vi)
-                # parse StreamingFCLayers looking for external weight memories
-                fc_extw_nodes = filter(
-                    lambda x: x.op_type == "StreamingFCLayer_Batch"
-                    and get_by_name(x.attribute, "mem_mode") is not None
-                    and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8")
-                    == "external",
-                    df_nodes,
-                )
-                fc_extw_nodes = list(fc_extw_nodes)
-                extra_df_inputs = []
+        def filter_fc_extw(x):
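+            # external-weight IODMAs are recognized by burstMode == "wrap";
+            # all other nodes fall through (None), so filter() drops them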
+            if x.op_type == "IODMA":
+                burst_mode = get_by_name(x.attribute, "burstMode")
+                if burst_mode is not None:
+                    burst_mode = burst_mode.s.decode("UTF-8")
+                    return burst_mode == "wrap"
 
-                for i in range(len(fc_extw_nodes)):
-                    fc_weight_vi = df_model.get_tensor_valueinfo(
-                        fc_extw_nodes[i].input[1]
-                    )
-                    df_model.graph.input.insert(i + 1, fc_weight_vi)
-                    extra_df_inputs.append(fc_extw_nodes[i].input[1])
+        extw_dma_nodes = list(filter(filter_fc_extw, model.graph.node))
+        if len(extw_dma_nodes) > 0:
+            model = model.transform(ExternalizeParams())
 
-                # save model
-                df_model_dir = make_build_dir(
-                    "dataflow_partition" + str(target_partition_id) + "_"
-                )
-                df_model_filename = df_model_dir + "/df_model.onnx"
-                df_model.cleanup()
-                df_model.save(df_model_filename)
-                # remove all dataflow nodes from the non-dataflow model
-                # keep track of where the dataflow part starts
-                df_start_ind = all_nodes.index(df_nodes[0])
-
-                # get and check floorplan
-                inst = getCustomOp(df_nodes[0])
-                slr = inst.get_nodeattr("slr")
-                for node in df_nodes[1:]:
-                    inst = getCustomOp(node)
-                    assert slr == inst.get_nodeattr(
-                        "slr"
-                    ), """all nodes with
-                same partition_id must have the same slr id"""
-
-                # check that there is only one non-null mem_port per partition
-                nmemports = 0
-                mem_port = ""
-                for node in df_nodes:
-                    inst = getCustomOp(node)
-                    port = inst.get_nodeattr("mem_port")
-                    if port is not None and port != "":
-                        nmemports += 1
-                        mem_port = port
-                assert nmemports <= 1, """too many memory ports per partition"""
+        def assign_partition_id(node):
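+            # partition id -1 leaves the node in the parent graph; all
+            # fpgadataflow nodes get a non-negative partition id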
+            if node.op_type in ["GenericPartition", "StreamingDataflowPartition"]:
+                return -1
+            else:
+                backend = get_by_name(node.attribute, "backend")
+                if backend is not None and backend.s.decode("UTF-8") == "fpgadataflow":
+                    assigned_partition = get_by_name(node.attribute, "partition_id")
+                    if assigned_partition is not None:
+                        return assigned_partition.i
+                    else:
+                        return 0
+                else:
+                    return -1
 
-                for node_to_remove in df_nodes:
-                    non_df_model.graph.node.remove(node_to_remove)
-                # create StreamingDataflow node with df_in/df_out io
-                df_node = helper.make_node(
-                    "StreamingDataflowPartition",
-                    [df_in] + extra_df_inputs,
-                    [df_out],
-                    # use the model attribute to mark the df model
-                    model=df_model_filename,
-                    domain="finn.custom_op.fpgadataflow",
-                    partition_id=target_partition_id,
-                    slr=slr,
-                    mem_port=mem_port,
-                )
-                non_df_model.graph.node.insert(df_start_ind, df_node)
-                model = non_df_model
-                target_partition_id += 1
+        # first, use the generic partitioning functionality to split up the graph
+        parent_model = model.transform(
+            PartitionFromLambda(
+                partitioning=assign_partition_id, partition_dir=self.partition_model_dir
+            )
+        )
+        # change node types to StreamingDataflowPartition
+        p_nodes = parent_model.get_nodes_by_op_type("GenericPartition")
+        for partition_ind, p_node in enumerate(p_nodes):
+            # go into partition to extract some info
+            p_node_inst = getCustomOp(p_node)
+            node_model_filename = p_node_inst.get_nodeattr("model")
+            p_model = ModelWrapper(node_model_filename)
+            # check floorplan (SLR assignment per node)
+            inst = getCustomOp(p_model.graph.node[0])
+            slr = inst.get_nodeattr("slr")
+            for node in p_model.graph.node:
+                inst = getCustomOp(node)
+                assert slr == inst.get_nodeattr(
+                    "slr"
+                ), """all nodes with same partition_id must have the same slr id"""
+            # check that there is only one non-null mem_port per partition
+            nmemports = 0
+            mem_port = ""
+            for node in p_model.graph.node:
+                inst = getCustomOp(node)
+                port = inst.get_nodeattr("mem_port")
+                if port is not None and port != "":
+                    nmemports += 1
+                    mem_port = port
+            assert nmemports <= 1, """Too many memory ports per partition"""
+            # done, change node type and add info in parent graph
+            p_node.op_type = "StreamingDataflowPartition"
+            p_node.domain = "finn.custom_op.fpgadataflow"
+            new_p_node_inst = getCustomOp(p_node)
+            new_p_node_inst.set_nodeattr("partition_id", partition_ind)
+            new_p_node_inst.set_nodeattr("slr", slr)
+            new_p_node_inst.set_nodeattr("mem_port", mem_port)
 
-        return (model, False)
+        return (parent_model, False)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 74bc7395512425e6b81defe5cec4afaa4b669e90..c2ded29d24eb1c75fcba8b4053c78920a31b1d3b 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -86,11 +86,6 @@ class CreateStitchedIP(Transformation):
         self.clk_ns = clk_ns
         self.ip_name = ip_name
         self.vitis = vitis
-        if float(clk_ns) not in [5.0, 10.0, 20.0]:
-            warnings.warn(
-                """The chosen frequency may lead to failure due to clock divider
-                constraints."""
-            )
         self.has_aximm = False
         self.has_m_axis = False
         self.m_axis_idx = 0
@@ -221,6 +216,13 @@ class CreateStitchedIP(Transformation):
         ip_dirs = ["list"]
         # add RTL streamer IP
         ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
+        if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]:
+            warnings.warn(
+                """First node is not StreamingFIFO or IODMA.
+                You may experience incorrect stitched-IP rtlsim or hardware
+                behavior. It is strongly recommended to insert FIFOs prior to
+                calling CreateStitchedIP."""
+            )
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
             assert is_fpgadataflow_node(
@@ -333,8 +335,8 @@ class CreateStitchedIP(Transformation):
         # if targeting Vitis, add some properties to the IP
         if self.vitis:
             tcl.append(
-                "ipx::remove_bus_parameter FREQ_HZ "
-                "[ipx::get_bus_interfaces CLK.AP_CLK -of_objects [ipx::current_core]]"
+                "set_property value_resolve_type user [ipx::get_bus_parameters "
+                "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]"
             )
             # replace source code with dcp
             tcl.append(
diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcb66a8538fdff46214c23491f48a59459625082
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/externalize_params.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+from finn.transformation.base import Transformation
+from finn.util.basic import get_by_name
+
+
+class ExternalizeParams(Transformation):
+    """Create top-level graph inputs for IODMAs serving layers where weights are
+    marked as external using mem_mode="external"."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        graph_modified = False
+
+        def filter_fc_extw(x):
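+            # same check as in CreateDataflowPartition: wrap burst mode
+            # marks an IODMA as an external-weight DMA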
+            if x.op_type == "IODMA":
+                burst_mode = get_by_name(x.attribute, "burstMode")
+                if burst_mode is not None:
+                    burst_mode = burst_mode.s.decode("UTF-8")
+                    return burst_mode == "wrap"
+
+        dma_extw_nodes = list(filter(filter_fc_extw, model.graph.node))
+
+        for dma_extw in dma_extw_nodes:
+            extw_tensor_name = dma_extw.input[0]
+            extw_tensor_name_out = dma_extw.output[0]
+            if extw_tensor_name in [x.name for x in model.graph.input]:
+                continue
+            else:
+                extw_vi = model.get_tensor_valueinfo(extw_tensor_name)
+                assert extw_vi is not None
+                model.graph.value_info.remove(extw_vi)
+                model.graph.input.append(extw_vi)
+                iodma_init = model.get_initializer(extw_vi.name)
+                assert iodma_init is not None
+                # remove output-side initializer to get correct dataflow partitioning
+                model.graph.initializer.remove(
+                    [
+                        x
+                        for x in model.graph.initializer
+                        if x.name == extw_tensor_name_out
+                    ][0]
+                )
+                graph_modified = True
+
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index ef56db6376703ce1eb0134c173de61a562bca6e6..c8bb716922823876f5f16ffe62f17c425d49aa74 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -180,49 +180,50 @@ class InsertFIFO(Transformation):
                 n.input[0] = fifo_output_tensor.name
 
             # insert FIFO as last node, except when last node is DMA
-            if (
-                graph.node[-1].op_type != "StreamingFIFO"
-                and graph.node[-1].op_type != "IODMA"
-            ):
-                n = graph.node[-1]
-                assert (
-                    n.op_type != "TLastMarker"
-                ), """Insert tlast marker should be done
-                    after inserting the FIFOs"""
-                graph_out_name = graph.output[0].name
-                n0 = getCustomOp(n)
-                # determine fifo node attributes
-                fld_shape = n0.get_folded_output_shape()
-                dtype = n0.get_output_datatype()
-                fifo_depth = n0.get_nodeattr("outFIFODepth")
-
-                if fifo_depth <= 2:
-                    warnings.warn("Overriding output FIFO depth to 32")
-                    fifo_depth = 32
-
-                # create fifo node
-                fifo_input_tensor = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    n0.get_normal_output_shape(),
-                )
-                graph.value_info.append(fifo_input_tensor)
-                model.set_tensor_datatype(fifo_input_tensor.name, dtype)
-
-                fifo_node = oh.make_node(
-                    "StreamingFIFO",
-                    [fifo_input_tensor.name],
-                    [graph_out_name],
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                    depth=fifo_depth,
-                    folded_shape=fld_shape,
-                    dataType=str(dtype.name),
-                )
-                # insert fifo
-                graph.node.append(fifo_node)
-
-                # set fifo output tensor as new input tensor of second node
-                n.output[0] = fifo_input_tensor.name
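+            # handle every graph output: each one gets its own FIFO unless
+            # its producer is already a StreamingFIFO or an IODMA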
+            graph_out_names = [x.name for x in model.graph.output]
+            for graph_out_name in graph_out_names:
+                final_node = model.find_producer(graph_out_name)
+                if (
+                    final_node.op_type != "StreamingFIFO"
+                    and final_node.op_type != "IODMA"
+                ):
+                    assert (
+                        final_node.op_type != "TLastMarker"
+                    ), """Insert tlast marker should be done
+                        after inserting the FIFOs"""
+                    n0 = getCustomOp(final_node)
+                    # determine fifo node attributes
+                    fld_shape = n0.get_folded_output_shape()
+                    dtype = n0.get_output_datatype()
+                    fifo_depth = n0.get_nodeattr("outFIFODepth")
+
+                    if fifo_depth <= 2:
+                        warnings.warn("Overriding output FIFO depth to 32")
+                        fifo_depth = 32
+
+                    # create fifo node
+                    fifo_input_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_output_shape(),
+                    )
+                    graph.value_info.append(fifo_input_tensor)
+                    model.set_tensor_datatype(fifo_input_tensor.name, dtype)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [fifo_input_tensor.name],
+                        [graph_out_name],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        depth=fifo_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.append(fifo_node)
+
+                    # reroute the producer's output into the FIFO's input tensor
+                    final_node.output[0] = fifo_input_tensor.name
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index d4b2a1032aeb305c85ffb535ac821692ce747c18..d0ef270816c362af730a75b59be71d0457e0b8e2 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -87,6 +87,7 @@ class InsertIODMA(Transformation):
         return reshaped_w
 
     def apply(self, model):
+        modified = False
         # only makes sense for a pure fpgadataflow graph -- so we check!
         all_nodes = list(model.graph.node)
         assert all(
@@ -102,59 +103,14 @@ class InsertIODMA(Transformation):
                 all_nodes,
             )
         )
-        graph_in_name = model.graph.input[0].name
-        first_node = model.find_consumer(graph_in_name)
-        graph_out_name = model.graph.output[0].name
-        final_node = model.find_producer(graph_out_name)
-        if (
-            final_node.op_type == "IODMA"
-            and first_node.op_type == "IODMA"
-            and len(fc_extw_nodes) == 0
-        ):
-            # TODO maybe check the correctness of properties
-            return (model, False)
-        else:
-            if final_node.op_type != "IODMA":
-                out_shape = model.get_tensor_shape(graph_out_name)
-                out_dtype = model.get_tensor_datatype(graph_out_name)
-                final_node_inst = getCustomOp(final_node)
-                out_folded_shape = final_node_inst.get_folded_output_shape()
-                # take advantage of AXI stream width padding for DMA alignment
-                # (AXI streams are always padded to 8 bits)
-                # this is the width of stream input to DMA
-                padded_outstream_width = final_node_inst.get_outstream_width_padded()
-                padded_outstream_bytes = padded_outstream_width // 8
-                # determine the feasible interface width
-                transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
-                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
-                assert (
-                    intfwidth % 8 == 0
-                ), "No feasible interface width for transfer size"
-                # make new buffer
-                final_node_out = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
-                )
-                model.graph.value_info.append(final_node_out)
-                model.set_tensor_datatype(final_node_out.name, out_dtype)
-                # reroute final node output to final_node_out_name
-                final_node.output[0] = final_node_out.name
-                # FIXME: currently always using 8-bit dtypes to work around the
-                # padding problems for i/o DMA
-                dma_node = oh.make_node(
-                    "IODMA",
-                    [final_node_out.name],
-                    [graph_out_name],
-                    numInputVectors=out_folded_shape[:-1],
-                    NumChannels=padded_outstream_bytes,
-                    dataType="UINT8",
-                    intfWidth=intfwidth,
-                    streamWidth=padded_outstream_width,
-                    direction="out",
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                )
-                model.graph.node.append(dma_node)
-            if first_node.op_type != "IODMA":
+        # insert IODMAs for graph inputs
+        graph_in_names = [x.name for x in model.graph.input]
+        for graph_in_name in graph_in_names:
+            first_node = model.find_consumer(graph_in_name)
+            if first_node.op_type == "IODMA":
+                # IODMA already inserted for this input
+                continue
+            else:
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
                 first_node_inst = getCustomOp(first_node)
@@ -194,47 +150,96 @@ class InsertIODMA(Transformation):
                     backend="fpgadataflow",
                 )
                 model.graph.node.insert(0, dma_node)
-            for fc_node in fc_extw_nodes:
-                fc_inst = getCustomOp(fc_node)
-                fc_w_name = fc_node.input[1]
-                w_shape = model.get_tensor_shape(fc_w_name)
-                w_dtype = model.get_tensor_datatype(fc_w_name)
+                modified = True
+        # insert IODMAs for graph outputs
+        graph_out_names = [x.name for x in model.graph.output]
+        for graph_out_name in graph_out_names:
+            final_node = model.find_producer(graph_out_name)
+            if final_node.op_type == "IODMA":
+                continue
+            else:
+                out_shape = model.get_tensor_shape(graph_out_name)
+                out_dtype = model.get_tensor_datatype(graph_out_name)
+                final_node_inst = getCustomOp(final_node)
+                out_folded_shape = final_node_inst.get_folded_output_shape()
+                # take advantage of AXI stream width padding for DMA alignment
+                # (AXI streams are always padded to 8 bits)
+                # this is the width of stream input to DMA
+                padded_outstream_width = final_node_inst.get_outstream_width_padded()
+                padded_outstream_bytes = padded_outstream_width // 8
                 # determine the feasible interface width
-                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+                transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
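+                # the GCD is the widest width that divides both the transfer
+                # size and max_intfwidth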
                 intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                 assert (
                     intfwidth % 8 == 0
                 ), "No feasible interface width for transfer size"
-                # calculate width of stream output from DMA
-                pe = get_by_name(fc_node.attribute, "PE").i
-                simd = get_by_name(fc_node.attribute, "SIMD").i
-                streamWidth = fc_inst.get_weightstream_width_padded()
                 # make new buffer
-                W = model.get_initializer(fc_w_name)
-                iodma_mem = self.get_mem_init(W, pe, simd)
-                model.set_initializer(fc_w_name, iodma_mem)
-
-                fc_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
+                final_node_out = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
                 )
-                model.graph.value_info.append(fc_node_in)
-                model.set_tensor_datatype(fc_node_in.name, w_dtype)
-                model.set_initializer(fc_node_in.name, W)
+                model.graph.value_info.append(final_node_out)
+                model.set_tensor_datatype(final_node_out.name, out_dtype)
+                # reroute final node output to final_node_out_name
+                final_node.output[0] = final_node_out.name
+                # FIXME: currently always using 8-bit dtypes to work around the
+                # padding problems for i/o DMA
                 dma_node = oh.make_node(
                     "IODMA",
-                    [fc_w_name],
-                    [fc_node_in.name],
-                    numInputVectors=[iodma_mem.shape[0]],
-                    NumChannels=pe * simd,
-                    dataType=str(w_dtype.name),
+                    [final_node_out.name],
+                    [graph_out_name],
+                    numInputVectors=out_folded_shape[:-1],
+                    NumChannels=padded_outstream_bytes,
+                    dataType="UINT8",
                     intfWidth=intfwidth,
-                    streamWidth=streamWidth,
-                    direction="in",
-                    burstMode="wrap",
+                    streamWidth=padded_outstream_width,
+                    direction="out",
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                 )
-                fc_node.input[1] = fc_node_in.name
-                model.graph.node.insert(0, dma_node)
+                model.graph.node.append(dma_node)
+                modified = True
+
+        for fc_node in fc_extw_nodes:
+            fc_inst = getCustomOp(fc_node)
+            fc_w_name = fc_node.input[1]
+            w_shape = model.get_tensor_shape(fc_w_name)
+            w_dtype = model.get_tensor_datatype(fc_w_name)
+            # determine the feasible interface width
+            transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
+            # calculate width of stream output from DMA
+            pe = get_by_name(fc_node.attribute, "PE").i
+            simd = get_by_name(fc_node.attribute, "SIMD").i
+            streamWidth = fc_inst.get_weightstream_width_padded()
+            # make new buffer
+            W = model.get_initializer(fc_w_name)
+            iodma_mem = self.get_mem_init(W, pe, simd)
+            model.set_initializer(fc_w_name, iodma_mem)
+
+            fc_node_in = oh.make_tensor_value_info(
+                model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
+            )
+            model.graph.value_info.append(fc_node_in)
+            model.set_tensor_datatype(fc_node_in.name, w_dtype)
+            model.set_initializer(fc_node_in.name, W)
+            dma_node = oh.make_node(
+                "IODMA",
+                [fc_w_name],
+                [fc_node_in.name],
+                numInputVectors=[iodma_mem.shape[0]],
+                NumChannels=pe * simd,
+                dataType=str(w_dtype.name),
+                intfWidth=intfwidth,
+                streamWidth=streamWidth,
+                direction="in",
+                burstMode="wrap",
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+            )
+            fc_node.input[1] = fc_node_in.name
+            model.graph.node.insert(0, dma_node)
+            modified = True
+        if modified:
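+            # topologically re-sort the graph since DMA nodes were inserted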
             model = model.transform(SortGraph())
-            return (model, True)
+        return (model, modified)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index be2176a34763fdb5521a0acdfc3137fb4b4a766e..bfa2fdbf9594c52d9a3a2376312a929b9008c9ea 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -90,6 +90,7 @@ class MakePYNQDriver(Transformation):
         self.platform = platform
 
     def apply(self, model):
+
         # create a temporary folder for the generated driver
         pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
         model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
@@ -100,59 +101,100 @@ class MakePYNQDriver(Transformation):
         )
         driver_base_py = pynq_driver_dir + "/driver_base.py"
         shutil.copy(driver_base_template, driver_base_py)
-
         # extract input-output shapes from the graph
         # TODO convert this to an analysis pass?
-        i_tensor_name = model.graph.input[0].name
-        o_tensor_name = model.graph.output[0].name
-        i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
-        o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
-        i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
-        o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
-
-        first_node = model.find_consumer(i_tensor_name)
-        last_node = model.find_producer(o_tensor_name)
-        if first_node.op_type == "StreamingDataflowPartition":
-            # IODMAs and dataflow partitions have already been created
-            # extract folded i/o shapes from IODMA consumer/producer
-            first_df_model = ModelWrapper(getCustomOp(first_node).get_nodeattr("model"))
+        idt = []
+        idma_names = []
+        ishape_normal = []
+        ishape_folded = []
+        ishape_packed = []
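+        # collect per-input metadata; these lists are later substituted into
+        # the driver template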
+        for idma_ind, graph_in in enumerate(model.graph.input):
+            i_tensor_name = graph_in.name
+            # get inp tensor properties
+            i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
+            i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
+            # go down into dataflow partition to get folded shape info etc
+            # TODO consider setting these as attributes during dataflow partitioning
+            i_consumer = model.find_consumer(i_tensor_name)
+            assert (
+                i_consumer.op_type == "StreamingDataflowPartition"
+            ), """
+                Ensure CreateDataflowPartition is called before driver creation."""
+            first_df_model = ModelWrapper(getCustomOp(i_consumer).get_nodeattr("model"))
             assert (
                 first_df_model.graph.node[0].op_type == "IODMA"
             ), "First partition must hold input IODMA"
-            successors = model.find_direct_successors(first_node)
+            successors = model.find_direct_successors(i_consumer)
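+            # determine which input of the successor partition this DMA feeds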
+            successor_input_num = list(successors[0].input).index(i_consumer.output[0])
             successor_sdp = getCustomOp(successors[0])
             successor_df_model = ModelWrapper(successor_sdp.get_nodeattr("model"))
             first_node = successor_df_model.find_consumer(
-                successor_df_model.graph.input[0].name
+                successor_df_model.graph.input[successor_input_num].name
             )
-
-            last_df_model = ModelWrapper(getCustomOp(last_node).get_nodeattr("model"))
+            i_tensor_shape_folded = tuple(
+                getCustomOp(first_node).get_folded_input_shape()
+            )
+            # generate dummy folded i/o tensors and their packed versions
+            i_tensor_dummy_folded = gen_finn_dt_tensor(
+                i_tensor_dt, i_tensor_shape_folded
+            )
+            i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
+                i_tensor_dummy_folded, i_tensor_dt
+            )
+            i_tensor_shape_packed = i_tensor_dummy_packed.shape
+            # append all input tensor info to relevant lists
+            idt.append(str(i_tensor_dt))
+            ishape_normal.append(i_tensor_shape_normal)
+            ishape_folded.append(i_tensor_shape_folded)
+            ishape_packed.append(i_tensor_shape_packed)
+            idma_names.append(getCustomOp(i_consumer).get_nodeattr("instance_name"))
+
+        odt = []
+        odma_names = []
+        oshape_normal = []
+        oshape_folded = []
+        oshape_packed = []
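+        # collect per-output metadata, mirroring the input loop above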
+        for odma_ind, graph_out in enumerate(model.graph.output):
+            o_tensor_name = graph_out.name
+            # get output tensor properties
+            o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
+            o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
+            # go down into IODMA partition to get folded shape info etc
+            # TODO consider setting these as attributes during dataflow partitioning
+            o_producer = model.find_producer(o_tensor_name)
+            assert (
+                o_producer.op_type == "StreamingDataflowPartition"
+            ), """
+                Ensure CreateDataflowPartition is called before driver creation."""
+            df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model"))
             assert (
-                last_df_model.graph.node[0].op_type == "IODMA"
-            ), "Last partition must hold output IODMA"
-            predecessors = model.find_direct_predecessors(last_node)
+                df_model.graph.node[-1].op_type == "IODMA"
+            ), "Partition must hold output IODMA"
+            predecessors = model.find_direct_predecessors(o_producer)
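+            # determine which output of the predecessor partition drives this DMA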
+            predecessor_output_num = list(predecessors[0].output).index(
+                o_producer.input[0]
+            )
             predecessor_sdp = getCustomOp(predecessors[0])
             predecessor_df_model = ModelWrapper(predecessor_sdp.get_nodeattr("model"))
             last_node = predecessor_df_model.find_producer(
-                predecessor_df_model.graph.output[0].name
+                predecessor_df_model.graph.output[predecessor_output_num].name
             )
-
-        # else: transformation called before IODMA/SDP creation (legacy flow)
-        # can access folded i/o shapes directly
-        i_tensor_shape_folded = tuple(getCustomOp(first_node).get_folded_input_shape())
-        o_tensor_shape_folded = tuple(getCustomOp(last_node).get_folded_output_shape())
-
-        # generate dummy folded i/o tensors and their packed versions
-        i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded)
-        o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded)
-        i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
-            i_tensor_dummy_folded, i_tensor_dt
-        )
-        o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
-            o_tensor_dummy_folded, o_tensor_dt
-        )
-        i_tensor_shape_packed = i_tensor_dummy_packed.shape
-        o_tensor_shape_packed = o_tensor_dummy_packed.shape
+            o_tensor_shape_folded = tuple(
+                getCustomOp(last_node).get_folded_output_shape()
+            )
+            o_tensor_dummy_folded = gen_finn_dt_tensor(
+                o_tensor_dt, o_tensor_shape_folded
+            )
+            o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
+                o_tensor_dummy_folded, o_tensor_dt
+            )
+            o_tensor_shape_packed = o_tensor_dummy_packed.shape
+            # append all output tensor info to relevant lists
+            odt.append(str(o_tensor_dt))
+            oshape_normal.append(o_tensor_shape_normal)
+            oshape_folded.append(o_tensor_shape_folded)
+            oshape_packed.append(o_tensor_shape_packed)
+            odma_names.append(getCustomOp(o_producer).get_nodeattr("instance_name"))
 
         # generate external weights npy files
         weights_dir = pynq_driver_dir + "/runtime_weights"
@@ -166,47 +208,50 @@ class MakePYNQDriver(Transformation):
                 node.op_type == "StreamingDataflowPartition"
             ), "CreateDataflowPartition needs to be applied before driver generation"
 
-            producer = model.find_producer(node.input[0])
-            init_tensor = model.get_initializer(node.input[0])
+            if len(node.input) > 0:
+                producer = model.find_producer(node.input[0])
+                init_tensor = model.get_initializer(node.input[0])
+            else:
+                producer = None
+                init_tensor = None
 
             if producer is None:  # input dma?
-                idma_name = "idma" + str(idma_idx)
-                if init_tensor is not None:  # input weights dma?
+                sdp_inst = getCustomOp(node)
+                idma_name = sdp_inst.get_nodeattr("instance_name")
+                df_model = ModelWrapper(sdp_inst.get_nodeattr("model"))
+                assert df_model.graph.node[0].op_type == "IODMA"
+                iodma_node = getCustomOp(df_model.graph.node[0])
+                if iodma_node.get_nodeattr("burstMode") == "wrap":  # input weights dma?
+                    init_tensor = df_model.get_initializer(
+                        iodma_node.onnx_node.input[0]
+                    )
                     ext_weight_dma_cnt += 1
-                    w_dtype = model.get_tensor_datatype(node.input[0])
+                    w_dtype = df_model.get_tensor_datatype(
+                        iodma_node.onnx_node.input[0]
+                    )
                     init_external_tensor = to_external_tensor(init_tensor, w_dtype)
                     np.save(
                         weights_dir + "/" + idma_name + ".npy", init_external_tensor
                     )
-                else:
-                    net_input_name = idma_name
-
                 idma_idx += 1
 
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = template_driver.pynq_driver_template
 
-        def mss(x, batch_var_name="1"):
-            # "make shape string"
-            # for a shape like (1, ...) emit a string (N, ...)
-            # where N is the default value for batch_var_name
-            # this lets the driver work with a batch of samples at once
-            ret = str(x)
-            ret = ret.replace("(1,", "(%s," % batch_var_name)
-            ret = ret.replace("[1,", "[%s," % batch_var_name)
-            return ret
-
         driver = driver.replace("$PLATFORM$", self.platform)
-        driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
-        driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
-        driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
-        driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
-        driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
-        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
-        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
-        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
-        driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name)
+        driver = driver.replace("$INPUT_FINN_DATATYPE$", str(idt).replace("'", ""))
+        driver = driver.replace("$INPUT_SHAPE_NORMAL$", str(ishape_normal))
+        driver = driver.replace("$INPUT_SHAPE_FOLDED$", str(ishape_folded))
+        driver = driver.replace("$INPUT_SHAPE_PACKED$", str(ishape_packed))
+        driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(odt).replace("'", ""))
+        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", str(oshape_normal))
+        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", str(oshape_folded))
+        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", str(oshape_packed))
+        driver = driver.replace("$INPUT_DMA_NAME$", "%s" % str(idma_names))
+        driver = driver.replace("$OUTPUT_DMA_NAME$", "%s" % str(odma_names))
+        driver = driver.replace("$NUM_INPUTS$", str(len(idma_names)))
+        driver = driver.replace("$NUM_OUTPUTS$", str(len(odma_names)))
         driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
 
         with open(driver_py, "w") as f:
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 0f4dd2673c026f4e9179c2ab6e860ca04ced68eb..84d587b6cecea63cb3be41a4a73bcc24aeb822f3 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -42,11 +42,10 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
-from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.util.basic import get_by_name, make_build_dir, pynq_part_map
+from finn.util.basic import make_build_dir, pynq_part_map
 
 from . import templates
 
@@ -54,19 +53,22 @@ from . import templates
 def collect_ip_dirs(model, ipstitch_path):
     # collect list of all IP dirs
     ip_dirs = []
+    need_memstreamer = False
     for node in model.graph.node:
-        ip_dir_attribute = get_by_name(node.attribute, "ip_path")
-        assert (
-            ip_dir_attribute is not None
-        ), """Node attribute "ip_path" is
-        empty. Please run transformation HLSSynth_ipgen first."""
-        ip_dir_value = ip_dir_attribute.s.decode("UTF-8")
+        node_inst = getCustomOp(node)
+        ip_dir_value = node_inst.get_nodeattr("ip_path")
         assert os.path.isdir(
             ip_dir_value
         ), """The directory that should
         contain the generated ip blocks doesn't exist."""
         ip_dirs += [ip_dir_value]
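+        # decoupled mem_mode layers stream their weights via the memstream RTL IP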
+        if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+            if node_inst.get_nodeattr("mem_mode") == "decoupled":
+                need_memstreamer = True
     ip_dirs += [ipstitch_path + "/ip"]
+    if need_memstreamer:
+        # add RTL streamer IP
+        ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
     return ip_dirs
 
 
@@ -142,16 +144,21 @@ class MakeZYNQProject(Transformation):
             # assume only one connection from each ip to the next
             # all aximm allocated to DDR[0]
             # all kernels allocated to SLR0
-            producer = model.find_producer(node.input[0])
+            if len(node.input) == 0:
+                producer = None
+            else:
+                producer = model.find_producer(node.input[0])
             consumer = model.find_consumers(node.output[0])
             # define kernel instances
             # name kernels connected to graph inputs as idmaxx
-            # name kernels connected to graph inputs as odmaxx
+            # name kernels connected to graph outputs as odmaxx
             if producer is None or consumer is None:
                 if producer is None:
                     instance_names[node.name] = "idma" + str(idma_idx)
+                    idma_idx += 1
                 elif consumer is None:
                     instance_names[node.name] = "odma" + str(odma_idx)
+                    odma_idx += 1
                 config.append(
                     "create_bd_cell -type ip -vlnv %s %s"
                     % (vivado_stitch_vlnv, instance_names[node.name])
@@ -176,7 +183,7 @@ class MakeZYNQProject(Transformation):
                     "assign_axi_addr_proc %s/%s"
                     % (instance_names[node.name], axilite_intf_name)
                 )
-                idma_idx += 1
+
                 aximm_idx += 1
                 axilite_idx += 1
             else:
@@ -197,6 +204,7 @@ class MakeZYNQProject(Transformation):
                         % (instance_names[node.name], axilite_intf_name)
                     )
                     axilite_idx += 1
+            sdp_node.set_nodeattr("instance_name", instance_names[node.name])
 
             config.append(
                 "connect_bd_net [get_bd_pins %s/ap_clk] "
@@ -295,12 +303,19 @@ class ZynqBuild(Transformation):
 
     """
 
-    def __init__(self, platform, period_ns, enable_debug=False):
+    def __init__(
+        self,
+        platform,
+        period_ns,
+        enable_debug=False,
+        partition_model_dir=None,
+    ):
         super().__init__()
         self.fpga_part = pynq_part_map[platform]
         self.period_ns = period_ns
         self.platform = platform
         self.enable_debug = enable_debug
+        self.partition_model_dir = partition_model_dir
 
     def apply(self, model):
         # first infer layouts
@@ -310,7 +325,7 @@ class ZynqBuild(Transformation):
             InsertIODMA(64),
             InsertDWC(),
             Floorplan(),
-            CreateDataflowPartition(),
+            CreateDataflowPartition(partition_model_dir=self.partition_model_dir),
         ]
         for trn in prep_transforms:
             model = model.transform(trn)
@@ -332,7 +347,7 @@ class ZynqBuild(Transformation):
             kernel_model = kernel_model.transform(HLSSynthIP())
             kernel_model = kernel_model.transform(
                 CreateStitchedIP(
-                    self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True
+                    self.fpga_part, self.period_ns, sdp_node.onnx_node.name, False
                 )
             )
             kernel_model.set_metadata_prop("platform", "zynq-iodma")
@@ -345,6 +360,4 @@ class ZynqBuild(Transformation):
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "zynq-iodma")
 
-        # create driver
-        model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index c06c34574aa22a23d1307232b0fd8e65224f1983..39eb049565475b462ea0df9d88b46e3598e6cdd9 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -87,9 +87,14 @@ class RemoveShallowFIFOs(Transformation):
     def apply(self, model):
         shallow_fifos = []
         for node in model.graph.node:
+            if len(node.input) > 0:
+                is_first_node = model.find_producer(node.input[0]) is None
+            else:
+                is_first_node = True
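+            # FIFOs that sit directly on graph inputs are kept, even if shallow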
             if (
                 node.op_type == "StreamingFIFO"
                 and getCustomOp(node).get_nodeattr("depth") <= self.shallow_threshold
+                and (not is_first_node)
             ):
                 # bypass shallow fifos
                 shallow_fifos.append(node)
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 914dda9554395fc89cac8692e13339ae3ce9baf7..64d7a080724820d58a026bafbe74a4d7567b2179 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
 import warnings
 
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
@@ -154,9 +155,16 @@ class SetFolding(Transformation):
                     pe = node_inst.get_nodeattr("PE")
                     swu_node_inst.set_nodeattr("SIMD", pe)
                 else:
-                    raise Exception(
-                        "Expected SWU on DW op input, found " + swu_node.op_type
-                    )
+                    if op_type == "Vector_Vector_Activate_Batch":
+                        ksize = np.prod(node_inst.get_nodeattr("Kernel"))
+                    elif op_type == "Pool_Batch":
+                        ksize = node_inst.get_nodeattr("KernelSize")
+                    else:
+                        raise Exception("Undefined edge case for %s" % op_type)
+                    if ksize != 1:  # pointwise vvau/pool lack a SWU
+                        raise Exception(
+                            "Expected SWU on DW op input, found " + swu_node.op_type
+                        )
             elif op_type in simd_ops:
                 if op_type == "ConvolutionInputGenerator":
                     depthwise = node_inst.get_nodeattr("depthwise")
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
index 5265835dd2530a5c93ceefbef629a43d6f33de52..62b9bfb1969114ad858f50cb7eb0137aa12135ed 100644
--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -79,7 +79,10 @@ io_shape_dict = {
     "ishape_packed" : $INPUT_SHAPE_PACKED$,
     "oshape_packed" : $OUTPUT_SHAPE_PACKED$,
     "input_dma_name" : $INPUT_DMA_NAME$,
-    "number_of_external_weights": $EXT_WEIGHT_NUM$
+    "output_dma_name" : $OUTPUT_DMA_NAME$,
+    "number_of_external_weights": $EXT_WEIGHT_NUM$,
+    "num_inputs" : $NUM_INPUTS$,
+    "num_outputs" : $NUM_OUTPUTS$,
 }
 
 if __name__ == "__main__":
@@ -88,8 +91,8 @@ if __name__ == "__main__":
     parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="$PLATFORM$")
     parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1)
     parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
-    parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy")
-    parser.add_argument('--outputfile', help='name of output npy file (i.e. "output.npy")', default="output.npy")
+    parser.add_argument('--inputfile', help='name(s) of input npy file(s) (e.g. "input.npy")', nargs="*", type=str, default=["input.npy"])
+    parser.add_argument('--outputfile', help='name(s) of output npy file(s) (e.g. "output.npy")', nargs="*", type=str, default=["output.npy"])
     parser.add_argument('--runtime_weight_dir', help='path to folder containing runtime-writable .dat weights', default="runtime_weights/")
     # parse arguments
     args = parser.parse_args()
@@ -111,16 +114,13 @@ if __name__ == "__main__":
     # for the remote execution the data from the input npy file has to be loaded,
     # packed and copied to the PYNQ buffer
     if exec_mode == "execute":
-        # remove old output file to prevent reusing old output
-        # in case execution fails
-        try:
-            os.remove(outputfile)
-        except FileNotFoundError:
-            pass
-        # load desired input .npy file
-        ibuf_normal = np.load(inputfile)
+        # load desired input .npy file(s)
+        ibuf_normal = []
+        for ifn in inputfile:
+            ibuf_normal.append(np.load(ifn))
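+        # execute with a list of input buffers; outputs are returned as a list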
         obuf_normal = accel.execute(ibuf_normal)
-        np.save(outputfile, obuf_normal)
+        for o, obuf in enumerate(obuf_normal):
+            np.save(outputfile[o], obuf)
     elif exec_mode == "throughput_test":
         # remove old metrics file
         try:
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index 502b6f2bffd0d64980ae911d28b845ad90633a44..a2865321418343efbfdae12c111ba4334ecfee28 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -43,7 +43,6 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
-from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.general import (
     GiveReadableTensorNames,
@@ -207,7 +206,10 @@ class VitisLink(Transformation):
             # has axis, aximm and axilite
             # everything else is axis-only
             # assume only one connection from each ip to the next
-            producer = model.find_producer(node.input[0])
+            if len(node.input) == 0:
+                producer = None
+            else:
+                producer = model.find_producer(node.input[0])
             consumer = model.find_consumers(node.output[0])
             # define kernel instances
             # name kernels connected to graph inputs as idmaxx
@@ -223,6 +225,7 @@ class VitisLink(Transformation):
             else:
                 instance_names[node.name] = node.name
                 config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+            sdp_node.set_nodeattr("instance_name", instance_names[node.name])
             # explicitly assign SLRs if the slr attribute is not -1
             node_slr = sdp_node.get_nodeattr("slr")
             if node_slr != -1:
@@ -375,6 +378,7 @@ class VitisBuild(Transformation):
         enable_debug=False,
         floorplan_file=None,
         enable_link=True,
+        partition_model_dir=None,
     ):
         super().__init__()
         self.fpga_part = fpga_part
@@ -384,6 +388,7 @@ class VitisBuild(Transformation):
         self.enable_debug = enable_debug
         self.floorplan_file = floorplan_file
         self.enable_link = enable_link
+        self.partition_model_dir = partition_model_dir
 
     def apply(self, model):
         _check_vitis_envvars()
@@ -398,7 +403,9 @@ class VitisBuild(Transformation):
 
         model = model.transform(Floorplan(floorplan=self.floorplan_file))
 
-        model = model.transform(CreateDataflowPartition())
+        model = model.transform(
+            CreateDataflowPartition(partition_model_dir=self.partition_model_dir)
+        )
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
 
@@ -439,6 +446,4 @@ class VitisBuild(Transformation):
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "alveo")
 
-        # create driver
-        model = model.transform(MakePYNQDriver(platform="alveo"))
         return (model, False)
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 1e2830356fe0133038caaa1dbc43f97ca98378d1..cf712b38054c78c6e414ad914ab67378daec5d12 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -308,8 +308,8 @@ class Absorb1BitMulIntoConv(Transformation):
 
 
 class AbsorbTransposeIntoMultiThreshold(Transformation):
-    """Change (NCHWTranspose -> MultiThreshold -> NHWCTranspose) to (MultiThreshold)
-    with NHWC mode. For (NCHWTranspose -> MultiThreshold) move Transpose past MT."""
+    """For (NCHWTranspose -> MultiThreshold) move Transpose past MultiThreshold
+    and set its data_layout mode to NHWC."""
 
     def apply(self, model):
         graph = model.graph
@@ -321,43 +321,43 @@ class AbsorbTransposeIntoMultiThreshold(Transformation):
                 perms = list(get_by_name(n.attribute, "perm").ints)
                 if perms == [0, 3, 1, 2]:
                     mt_cand = model.find_consumer(n.output[0])
-                    if mt_cand.op_type == "MultiThreshold" and not model.is_fork_node(
-                        mt_cand
+                    if (
+                        mt_cand is not None
+                        and mt_cand.op_type == "MultiThreshold"
+                        # and not model.is_fork_node(mt_cand)
                     ):
-                        final_t_cand = model.find_consumer(mt_cand.output[0])
-                        if final_t_cand.op_type == "Transpose":
-                            perms = list(
-                                get_by_name(final_t_cand.attribute, "perm").ints
-                            )
-                            if perms == [0, 2, 3, 1]:
-                                mt = getCustomOp(mt_cand)
-                                mt.set_nodeattr("data_layout", "NHWC")
-                                # get rid of tranpose nodes, wire MT directly
-                                mt_cand.input[0] = n.input[0]
-                                mt_cand.output[0] = final_t_cand.output[0]
-                                graph.node.remove(n)
-                                graph.node.remove(final_t_cand)
-                                graph_modified = True
+                        final_t_cands = model.find_consumers(mt_cand.output[0])
+                        mt = getCustomOp(mt_cand)
+                        mt.set_nodeattr("data_layout", "NHWC")
+                        # get rid of the first Transpose node
+                        mt_cand.input[0] = n.input[0]
+                        graph.node.remove(n)
+                        # fix output shape for MultiThreshold
+                        mt_orig_oshape = model.get_tensor_shape(mt_cand.output[0])
+                        mt_ishape = model.get_tensor_shape(mt_cand.input[0])
+                        model.set_tensor_shape(mt_cand.output[0], mt_ishape)
+                        # re-insert Transpose behind MultiThreshold
+                        transpose_output = model.make_new_valueinfo_name()
+                        new_transpose = oh.make_node(
+                            "Transpose",
+                            [mt_cand.output[0]],
+                            [transpose_output],
+                            perm=[0, 3, 1, 2],
+                        )
+                        graph.node.insert(node_ind + 1, new_transpose)
+                        if final_t_cands is not None:
+                            # rewire next nodes' inputs
+                            for final_t_cand in final_t_cands:
+                                final_t_cand.input[0] = transpose_output
                         else:
-                            mt = getCustomOp(mt_cand)
-                            mt.set_nodeattr("data_layout", "NHWC")
-                            # get rid of first tranpose node
-                            mt_cand.input[0] = n.input[0]
-                            graph.node.remove(n)
-                            # fix output shape for MultiThreshold
-                            mt_ishape = model.get_tensor_shape(mt_cand.input[0])
+                            # replace graph top-level output
+                            get_by_name(
+                                model.graph.output, mt_cand.output[0]
+                            ).name = transpose_output
                             model.set_tensor_shape(mt_cand.output[0], mt_ishape)
-                            # re-insert Transpose behind MultiThreshold
-                            transpose_output = model.make_new_valueinfo_name()
-                            new_transpose = oh.make_node(
-                                "Transpose",
-                                [mt_cand.output[0]],
-                                [transpose_output],
-                                perm=[0, 3, 1, 2],
-                            )
-                            graph.node.insert(node_ind + 1, new_transpose)
-                            final_t_cand.input[0] = transpose_output
-                            graph_modified = True
+                        # set value_info shape for transpose output
+                        model.set_tensor_shape(transpose_output, mt_orig_oshape)
+                        graph_modified = True
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
@@ -531,11 +531,20 @@ class AbsorbConsecutiveTransposes(Transformation):
                             # TODO implement this to allow for forks as producers
                             consumers = model.find_direct_successors(next_node)
                             prod = model.find_producer(n.input[0])
-                            for cons in consumers:
-                                for cons_in in cons.input:
-                                    if cons_in == next_node.output[0]:
-                                        prod.output[0] = cons_in
-                                        break
+                            if prod is not None:
+                                for cons in consumers:
+                                    for cons_in in cons.input:
+                                        if cons_in == next_node.output[0]:
+                                            prod.output[0] = cons_in
+                                            break
+                            else:
+                                # n.input[0] is top-level graph input
+                                # wire consumers directly to that
+                                for cons in consumers:
+                                    for i, iname in enumerate(cons.input):
+                                        if iname == next_node.output[0]:
+                                            cons.input[i] = n.input[0]
+
                             # remove both transposes
                             graph.node.remove(n)
                             graph.node.remove(next_node)
@@ -544,3 +553,81 @@ class AbsorbConsecutiveTransposes(Transformation):
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class AbsorbTransposeIntoResize(Transformation):
+    """For (NCHWTranspose -> Resize) move Transpose past Resize and
+    change the Resize node's attributes accordingly."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Transpose" and not model.is_fork_node(node):
+                perms = list(get_by_name(node.attribute, "perm").ints)
+                if perms == [0, 3, 1, 2]:
+                    mt_cand = model.find_consumer(node.output[0])
+                    if mt_cand is not None and mt_cand.op_type == "Resize":
+                        mode = get_by_name(mt_cand.attribute, "mode").s.decode("ascii")
+                        # skip if mode is not nearest
+                        if mode != "nearest":
+                            continue
+                        # if sizes specified, turn into scales
+                        if len(mt_cand.input) > 3:
+                            sizes = model.get_initializer(mt_cand.input[3])
+                        else:
+                            sizes = None
+                        if sizes is not None:
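+                            # ONNX Resize defines scales as output size / input size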
+                            ishape = model.get_tensor_shape(mt_cand.input[0])
+                            ns, cs, hs, ws = sizes / np.asarray(ishape)
+                            model.set_initializer(
+                                mt_cand.input[2], np.asarray([ns, cs, hs, ws])
+                            )
+                            mt_cand.input.remove(mt_cand.input[3])
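+                            # drop the now-redundant sizes input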
+                        # scales already specified, transpose indices to NHWC
+                        scales = model.get_initializer(mt_cand.input[2])
+                        assert scales is not None
+                        ns, cs, hs, ws = scales
+                        model.set_initializer(
+                            mt_cand.input[2], np.asarray([ns, hs, ws, cs])
+                        )
+                        # get rid of the first Transpose node
+                        mt_cand.input[0] = node.input[0]
+                        graph.node.remove(node)
+                        is_last_node = mt_cand.output[0] in [
+                            x.name for x in model.graph.output
+                        ]
+
+                        new_tensor_name = model.make_new_valueinfo_name()
+                        if is_last_node:
+                            trans_input = new_tensor_name
+                            trans_output = mt_cand.output[0]
+                        else:
+                            trans_input = mt_cand.output[0]
+                            trans_output = new_tensor_name
+                        # fix tensor shapes for Resize and Transpose
+                        # n, c, h, w = model.get_tensor_shape(mt_cand.input[0])
+                        n, c, hx, wx = model.get_tensor_shape(mt_cand.output[0])
+                        model.set_tensor_shape(trans_input, (n, hx, wx, c))
+                        model.set_tensor_shape(trans_output, (n, c, hx, wx))
+                        # re-insert Transpose behind Resize
+                        new_transpose = oh.make_node(
+                            "Transpose",
+                            [trans_input],
+                            [trans_output],
+                            perm=[0, 3, 1, 2],
+                        )
+                        graph.node.insert(node_ind + 1, new_transpose)
+                        # rewire nodes
+                        final_t_cands = model.find_consumers(mt_cand.output[0])
+                        if final_t_cands is not None:
+                            # rewire next nodes' inputs
+                            for final_t_cand in final_t_cands:
+                                final_t_cand.input[0] = trans_output
+                        mt_cand.output[0] = trans_input
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 1b22f474abe3f59ac91551efa3661b2612442776..cd44e115eed42d1c0529b86a49e8855ff7c492ce 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -594,11 +594,17 @@ class MoveScalarLinearPastInvariants(Transformation):
         nodes = [n for n in graph.node]
         for n in nodes:
             node_ind += 1
+            is_nearest_neighbor_resample = False
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                # extract the mode attribute to detect nearest-neighbor resampling
+                mode = get_by_name(n.attribute, "mode").s.decode("ascii")
+                is_nearest_neighbor_resample = mode == "nearest"
             if (
                 n.op_type == "GlobalAveragePool"
                 or n.op_type == "Reshape"
                 or n.op_type == "Transpose"
                 or n.op_type == "Flatten"
+                or is_nearest_neighbor_resample
             ):
                 in0 = n.input[0]
                 if in0 is None:
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 00a9fa721a320a8b70ee913e878955b9caddc3bf..6790485ceab373cd539727aefc59fa8999b3b192 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -63,6 +63,7 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
@@ -521,14 +522,6 @@ class TestEnd2End:
             topology, wbits, abits, return_topk=1
         )
         y = execute_parent(parent_chkpt, rtlsim_chkpt, input_tensor_npy)
-        model = ModelWrapper(rtlsim_chkpt)
-        perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim")
-        # warnings.warn("Estimated & rtlsim performance: " + str(perf))
-        # for (k, v) in perf.items():
-        #    update_dashboard_data(topology, wbits, abits, k, v)
-        update_dashboard_data(
-            topology, wbits, abits, "cycles_rtlsim", perf["cycles_rtlsim"]
-        )
         assert np.isclose(y, output_tensor_npy).all()
 
     @pytest.mark.slow
@@ -541,12 +534,17 @@ class TestEnd2End:
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         n_nodes = len(model.graph.node)
         perf_est = model.analysis(dataflow_performance)
-        latency = int(model.get_metadata_prop("cycles_rtlsim"))
+        ret_b1 = throughput_test_rtlsim(model, batchsize=1)
+        latency = int(ret_b1["cycles"])
         cycles_per_sample_est = perf_est["max_cycles"]
         batchsize = 2 * n_nodes
         ret = throughput_test_rtlsim(model, batchsize=batchsize)
         res_cycles = ret["cycles"]
         est_cycles = latency + cycles_per_sample_est * batchsize
+        # warnings.warn("Estimated & rtlsim performance: " + str(perf))
+        # for (k, v) in perf.items():
+        #    update_dashboard_data(topology, wbits, abits, k, v)
+        update_dashboard_data(topology, wbits, abits, "cycles_rtlsim", latency)
         assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15
 
     @pytest.mark.slow
@@ -588,9 +586,22 @@ class TestEnd2End:
         update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
         model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind))
 
+    @pytest.mark.slow
+    @pytest.mark.vivado
+    @pytest.mark.vitis
+    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    def test_make_pynq_driver(self, topology, wbits, abits, kind):
+        if kind == "alveo" and ("VITIS_PATH" not in os.environ):
+            pytest.skip("VITIS_PATH not set")
+        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + kind)
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        kind_to_driver_platform = {"zynq": "zynq-iodma", "alveo": "alveo"}
+        model = model.transform(MakePYNQDriver(kind_to_driver_platform[kind]))
+        model.save(get_checkpoint_name(topology, wbits, abits, "driver_" + kind))
+
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
     def test_deploy(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + kind)
+        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "driver_" + kind)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         cfg = get_build_env(kind, target_clk_ns)
         if cfg["ip"] == "":
diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py
index 1289b02636f030397075a9f580ed0977cd465a88..760c77ea406386ce4886b21bcb042808984dcb7f 100644
--- a/tests/end2end/test_end2end_mobilenet_v1.py
+++ b/tests/end2end/test_end2end_mobilenet_v1.py
@@ -198,6 +198,7 @@ def test_end2end_mobilenet_lowering():
     )
     model = model.transform(LowerConvsToMatMul())
     model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(absorb.AbsorbConsecutiveTransposes())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index 37a1c8d8486a535c8ff87f4b06905b3059bba35a..3357ee6d6c1e540818549f2d0df8b8554690ca3c 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -70,6 +70,7 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation):
     model = model.transform(LowerConvsToMatMul())
     model = model.transform(MakeMaxPoolNHWC())
     model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(absorb.AbsorbConsecutiveTransposes())
     model = model.transform(ConvertBipolarMatMulToXnorPopcount())
     model = model.transform(Streamline())
     model = model.transform(InferDataLayouts())
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index b87241de56870cad70d08583b24292e0da91109e..012bc3e2e140c3fa63729584629613e3046f8838 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -43,6 +43,7 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
@@ -216,6 +217,7 @@ def test_runtime_thresholds_single_layer():
     old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
     old_weight_stream = list(old_weight_stream)
     # need to create stitched IP for runtime weight testing
+    model = model.transform(InsertFIFO(True))
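+    # newly inserted FIFO nodes need unique names before IP generation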
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7e06adb816cfa664187f39a9567d4f742e4043b
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import os
+import torch
+from brevitas.export import FINNManager
+from torch import nn
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.streamline.absorb as absorb
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.base import Transformation
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.convert_to_hls_layers import InferUpsample
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.make_input_chanlast import MakeInputChannelsLast
+
+tmpdir = os.environ["FINN_BUILD_DIR"]
+
+
+class ForceDataTypeForTensors(Transformation):
+    """
+    Forces a certain datatype for all tensors in a model.
+    """
+
+    def __init__(self, dType=DataType.INT8):
+        super().__init__()
+        self._dType = dType
+
+    def apply(self, model):
+        graph = model.graph
+        for n in graph.node:
+            for inp in n.input:
+                model.set_tensor_datatype(inp, self._dType)
+            for out in n.output:
+                model.set_tensor_datatype(out, self._dType)
+
+        return model, False
+
+
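+# Transpose permutations for converting between NCHW and NHWC layouts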
+_to_chan_last_args = (0, 2, 3, 1)
+_to_chan_first_args = (0, 3, 1, 2)
+
+
+class TransposeUpsampleIO(Transformation):
+    """
+    Converts the inputs and outputs of all Upsample and Resize nodes
+    from NCHW to NHWC layout.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        for n in graph.node:
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                # Set input shape
+                inp = n.input[0]
+                NCHW_shape = model.get_tensor_shape(inp)
+                NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args]
+                model.set_tensor_shape(inp, NHWC_shape)
+                # Set output shape
+                out = n.output[0]
+                NCHW_shape = model.get_tensor_shape(out)
+                NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args]
+                model.set_tensor_shape(out, NHWC_shape)
+        return model, False
+
+
+class PyTorchTestModel(nn.Module):
+    def __init__(self, upscale_factor=2):
+        super(PyTorchTestModel, self).__init__()
+        self.m = nn.Upsample(
+            scale_factor=upscale_factor,
+            mode="nearest",
+        )
+
+    def forward(self, x):
+        x = self.m(x)
+        return x
+
+
+# Parameter datatype
+@pytest.mark.parametrize("dt", [DataType.INT8])
+# Width/height of square input feature map
+@pytest.mark.parametrize("IFMDim", [3, 5])
+# Upscaling factor
+@pytest.mark.parametrize("scale", [2, 3])
+# Number of input/output channels
+@pytest.mark.parametrize("NumChannels", [4])
+# Execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode):
+    atol = 1e-3
+    # Create the test model and inputs for it
+    torch_model = PyTorchTestModel(upscale_factor=scale)
+    input_shape = (1, NumChannels, IFMDim, IFMDim)
+    test_in = torch.arange(0, np.prod(np.asarray(input_shape)))
+    # Limit the input to values valid for the given datatype
+    test_in %= dt.max() - dt.min() + 1
+    test_in += dt.min()
+    # Additionally, shift the sequence so it starts with 0, for convenience.
+    test_in = torch.roll(test_in, dt.min())
+    test_in = test_in.view(*input_shape).type(torch.float32)
+
+    # Compute the golden PyTorch and ONNX outputs
+    golden_torch_float = torch_model(test_in)
+    export_path = f"{tmpdir}/Upsample_exported.onnx"
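+    # export through Brevitas' FINNManager; opset 11 maps nn.Upsample to an ONNX Resize node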
+    FINNManager.export(
+        torch_model, input_shape=input_shape, export_path=export_path, opset_version=11
+    )
+    model = ModelWrapper(export_path)
+    input_dict = {model.graph.input[0].name: test_in.numpy()}
+    golden_output_dict = oxe.execute_onnx(model, input_dict, True)
+    golden_result = golden_output_dict[model.graph.output[0].name]
+
+    # Make sure PyTorch and ONNX match
+    pyTorch_onnx_match = np.isclose(golden_result, golden_torch_float).all()
+    assert pyTorch_onnx_match, "ONNX and PyTorch upsampling output don't match."
+
+    # Prep model for execution
+    model = ModelWrapper(export_path)
+    # TransposeUpsampleIO is kept as a manual alternative to MakeInputChannelsLast:
+    # model = model.transform(TransposeUpsampleIO())
+    model = model.transform(MakeInputChannelsLast())
+    model = model.transform(InferDataLayouts())
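+    # fold the layout-changing Transpose into the Resize node so it runs channels-last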
+    model = model.transform(absorb.AbsorbTransposeIntoResize())
+    model = model.transform(InferShapes())
+    model = model.transform(ForceDataTypeForTensors(dType=dt))
+    model = model.transform(GiveUniqueNodeNames())
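+    # convert the Resize node into the HLS UpsampleNearestNeighbour_Batch custom op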
+    model = model.transform(InferUpsample())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    # Check that all FINN nodes are UpsampleNearestNeighbour_Batch nodes
+    for n in model.get_finn_nodes():
+        node_check = n.op_type == "UpsampleNearestNeighbour_Batch"
+        assert node_check, "All FINN nodes should be UpsampleNearestNeighbour_Batch nodes."
+
+    # Prep sim
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 10))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode: " + exec_mode)
+
+    # Run sim
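+    # the converted model expects NHWC (channels-last) input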
+    test_in_transposed = test_in.numpy().transpose(_to_chan_last_args)
+    input_dict = {model.graph.input[0].name: test_in_transposed}
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    test_result = output_dict[model.graph.output[0].name]
+    output_matches = np.isclose(golden_result, test_result, atol=atol).all()
+
+    if exec_mode == "cppsim":
+        assert output_matches, "Cppsim output doesn't match ONNX/PyTorch."
+    elif exec_mode == "rtlsim":
+        assert output_matches, "Rtlsim output doesn't match ONNX/PyTorch."
diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py
index 73b1315592af79145e1b7c6f147b3ede7e066bce..706d11114f2f08df700efd40afb8dea218efbf42 100644
--- a/tests/fpgadataflow/test_runtime_weights.py
+++ b/tests/fpgadataflow/test_runtime_weights.py
@@ -36,8 +36,8 @@ from finn.core.rtlsim_exec import rtlsim_exec
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 from finn.util.create import hls_random_mlp_maker
@@ -78,11 +78,11 @@ def test_runtime_weights_single_layer():
     os.remove("old_weights.dat")
     old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
     old_weight_stream = list(old_weight_stream)
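+    # insert FIFOs between layers and at the top-level I/O before creating the stitched IP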
+    model = model.transform(InsertFIFO(True))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
-    model = model.transform(PrepareRTLSim())
     model.set_metadata_prop("exec_mode", "rtlsim")
     in_tensor = np.asarray(range(mw), dtype=np.float32)
     # add two copies of the input tensor as the first one is just used to
diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py
index f268611c296687987fffe32293b0454109bc7db4..8f4d57d3f84dd5c5a167b0e35b775def8ed27c5d 100644
--- a/tests/fpgadataflow/test_set_folding.py
+++ b/tests/fpgadataflow/test_set_folding.py
@@ -98,7 +98,8 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes):
     model.set_tensor_datatype("outp", adt)
 
     for i in range(1, nnodes + 1):
-        model.graph.value_info.append(tensors[i])
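+        # "outp" is already a top-level graph output, so it must not be duplicated in value_info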
+        if tensors[i].name != "outp":
+            model.graph.value_info.append(tensors[i])
         model.set_initializer("weights_" + str(i - 1), W)
         model.set_initializer("thresh_" + str(i - 1), T)
         model.set_tensor_datatype("weights_" + str(i - 1), wdt)