diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 0e8988d5392810b08ed647fc0466699425430e12..0aea65fdd7999b56989239685f6606a8e1b2e618 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,8 +12,8 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=951d5e9dd25b7f38731fa539959667a86e7091b2
-BREVITAS_COMMIT=6ffefa8dbf37fdb0f44c994f34604c29fadb16b0
+FINN_BASE_COMMIT=f2e5f0582ef2b7cbc134168993816c337ca8d3a6
+BREVITAS_COMMIT=b75e0408d9759ed519296e3af29b9c16fb94b0b8
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
 PYVERILATOR_COMMIT=06c29ecf3ba0361e3d0a75c98f6918ba67bf0e27
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index dee62f09a9253380e05300dac8fa34915c20dab5..4cbf671235cbe61b7afcba9979c1259ecddf35a0 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -23,7 +23,9 @@ This behavior can be disabled (not recommended!) by setting the environment vari
 Custom Operations/Nodes
 =======================
 
-FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`.
+FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`.
+
+.. note:: See the description of `this PR <https://github.com/Xilinx/finn-base/pull/6>`_ for more on how the operator wrapper library is organized.
 
 Custom ONNX Execution Flow
 ==========================
diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst
index 72dd4beb90e87d527543ab11ac1ce1d6ac0604b3..8c43ddb424b5f690a0c266c4f31ab95dfa77e480 100644
--- a/docs/finn/source_code/finn.custom_op.rst
+++ b/docs/finn/source_code/finn.custom_op.rst
@@ -24,7 +24,7 @@ Base Class
 finn.custom\_op.im2col
 -----------------------------
 
-.. automodule:: finn.custom_op.im2col
+.. automodule:: finn.custom_op.general.im2col
    :members:
    :undoc-members:
    :show-inheritance:
@@ -32,7 +32,7 @@ finn.custom\_op.im2col
 finn.custom\_op.maxpoolnhwc
 ----------------------------------
 
-.. automodule:: finn.custom_op.maxpoolnhwc
+.. automodule:: finn.custom_op.general.maxpoolnhwc
    :members:
    :undoc-members:
    :show-inheritance:
@@ -40,7 +40,7 @@ finn.custom\_op.maxpoolnhwc
 finn.custom\_op.multithreshold
 -------------------------------------
 
-.. automodule:: finn.custom_op.multithreshold
+.. automodule:: finn.custom_op.general.multithreshold
    :members:
    :undoc-members:
    :show-inheritance:
@@ -56,7 +56,7 @@ finn.custom\_op.registry
 finn.custom\_op.streamingdataflowpartition
 -------------------------------------------------
 
-.. automodule:: finn.custom_op.streamingdataflowpartition
+.. automodule:: finn.custom_op.general.streamingdataflowpartition
    :members:
    :undoc-members:
    :show-inheritance:
@@ -64,7 +64,7 @@ finn.custom\_op.streamingdataflowpartition
 finn.custom\_op.xnorpopcount
 -----------------------------------
 
-.. automodule:: finn.custom_op.xnorpopcount
+.. automodule:: finn.custom_op.general.xnorpopcount
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb
index 9c54d6f26913e558867b2f800b424f4157f47491..9d9bc74633975076b9464dcc38da920204f05c06 100644
--- a/notebooks/advanced/1_custom_transformation_pass.ipynb
+++ b/notebooks/advanced/1_custom_transformation_pass.ipynb
@@ -398,7 +398,7 @@
       "        if is_fpgadataflow_node(node) is True:\n",
       "            try:\n",
       "                # lookup op_type in registry of CustomOps\n",
-      "                inst = registry.custom_op[op_type](node)\n",
+      "                inst = registry.getCustomOp(node)\n",
       "                # ensure that code is generated\n",
       "                assert (\n",
       "                    inst.get_nodeattr(\"code_gen_dir_cppsim\") != \"\"\n",
diff --git a/notebooks/end2end_example/tfc_end2end_verification.ipynb b/notebooks/end2end_example/tfc_end2end_verification.ipynb
index 92de7fb7e42b5d0013af31cc0fd88e34d354def8..54738c3725c0141fddc3497dee024ca90db3f3ce 100644
--- a/notebooks/end2end_example/tfc_end2end_verification.ipynb
+++ b/notebooks/end2end_example/tfc_end2end_verification.ipynb
@@ -128,7 +128,7 @@
     }
    ],
    "source": [
-    "from finn.custom_op.xnorpopcount import xnorpopcountmatmul\n",
+    "from finn.custom_op.general.xnorpopcount import xnorpopcountmatmul\n",
     "showSrc(xnorpopcountmatmul)"
    ]
   },
diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
index 201333aebdb3fc1d15464389e37326dcaf6848e0..0fcf2e382561852eb1c0b02e1d417db05057655c 100644
--- a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
+++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
@@ -41,8 +41,7 @@ def exp_cycles_per_layer(model):
     cycle_dict = {}
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
-            op_type = node.op_type
-            inst = registry.custom_op[op_type](node)
+            inst = registry.getCustomOp(node)
             cycle_dict[node.name] = inst.get_exp_cycles()
 
     return cycle_dict
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index 03b31b9c1ec51b45e17152d35d5824b6137ab4a2..39d6332aa42594528fbd5a04dd5efad2c3237e77 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -51,8 +51,7 @@ def hls_synth_res_estimation(model):
             res_dict[node.name]["LUT"] = 0
             res_dict[node.name]["DSP48E"] = 0
             res_dict[node.name]["URAM"] = 0
-            op_type = node.op_type
-            inst = registry.custom_op[op_type](node)
+            inst = registry.getCustomOp(node)
             code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
             if code_gen_dir == "":
                 warnings.warn(
diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index e52557573dab072709da4452f4e2d477e99b98c9..2c714b1f12b75e9789f1865d6737422f4d9d9a97 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -41,8 +41,45 @@ def res_estimation(model):
     res_dict = {}
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
-            op_type = node.op_type
-            inst = registry.custom_op[op_type](node)
+            inst = registry.getCustomOp(node)
             res_dict[node.name] = inst.node_res_estimation()
 
     return res_dict
+
+
+def res_estimation_complete(model):
+    """Estimates the resources needed for the given model, enumerating all
+    possible values of resource-related node attributes (resType, ram_style).
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
+
+    Returns {node name : [resource estimation(s)]}."""
+
+    res_dict = {}
+    for node in model.graph.node:
+        if is_fpgadataflow_node(node) is True:
+            op_type = node.op_type
+            inst = registry.getCustomOp(node)
+            if op_type in ["StreamingFCLayer_Batch", "Vector_Vector_Activate_Batch"]:
+                orig_restype = inst.get_nodeattr("resType")
+                res_dict[node.name] = []
+                inst.set_nodeattr("resType", "dsp")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("resType", "lut")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("resType", orig_restype)
+            elif op_type == "ConvolutionInputGenerator":
+                orig_ramstyle = inst.get_nodeattr("ram_style")
+                res_dict[node.name] = []
+                inst.set_nodeattr("ram_style", "block")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("ram_style", "distributed")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("ram_style", "ultra")
+                res_dict[node.name].append(inst.node_res_estimation())
+                inst.set_nodeattr("ram_style", orig_ramstyle)
+            else:
+                res_dict[node.name] = [inst.node_res_estimation()]
+
+    return res_dict
diff --git a/src/finn/analysis/verify_custom_nodes.py b/src/finn/analysis/verify_custom_nodes.py
index 0e05022dd0cb72291128259b983513322524b9da..9af1e9a4fe83de24f64a7e9df535bcf78f5fc234 100644
--- a/src/finn/analysis/verify_custom_nodes.py
+++ b/src/finn/analysis/verify_custom_nodes.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import finn.custom_op.registry as registry
+from finn.util.basic import is_finn_op
 
 
 def verify_nodes(model):
@@ -39,9 +40,9 @@ def verify_nodes(model):
 
     verification_dict = {}
     for node in model.graph.node:
-        if node.domain == "finn":
+        if is_finn_op(node.domain):
             op_type = node.op_type
-            inst = registry.custom_op[op_type](node)
+            inst = registry.getCustomOp(node)
             verification_dict[op_type] = inst.verify_node()
 
     return verification_dict
diff --git a/src/finn/custom_op/__init__.py b/src/finn/custom_op/__init__.py
deleted file mode 100644
index 06fc7e5659d8f55f63fe40380abac70dc74c0a4d..0000000000000000000000000000000000000000
--- a/src/finn/custom_op/__init__.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-from pkgutil import extend_path
-
-__path__ = extend_path(__path__, __name__)
-
-from finn.custom_op.registry import custom_op
-
-# make sure new CustomOp subclasses are imported here so that they get
-# registered and plug in correctly into the infrastructure
-from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
-    ConvolutionInputGenerator,
-)
-from finn.custom_op.fpgadataflow.downsampler import DownSampler
-from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
-from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
-from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
-from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
-from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
-    StreamingDataWidthConverter_Batch,
-)
-from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
-from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
-from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
-from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
-from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
-from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
-from finn.custom_op.fpgadataflow.vector_vector_activate_batch import (
-    Vector_Vector_Activate_Batch,
-)
-from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
-from finn.custom_op.fpgadataflow.iodma import IODMA
-
-
-custom_op["DownSampler"] = DownSampler
-custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
-custom_op["StreamingFCLayer_Batch"] = StreamingFCLayer_Batch
-custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
-custom_op["TLastMarker"] = TLastMarker
-custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
-custom_op["StreamingFIFO"] = StreamingFIFO
-custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
-custom_op["Pool_Batch"] = Pool_Batch
-custom_op["FMPadding_Batch"] = FMPadding_Batch
-custom_op["Thresholding_Batch"] = Thresholding_Batch
-custom_op["AddStreams_Batch"] = AddStreams_Batch
-custom_op["LabelSelect_Batch"] = LabelSelect_Batch
-custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
-custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch
-custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
-custom_op["IODMA"] = IODMA
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index a0c10f08c017db78c8aff284a7e07fa1c26d466e..068950b89ae543f5a37c28d83d87ecfa605eaab4 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -25,601 +25,49 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# namespace package, extend path
-from pkgutil import extend_path
 
-__path__ = extend_path(__path__, __name__)
-
-from abc import abstractmethod
-import numpy as np
-import os
-import subprocess
-from finn.custom_op.base import CustomOp
-from finn.util.basic import (
-    CppBuilder,
-    make_build_dir,
-    roundup_to_integer_multiple,
-    get_rtlsim_trace_depth,
+from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
+    ConvolutionInputGenerator,
 )
-from finn.util.fpgadataflow import (
-    IPGenBuilder,
-    pyverilate_get_liveness_threshold_cycles,
-    rtlsim_multi_io,
+from finn.custom_op.fpgadataflow.downsampler import DownSampler
+from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
+from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
+from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
+from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
+from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
+    StreamingDataWidthConverter_Batch,
 )
-from . import templates
-
-try:
-    from pyverilator import PyVerilator
-except ModuleNotFoundError:
-    PyVerilator = None
-
-
-class HLSCustomOp(CustomOp):
-    """HLSCustomOp class all custom ops that correspond to a finn-hlslib
-    function are based on. Contains different functions every fpgadataflow
-    custom node should have. Some as abstract methods, these have to be filled
-    when writing a new fpgadataflow custom op node."""
-
-    def __init__(self, onnx_node):
-        super().__init__(onnx_node)
-
-        self.code_gen_dict = {}
-
-        # getting templates from templates.py
-
-        # template for single node execution
-        self.docompute_template = templates.docompute_template
-
-        # templates for single node ip generation
-        # cpp file
-        self.ipgen_template = templates.ipgen_template
-        # tcl script
-        self.ipgentcl_template = templates.ipgentcl_template
-
-    def get_nodeattr_types(self):
-        return {
-            "backend": ("s", True, "fpgadataflow"),
-            "code_gen_dir_cppsim": ("s", False, ""),
-            "code_gen_dir_ipgen": ("s", False, ""),
-            "executable_path": ("s", False, ""),
-            "ipgen_path": ("s", False, ""),
-            "ip_path": ("s", False, ""),
-            "ip_vlnv": ("s", False, ""),
-            "exec_mode": ("s", False, ""),
-            "cycles_rtlsim": ("i", False, 0),
-            "cycles_estimate": ("i", False, 0),
-            "rtlsim_trace": ("s", False, ""),
-            "res_estimate": ("s", False, ""),
-            "res_hls": ("s", False, ""),
-            "res_synth": ("s", False, ""),
-            "rtlsim_so": ("s", False, ""),
-            # partitioning info
-            "partition_id": ("i", False, 0),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 2),
-            "outFIFODepth": ("i", False, 2),
-        }
-
-    def get_verilog_top_module_name(self):
-        "Return the Verilog top module name for this node."
-
-        node = self.onnx_node
-        prefixed_top_name = "%s_%s" % (node.name, node.name)
-        return prefixed_top_name
-
-    def get_verilog_top_module_intf_names(self):
-        """Return a dict of names of input and output interfaces.
-        The keys reflect the protocols each interface implements:
-        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
-        Values are lists of names:
-        's_axis' names correspond to the list of node inputs in order,
-        'm_axis' names correspond to the list of node outputs in order'
-        Each block must have at most one aximm and one axilite."""
-        intf_names = {}
-        intf_names["clk"] = ["ap_clk"]
-        intf_names["rst"] = ["ap_rst_n"]
-        intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())]
-        intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())]
-        intf_names["aximm"] = []
-        intf_names["axilite"] = []
-        return intf_names
-
-    def get_verilog_top_filename(self):
-        "Return the Verilog top module filename for this node."
-
-        verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
-            self.get_nodeattr("code_gen_dir_ipgen"),
-            self.onnx_node.name,
-            self.get_verilog_top_module_name(),
-        )
-        return verilog_file
-
-    def get_all_verilog_paths(self):
-        "Return list of all folders containing Verilog code for this node."
-
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        assert (
-            code_gen_dir != ""
-        ), """Node attribute "code_gen_dir_ipgen" is
-        not set. Please run HLSSynthIP first."""
-        verilog_path = "{}/project_{}/sol1/impl/verilog/".format(
-            code_gen_dir, self.onnx_node.name
-        )
-        # default impl only returns the HLS verilog codegen dir
-        return [verilog_path]
-
-    def get_all_verilog_filenames(self):
-        "Return list of all Verilog files used for this node."
-
-        verilog_files = []
-        verilog_paths = self.get_all_verilog_paths()
-        for verilog_path in verilog_paths:
-            for f in os.listdir(verilog_path):
-                if f.endswith(".v"):
-                    verilog_files += [f]
-        return verilog_files
-
-    def prepare_rtlsim(self):
-        """Creates a Verilator emulation library for the RTL code generated
-        for this node, sets the rtlsim_so attribute to its path and returns
-        a PyVerilator wrapper around it."""
-
-        if PyVerilator is None:
-            raise ImportError("Installation of PyVerilator is required.")
-        verilog_paths = self.get_all_verilog_paths()
-        verilog_files = self.get_all_verilog_filenames()
-        # build the Verilator emu library
-        sim = PyVerilator.build(
-            verilog_files,
-            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
-            verilog_path=verilog_paths,
-            trace_depth=get_rtlsim_trace_depth(),
-            top_module_name=self.get_verilog_top_module_name(),
-        )
-        # save generated lib filename in attribute
-        self.set_nodeattr("rtlsim_so", sim.lib._name)
-        return sim
-
-    def get_rtlsim(self):
-        """Return a PyVerilator wrapper for the Verilator emulation library
-        for this node."""
-
-        rtlsim_so = self.get_nodeattr("rtlsim_so")
-        assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library."
-        # create PyVerilator wrapper
-        sim = PyVerilator(rtlsim_so)
-        return sim
-
-    def node_res_estimation(self):
-        """Returns summarized resource estimation of BRAMs and LUTs
-        of the node as a dictionary."""
-        ret = dict()
-        ret["BRAM_18K"] = self.bram_estimation()
-        ret["BRAM_efficiency"] = self.bram_efficiency_estimation()
-        ret["LUT"] = self.lut_estimation()
-        return ret
-
-    def bram_efficiency_estimation(self):
-        """Function for BRAM efficiency estimation: actual parameter storage
-        needed divided by the allocated BRAM storage (from estimation)"""
-        return 1
-
-    def bram_estimation(self):
-        """Function for BRAM resource estimation, is member function of
-        HLSCustomOp class but has to be filled by every node"""
-        return 0
-
-    def lut_estimation(self):
-        """Function for LUT resource estimation, is member function of
-        HLSCustomOp class but has to be filled by every node"""
-        return 0
-
-    def get_exp_cycles(self):
-        """Function for estimation of expected cycles for set folding,
-        is member function of HLSCustomOp class but has to be filled
-        by every node"""
-        return 0
-
-    def code_generation_ipgen(self, model, fpgapart, clk):
-        """Generates c++ code and tcl script for ip generation."""
-        node = self.onnx_node
-
-        # generate top cpp file for ip generation
-        path = self.get_nodeattr("code_gen_dir_ipgen")
-        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
-        self.generate_params(model, path)
-        self.global_includes()
-        self.defines("ipgen")
-        self.blackboxfunction()
-        self.pragmas()
-        self.docompute()
-
-        template = self.ipgen_template
-
-        for key in self.code_gen_dict:
-            # transform list into long string separated by '\n'
-            code_gen_line = "\n".join(self.code_gen_dict[key])
-            template = template.replace(key, code_gen_line)
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w")
-        f.write(template)
-        f.close()
-        self.code_gen_dict.clear()
-
-        # generate tcl script for ip generation
-        self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)]
-        self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir]
-        self.code_gen_dict["$FPGAPART$"] = [fpgapart]
-        self.code_gen_dict["$FINNHLSLIBDIR$"] = ["/workspace/finn-hlslib"]
-        self.code_gen_dict["$TOPFXN$"] = [node.name]
-        self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
-        self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
-
-        template = self.ipgentcl_template
-
-        for key in self.code_gen_dict:
-            # transform list into long string separated by '\n'
-            code_gen_line = "\n".join(self.code_gen_dict[key])
-            template = template.replace(key, code_gen_line)
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w")
-        f.write(template)
-        f.close()
-        self.code_gen_dict.clear()
-
-    def ipgen_extra_directives(self):
-        "Return a list of extra tcl directives for HLS synthesis."
-        return []
-
-    def ipgen_singlenode_code(self):
-        """Builds the bash script for ip generation using the IPGenBuilder from
-        finn.util.fpgadataflow."""
-        node = self.onnx_node
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        builder = IPGenBuilder()
-        builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name))
-        builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name))
-        builder.build(code_gen_dir)
-        self.set_nodeattr("ipgen_path", builder.ipgen_path)
-        self.set_nodeattr("ip_path", builder.ipgen_path + "/sol1/impl/ip")
-        vlnv = "xilinx.com:hls:%s:1.0" % node.name
-        self.set_nodeattr("ip_vlnv", vlnv)
-
-    def code_generation_cppsim(self, model):
-        """Generates c++ code for simulation (cppsim)."""
-        node = self.onnx_node
-        path = self.get_nodeattr("code_gen_dir_cppsim")
-        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
-        self.generate_params(model, path)
-        self.global_includes()
-        self.defines("cppsim")
-        self.read_npy_data()
-        self.strm_decl()
-        self.pragmas()
-        self.docompute()
-        self.dataoutstrm()
-        self.save_as_npy()
-
-        template = self.docompute_template
-
-        for key in self.code_gen_dict:
-            # transform list into long string separated by '\n'
-            code_gen_line = "\n".join(self.code_gen_dict[key])
-            template = template.replace(key, code_gen_line)
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
-        f.write(template)
-        f.close()
-        self.code_gen_dict.clear()
-
-    def code_generation_ipi(self):
-        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
-        vlnv = self.get_nodeattr("ip_vlnv")
-        cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)]
-        return cmd
-
-    def compile_singlenode_code(self):
-        """Builds the bash script for compilation using the CppBuilder from
-        finn.util.basic and executes the script to produce the executable."""
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        builder = CppBuilder()
-        # to enable additional debug features please uncommand the next line
-        # builder.append_includes("-DDEBUG")
-        builder.append_includes("-I/workspace/finn/src/finn/qnn-data/cpp")
-        builder.append_includes("-I/workspace/cnpy/")
-        builder.append_includes("-I/workspace/finn-hlslib")
-        builder.append_includes("-I{}/include".format(os.environ["VIVADO_PATH"]))
-        builder.append_includes("--std=c++11")
-        builder.append_includes("-O3")
-        builder.append_sources(code_gen_dir + "/*.cpp")
-        builder.append_sources("/workspace/cnpy/cnpy.cpp")
-        builder.append_includes("-lz")
-        builder.set_executable_path(code_gen_dir + "/node_model")
-        builder.build(code_gen_dir)
-        self.set_nodeattr("executable_path", builder.executable_path)
-
-    def dynamic_input_to_npy(self, context, count):
-        """Saves input (given context) into .npy files.
-
-        Count indicates the number of inputs that have to be saved."""
-        node = self.onnx_node
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        if code_gen_dir == "":
-            raise Exception(
-                """
-Found no codegen dir for this node, did you run the prepare_cppsim transformation?
-            """
-            )
-        # create a npy file for each input of the node (in_ind is input index)
-        # assuming dynamic inputs start from 0
-        for in_ind in range(count):
-            current_input_name = node.input[in_ind]
-            # make copy before saving array
-            input_array = context[current_input_name].copy()
-            np.save(
-                os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), input_array
-            )
-
-    def npy_to_dynamic_output(self, context):
-        """Reads the output from an output.npy file generated from cppsim and
-        places its content into the context dictionary."""
-        node = self.onnx_node
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        output = np.load("{}/output.npy".format(code_gen_dir))
-        context[node.output[0]] = output
-
-    def npy_to_dynamic_outputs(self, context, npy_list):
-        """Reads the output from .npy files generated from cppsim and places
-        their content into the context dictionary.
-        npy_list is a list specifying which files to read, and its order must
-        match the order of node outputs."""
-        node = self.onnx_node
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        for i in range(len(npy_list)):
-            output = np.load("{}/{}".format(code_gen_dir, npy_list[i]))
-            context[node.output[i]] = output
-
-    def exec_precompiled_singlenode_model(self):
-        """Executes precompiled executable."""
-        executable_path = self.get_nodeattr("executable_path")
-        if executable_path == "":
-            raise Exception(
-                """
-Found no executable for this node, did you run the codegen and
-compilation transformations?
-            """
-            )
-        process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE)
-        process_execute.communicate()
-
-    def reset_rtlsim(self, sim):
-        """Sets reset input in pyverilator to zero, toggles the clock and set it
-        back to one"""
-        sim.io.ap_rst_n = 0
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
-        sim.io.ap_rst_n = 1
-
-    def toggle_clk(self, sim):
-        """Toggles the clock input in pyverilator once."""
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
-
-    def rtlsim(self, sim, inp, inp2=None):
-        """Runs the pyverilator simulation by passing the input values to the simulation,
-        toggle the clock and observing the execution time. Function contains also an
-        observation loop that can abort the simulation if no output value is produced
-        after 100 cycles."""
-
-        trace_file = self.get_nodeattr("rtlsim_trace")
-        if trace_file != "":
-            if trace_file == "default":
-                trace_file = self.onnx_node.name + ".vcd"
-            sim.start_vcd_trace(trace_file)
-        inputs = inp
-        outputs = []
-        sim.io.out_V_V_TREADY = 1
-
-        # observe if output is completely calculated
-        # observation_count will contain the number of cycles the calculation ran
-        num_out_values = self.get_number_output_values()
-        output_observed = False
-        observation_count = 0
-
-        # avoid infinite looping of simulation by aborting when there is no change in
-        # output values after 100 cycles
-        no_change_count = 0
-        old_outputs = outputs
-        liveness_threshold = pyverilate_get_liveness_threshold_cycles()
-
-        while not (output_observed):
-            sim.io.in0_V_V_TVALID = 1 if len(inputs) > 0 else 0
-            sim.io.in0_V_V_TDATA = inputs[0] if len(inputs) > 0 else 0
-            if sim.io.in0_V_V_TREADY == 1 and sim.io.in0_V_V_TVALID == 1:
-                inputs = inputs[1:]
-
-            if inp2 is not None:
-                sim.io.in1_V_V_TVALID = 1 if len(inp2) > 0 else 0
-                sim.io.in1_V_V_TDATA = inp2[0] if len(inp2) > 0 else 0
-                if sim.io.in1_V_V_TREADY == 1 and sim.io.in1_V_V_TVALID == 1:
-                    inp2 = inp2[1:]
-
-            if sim.io.out_V_V_TVALID == 1 and sim.io.out_V_V_TREADY == 1:
-                outputs = outputs + [sim.io.out_V_V_TDATA]
-            sim.io.ap_clk = 1
-            sim.io.ap_clk = 0
-
-            observation_count = observation_count + 1
-            no_change_count = no_change_count + 1
-
-            if len(outputs) == num_out_values:
-                self.set_nodeattr("cycles_rtlsim", observation_count)
-                output_observed = True
-
-            if no_change_count == liveness_threshold:
-                if old_outputs == outputs:
-                    if trace_file != "":
-                        sim.flush_vcd_trace()
-                        sim.stop_vcd_trace()
-                    raise Exception(
-                        "Error in simulation! Takes too long to produce output. "
-                        "Consider setting the LIVENESS_THRESHOLD env.var. to a "
-                        "larger value."
-                    )
-                else:
-                    no_change_count = 0
-                    old_outputs = outputs
-        if trace_file != "":
-            sim.flush_vcd_trace()
-            sim.stop_vcd_trace()
-        return outputs
-
-    def rtlsim_multi_io(self, sim, io_dict):
-        "Run rtlsim for this node, supports multiple i/o streams."
-
-        trace_file = self.get_nodeattr("rtlsim_trace")
-        if trace_file == "default":
-            trace_file = self.onnx_node.name + ".vcd"
-        num_out_values = self.get_number_output_values()
-        total_cycle_count = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file)
-        self.set_nodeattr("cycles_rtlsim", total_cycle_count)
-
-    def execute_node(self, context, graph):
-        """Executes single node using cppsim or rtlsim."""
-        mode = self.get_nodeattr("exec_mode")
-        if mode == "cppsim":
-            # save input(s)
-            self.dynamic_input_to_npy(context, 1)
-            # execute the precompiled model
-            self.exec_precompiled_singlenode_model()
-            # load output npy file
-            self.npy_to_dynamic_output(context)
-        elif mode == "rtlsim":
-            pass
-
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-    def generate_params(self, model, path):
-        """Function to generate parameters (i.e. weights and thresholds),
-        is member function of HLSCustomOp class but has to be filled
-        by every node."""
-        pass
-
-    @abstractmethod
-    def get_number_output_values(self):
-        """Function to get the number of expected output values,
-        is member function of HLSCustomOp class but has to be filled
-        by every node."""
-        pass
-
-    @abstractmethod
-    def global_includes(self):
-        """Function to set the global includes for c++ code that has to be generated
-        for cppsim or rtlsim, is member function of HLSCustomOp class but has to
-        be filled by every node."""
-        pass
-
-    @abstractmethod
-    def defines(self, var):
-        """Function to set the define commands for c++ code that has to be generated
-        for cppsim or rtlsim, is member function of HLSCustomOp class but has to
-        be filled by every node.
-
-        var: makes it possible to reuse the function for different c++ code generation.
-        I.e. if set to "ipgen" in StreamingFCLayer_Batch additional PRAGMA defines are
-        added."""
-        pass
-
-    @abstractmethod
-    def read_npy_data(self):
-        """Function to generate the commands for reading data from .npy file in c++,
-        is member function of HLSCustomOp class but has to be filled by every node."""
-        pass
-
-    @abstractmethod
-    def strm_decl(self):
-        """Function to generate the commands for the stream declaration in c++,
-        is member function of HLSCustomOp class but has to be filled
-        by every node."""
-        pass
-
-    @abstractmethod
-    def docompute(self):
-        """Function to generate the commands for the computational part of the
-        c++ code, is member function of HLSCustomOp class but has to be filled
-        by every node."""
-        pass
-
-    @abstractmethod
-    def dataoutstrm(self):
-        """Function to generate the commands for reading out data from c++ and convert
-        into npy format, is member function of HLSCustomOp class but has to be filled
-        by every node."""
-        pass
-
-    @abstractmethod
-    def save_as_npy(self):
-        """Function to generate the commands for saving data in .npy file in c++,
-        is member function of HLSCustomOp class but has to be filled by every node."""
-        pass
-
-    @abstractmethod
-    def blackboxfunction(self):
-        """Function to generate a blackbock function in c++ from which an IP block
-        will be generated, is member function of HLSCustomOp class but has to be filled
-        by every node."""
-        pass
-
-    @abstractmethod
-    def pragmas(self):
-        """Function to generate the pragma commands in c++, is member function of
-        HLSCustomOp class but has to be filled by every node."""
-        pass
-
-    def get_normal_input_shape(self):
-        """Returns normal input shape if implemented."""
-        raise Exception("get_normal_input_shape not implemented for this op")
-
-    def get_normal_output_shape(self):
-        """Returns folded output shape if implemented."""
-        raise Exception("get_normal_output_shape not implemented for this op")
-
-    def get_folded_input_shape(self):
-        """Returns folded input shape (according to synapse folding), if implemented."""
-        raise Exception("get_folded_input_shape not implemented for this op")
-
-    def get_folded_output_shape(self):
-        """Returns folded output shape (according to neuron folding), if implemented."""
-        raise Exception("get_folded_output_shape not implemented for this op")
-
-    def get_instream_width(self):
-        """Returns input stream width, if implemented."""
-        raise Exception("get_instream_width not implemented for this op")
-
-    def get_outstream_width(self):
-        """Returns output stream width, if implemented."""
-        raise Exception("get_outstream_width not implemented for this op")
-
-    def get_instream_width_padded(self):
-        """Returns input stream width padded to a multiple of 8. This is required
-        by the AXI Stream spec."""
-        in_width = self.get_instream_width()
-        return roundup_to_integer_multiple(in_width, 8)
-
-    def get_outstream_width_padded(self):
-        """Returns output stream width padded to a multiple of 8. This is required
-        by the AXI Stream spec."""
-        out_width = self.get_outstream_width()
-        return roundup_to_integer_multiple(out_width, 8)
-
-    def get_ap_int_max_w(self):
-        "Return the maximum width of any ap_int used in this module."
-        instream = self.get_instream_width()
-        outstream = self.get_outstream_width()
-        return max([instream, outstream])
+from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
+from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
+from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
+from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
+from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
+from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
+from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.vector_vector_activate_batch import (
+    Vector_Vector_Activate_Batch,
+)
+from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
+from finn.custom_op.fpgadataflow.iodma import IODMA
+
+custom_op = dict()
+
+# make sure new HLSCustomOp subclasses are imported here so that they get
+# registered and plug correctly into the infrastructure
+custom_op["DownSampler"] = DownSampler
+custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
+custom_op["StreamingFCLayer_Batch"] = StreamingFCLayer_Batch
+custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
+custom_op["TLastMarker"] = TLastMarker
+custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
+custom_op["StreamingFIFO"] = StreamingFIFO
+custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
+custom_op["Pool_Batch"] = Pool_Batch
+custom_op["FMPadding_Batch"] = FMPadding_Batch
+custom_op["Thresholding_Batch"] = Thresholding_Batch
+custom_op["AddStreams_Batch"] = AddStreams_Batch
+custom_op["LabelSelect_Batch"] = LabelSelect_Batch
+custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
+custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch
+custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
+custom_op["IODMA"] = IODMA
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 593f9f4fdf574aa2a2b4e70de5fe6ece2ce2085d..9222720543bb463f62be76e980c222194d237f44 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -31,7 +31,7 @@ import os
 import numpy as np
 
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
@@ -109,13 +109,6 @@ class AddStreams_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 88b55aaec8fa834abe274b703a404b4419571401..635f37d5695a56d7c22f2287030ccb7331ab347b 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -33,7 +33,7 @@ import numpy as np
 
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
@@ -95,11 +95,11 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         my_attrs = {
             # channelwise "map" function to apply:
             # one of cmp_le, cmp_ge, add, mul
-            "Func": ("s", False, "cmp_le"),
+            "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}),
             "PE": ("i", True, 0),
             "NumChannels": ("i", True, 0),
             # string defining memory resource type for parameters
-            "ram_style": ("s", False, "distributed"),
+            "ram_style": ("s", False, "distributed", {"distributed", "block"}),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "paramDataType": ("s", True, ""),
@@ -178,13 +178,6 @@ class ChannelwiseOp_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index d33d6c963c0c55309f7f258c9ec1d7723e112282..3f400053df8de6ec1e53e39fb5a3edee15f3ab30 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -28,11 +28,12 @@
 
 import os
 
+import math
 import numpy as np
 
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.general.im2col import compute_conv_output_dim
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
@@ -69,13 +70,18 @@ class ConvolutionInputGenerator(HLSCustomOp):
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            "depthwise": ("i", False, 0),
+            "depthwise": ("i", False, 0, {0, 1}),
             # FPGA resource type for ConvolutionInputGenerator input buffer
             # auto -- let Vivado HLS decide
             # block -- use BRAM
             # distributed -- use LUTRAM
             # ultra -- use URAM
-            "ram_style": ("s", False, "distributed"),
+            "ram_style": (
+                "s",
+                False,
+                "distributed",
+                {"auto", "block", "distributed", "ultra"},
+            ),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -194,6 +200,75 @@ class ConvolutionInputGenerator(HLSCustomOp):
 
         return int(exp_cycles)
 
+    def bram_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        k = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "block" or ram_style == "auto":
+            ram_depth = ifm_dim * ifm_ch / simd
+            if ram_depth <= 512:
+                ram_width = 36
+            elif ram_depth <= 1024:
+                ram_width = 18
+            elif ram_depth <= 2048:
+                ram_width = 9
+            elif ram_depth <= 4096:
+                ram_width = 4
+            elif ram_depth <= 8192:
+                ram_width = 2
+            else:
+                ram_width = 1
+            return int(
+                (k + stride)
+                * (
+                    math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width)
+                    * math.ceil(ifm_dim * ifm_ch / simd / ram_depth)
+                )
+            )
+        else:
+            return 0
+
+    def lut_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        k = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "distributed":
+            ram_luts = int(
+                (k + stride)
+                * (
+                    simd
+                    * self.get_input_datatype().bitwidth()
+                    * math.ceil(ifm_dim * ifm_ch / simd / 64)
+                )
+            )
+        else:
+            ram_luts = 0
+        return 300 + ram_luts
+
+    def uram_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        k = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "ultra":
+            return int(
+                (k + stride)
+                * (
+                    math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
+                    * math.ceil(ifm_dim * ifm_ch / simd / 4096)
+                )
+            )
+        else:
+            return 0
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
index 15d55653b4e431dead885d75650b1500150d8775..e7e0c00ccd0b82643dbff15a0426fdc3831bd685 100644
--- a/src/finn/custom_op/fpgadataflow/downsampler.py
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -2,7 +2,7 @@ import os
 import numpy as np
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 603fef78df561b301ffd20725febdc35daa78f6f..370c87c8618da2bb2eac5ee4c20ad86d64b03703 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -31,7 +31,7 @@ import os
 import numpy as np
 
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from onnx import helper, TensorProto
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
@@ -110,13 +110,6 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index 95ecc5f10525456e7f5a6d838e0850adaee5415f..e8efa3abb4e75830bf31cd88c8cb21f517e0a9f7 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -2,7 +2,7 @@ import os
 import numpy as np
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -28,7 +28,7 @@ class FMPadding_Batch(HLSCustomOp):
             # controls distribution of padded pixels
             # in case of uneven padding -- see FMPadding fxn
             # in hlslib
-            "PaddingStyle": ("i", False, 2),
+            "PaddingStyle": ("i", False, 2, {2, 1}),
             # shape describing input vecs per execution
             "numInputVectors": ("i", False, 1),
         }
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 56f1a9d56d9da7057e3cbe61f3d92877e58087d6..6035ad75d8037b6f93eb38700930c535a5409298 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -31,7 +31,7 @@ import os
 import numpy as np
 
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
@@ -115,13 +115,6 @@ class GlobalAccPool_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
new file mode 100644
index 0000000000000000000000000000000000000000..3431061e772e7eda310733f1a0d31f4b2db154ac
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -0,0 +1,634 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# namespace package, extend path
+
+from abc import abstractmethod
+import numpy as np
+import os
+import subprocess
+from finn.custom_op.base import CustomOp
+from finn.util.basic import (
+    CppBuilder,
+    make_build_dir,
+    roundup_to_integer_multiple,
+    get_rtlsim_trace_depth,
+)
+from finn.util.fpgadataflow import (
+    IPGenBuilder,
+    pyverilate_get_liveness_threshold_cycles,
+    rtlsim_multi_io,
+)
+from . import templates
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class HLSCustomOp(CustomOp):
+    """HLSCustomOp class all custom ops that correspond to a finn-hlslib
+    function are based on. Contains different functions every fpgadataflow
+    custom node should have. Some as abstract methods, these have to be filled
+    when writing a new fpgadataflow custom op node."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+        self.code_gen_dict = {}
+
+        # getting templates from templates.py
+
+        # template for single node execution
+        self.docompute_template = templates.docompute_template
+
+        # templates for single node ip generation
+        # cpp file
+        self.ipgen_template = templates.ipgen_template
+        # tcl script
+        self.ipgentcl_template = templates.ipgentcl_template
+
+    def get_nodeattr_types(self):
+        return {
+            "backend": ("s", True, "fpgadataflow"),
+            "code_gen_dir_cppsim": ("s", False, ""),
+            "code_gen_dir_ipgen": ("s", False, ""),
+            "executable_path": ("s", False, ""),
+            "ipgen_path": ("s", False, ""),
+            "ip_path": ("s", False, ""),
+            "ip_vlnv": ("s", False, ""),
+            "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim"}),
+            "cycles_rtlsim": ("i", False, 0),
+            "cycles_estimate": ("i", False, 0),
+            "rtlsim_trace": ("s", False, ""),
+            "res_estimate": ("s", False, ""),
+            "res_hls": ("s", False, ""),
+            "res_synth": ("s", False, ""),
+            "rtlsim_so": ("s", False, ""),
+            # partitioning info
+            "partition_id": ("i", False, 0),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 2),
+            "outFIFODepth": ("i", False, 2),
+        }
+
+    def get_verilog_top_module_name(self):
+        "Return the Verilog top module name for this node."
+
+        node = self.onnx_node
+        prefixed_top_name = "%s_%s" % (node.name, node.name)
+        return prefixed_top_name
+
+    def get_verilog_top_module_intf_names(self):
+        """Return a dict of names of input and output interfaces.
+        The keys reflect the protocols each interface implements:
+        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
+        Values are lists of names:
+        's_axis' names correspond to the list of node inputs in order,
+        'm_axis' names correspond to the list of node outputs in order.
+        Each block must have at most one aximm and one axilite."""
+        intf_names = {}
+        intf_names["clk"] = ["ap_clk"]
+        intf_names["rst"] = ["ap_rst_n"]
+        intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())]
+        intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())]
+        intf_names["aximm"] = []
+        intf_names["axilite"] = []
+        return intf_names
+
+    def get_verilog_top_filename(self):
+        "Return the Verilog top module filename for this node."
+
+        verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
+            self.get_nodeattr("code_gen_dir_ipgen"),
+            self.onnx_node.name,
+            self.get_verilog_top_module_name(),
+        )
+        return verilog_file
+
+    def get_all_verilog_paths(self):
+        "Return list of all folders containing Verilog code for this node."
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        assert (
+            code_gen_dir != ""
+        ), """Node attribute "code_gen_dir_ipgen" is
+        not set. Please run HLSSynthIP first."""
+        verilog_path = "{}/project_{}/sol1/impl/verilog/".format(
+            code_gen_dir, self.onnx_node.name
+        )
+        # default impl only returns the HLS verilog codegen dir
+        return [verilog_path]
+
+    def get_all_verilog_filenames(self):
+        "Return list of all Verilog files used for this node."
+
+        verilog_files = []
+        verilog_paths = self.get_all_verilog_paths()
+        for verilog_path in verilog_paths:
+            for f in os.listdir(verilog_path):
+                if f.endswith(".v"):
+                    verilog_files += [f]
+        return verilog_files
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+        verilog_paths = self.get_all_verilog_paths()
+        verilog_files = self.get_all_verilog_filenames()
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def get_rtlsim(self):
+        """Return a PyVerilator wrapper for the Verilator emulation library
+        for this node."""
+
+        rtlsim_so = self.get_nodeattr("rtlsim_so")
+        assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library."
+        # create PyVerilator wrapper
+        sim = PyVerilator(rtlsim_so)
+        return sim
+
+    def node_res_estimation(self):
+        """Returns summarized resource estimation of BRAMs and LUTs
+        of the node as a dictionary."""
+        ret = dict()
+        ret["BRAM_18K"] = self.bram_estimation()
+        ret["BRAM_efficiency"] = self.bram_efficiency_estimation()
+        ret["LUT"] = self.lut_estimation()
+        ret["URAM"] = self.uram_estimation()
+        ret["DSP"] = self.dsp_estimation()
+        return ret
+
+    def bram_efficiency_estimation(self):
+        """Function for BRAM efficiency estimation: actual parameter storage
+        needed divided by the allocated BRAM storage (from estimation)"""
+        return 1
+
+    def bram_estimation(self):
+        """Function for BRAM resource estimation, is member function of
+        HLSCustomOp class but has to be filled by every node"""
+        return 0
+
+    def uram_estimation(self):
+        """Function for UltraRAM resource estimation, is member function of
+        HLSCustomOp class but has to be filled by every node"""
+        return 0
+
+    def lut_estimation(self):
+        """Function for LUT resource estimation, is member function of
+        HLSCustomOp class but has to be filled by every node"""
+        return 0
+
+    def dsp_estimation(self):
+        """Function for DSP resource estimation, is member function of
+        HLSCustomOp class but has to be filled by every node"""
+        return 0
+
+    def get_exp_cycles(self):
+        """Function for estimation of expected cycles for set folding,
+        is member function of HLSCustomOp class but has to be filled
+        by every node"""
+        return 0
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Generates c++ code and tcl script for ip generation."""
+        node = self.onnx_node
+
+        # generate top cpp file for ip generation
+        path = self.get_nodeattr("code_gen_dir_ipgen")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("ipgen")
+        self.blackboxfunction()
+        self.pragmas()
+        self.docompute()
+
+        template = self.ipgen_template
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+        # generate tcl script for ip generation
+        self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)]
+        self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir]
+        self.code_gen_dict["$FPGAPART$"] = [fpgapart]
+        self.code_gen_dict["$FINNHLSLIBDIR$"] = ["/workspace/finn-hlslib"]
+        self.code_gen_dict["$TOPFXN$"] = [node.name]
+        self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
+        self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
+
+        template = self.ipgentcl_template
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def ipgen_extra_directives(self):
+        "Return a list of extra tcl directives for HLS synthesis."
+        return []
+
+    def ipgen_singlenode_code(self):
+        """Builds the bash script for ip generation using the IPGenBuilder from
+        finn.util.fpgadataflow."""
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        builder = IPGenBuilder()
+        builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name))
+        builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name))
+        builder.build(code_gen_dir)
+        self.set_nodeattr("ipgen_path", builder.ipgen_path)
+        self.set_nodeattr("ip_path", builder.ipgen_path + "/sol1/impl/ip")
+        vlnv = "xilinx.com:hls:%s:1.0" % node.name
+        self.set_nodeattr("ip_vlnv", vlnv)
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.read_npy_data()
+        self.strm_decl()
+        self.pragmas()
+        self.docompute()
+        self.dataoutstrm()
+        self.save_as_npy()
+
+        template = self.docompute_template
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        vlnv = self.get_nodeattr("ip_vlnv")
+        cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)]
+        return cmd
+
+    def compile_singlenode_code(self):
+        """Builds the bash script for compilation using the CppBuilder from
+        finn.util.basic and executes the script to produce the executable."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        builder = CppBuilder()
+        # to enable additional debug features please uncomment the next line
+        # builder.append_includes("-DDEBUG")
+        builder.append_includes("-I/workspace/finn/src/finn/qnn-data/cpp")
+        builder.append_includes("-I/workspace/cnpy/")
+        builder.append_includes("-I/workspace/finn-hlslib")
+        builder.append_includes("-I{}/include".format(os.environ["VIVADO_PATH"]))
+        builder.append_includes("--std=c++11")
+        builder.append_includes("-O3")
+        builder.append_sources(code_gen_dir + "/*.cpp")
+        builder.append_sources("/workspace/cnpy/cnpy.cpp")
+        builder.append_includes("-lz")
+        builder.set_executable_path(code_gen_dir + "/node_model")
+        builder.build(code_gen_dir)
+        self.set_nodeattr("executable_path", builder.executable_path)
+
+    def dynamic_input_to_npy(self, context, count):
+        """Saves input (given context) into .npy files.
+
+        Count indicates the number of inputs that have to be saved."""
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        if code_gen_dir == "":
+            raise Exception(
+                """
+Found no codegen dir for this node, did you run the prepare_cppsim transformation?
+            """
+            )
+        # create a npy file for each input of the node (in_ind is input index)
+        # assuming dynamic inputs start from 0
+        for in_ind in range(count):
+            current_input_name = node.input[in_ind]
+            # make copy before saving array
+            input_array = context[current_input_name].copy()
+            np.save(
+                os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), input_array
+            )
+
+    def npy_to_dynamic_output(self, context):
+        """Reads the output from an output.npy file generated from cppsim and
+        places its content into the context dictionary."""
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        output = np.load("{}/output.npy".format(code_gen_dir))
+        context[node.output[0]] = output
+
+    def npy_to_dynamic_outputs(self, context, npy_list):
+        """Reads the output from .npy files generated from cppsim and places
+        their content into the context dictionary.
+        npy_list is a list specifying which files to read, and its order must
+        match the order of node outputs."""
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        for i in range(len(npy_list)):
+            output = np.load("{}/{}".format(code_gen_dir, npy_list[i]))
+            context[node.output[i]] = output
+
+    def exec_precompiled_singlenode_model(self):
+        """Executes precompiled executable."""
+        executable_path = self.get_nodeattr("executable_path")
+        if executable_path == "":
+            raise Exception(
+                """
+Found no executable for this node, did you run the codegen and
+compilation transformations?
+            """
+            )
+        process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE)
+        process_execute.communicate()
+
+    def reset_rtlsim(self, sim):
+        """Sets the reset input in pyverilator to zero, toggles the clock and sets
+        it back to one."""
+        sim.io.ap_rst_n = 0
+        sim.io.ap_clk = 1
+        sim.io.ap_clk = 0
+        sim.io.ap_rst_n = 1
+
+    def toggle_clk(self, sim):
+        """Toggles the clock input in pyverilator once."""
+        sim.io.ap_clk = 1
+        sim.io.ap_clk = 0
+
+    def rtlsim(self, sim, inp, inp2=None):
+        """Runs the pyverilator simulation by passing the input values to the
+        simulation, toggling the clock and observing the execution time. The function
+        also contains an observation loop that can abort the simulation if no output
+        value is produced after 100 cycles."""
+
+        trace_file = self.get_nodeattr("rtlsim_trace")
+        if trace_file != "":
+            if trace_file == "default":
+                trace_file = self.onnx_node.name + ".vcd"
+            sim.start_vcd_trace(trace_file)
+        inputs = inp
+        outputs = []
+        sim.io.out_V_V_TREADY = 1
+
+        # observe if output is completely calculated
+        # observation_count will contain the number of cycles the calculation ran
+        num_out_values = self.get_number_output_values()
+        output_observed = False
+        observation_count = 0
+
+        # avoid infinite looping of simulation by aborting when there is no change in
+        # output values after 100 cycles
+        no_change_count = 0
+        old_outputs = outputs
+        liveness_threshold = pyverilate_get_liveness_threshold_cycles()
+
+        while not (output_observed):
+            sim.io.in0_V_V_TVALID = 1 if len(inputs) > 0 else 0
+            sim.io.in0_V_V_TDATA = inputs[0] if len(inputs) > 0 else 0
+            if sim.io.in0_V_V_TREADY == 1 and sim.io.in0_V_V_TVALID == 1:
+                inputs = inputs[1:]
+
+            if inp2 is not None:
+                sim.io.in1_V_V_TVALID = 1 if len(inp2) > 0 else 0
+                sim.io.in1_V_V_TDATA = inp2[0] if len(inp2) > 0 else 0
+                if sim.io.in1_V_V_TREADY == 1 and sim.io.in1_V_V_TVALID == 1:
+                    inp2 = inp2[1:]
+
+            if sim.io.out_V_V_TVALID == 1 and sim.io.out_V_V_TREADY == 1:
+                outputs = outputs + [sim.io.out_V_V_TDATA]
+            sim.io.ap_clk = 1
+            sim.io.ap_clk = 0
+
+            observation_count = observation_count + 1
+            no_change_count = no_change_count + 1
+
+            if len(outputs) == num_out_values:
+                self.set_nodeattr("cycles_rtlsim", observation_count)
+                output_observed = True
+
+            if no_change_count == liveness_threshold:
+                if old_outputs == outputs:
+                    if trace_file != "":
+                        sim.flush_vcd_trace()
+                        sim.stop_vcd_trace()
+                    raise Exception(
+                        "Error in simulation! Takes too long to produce output. "
+                        "Consider setting the LIVENESS_THRESHOLD env.var. to a "
+                        "larger value."
+                    )
+                else:
+                    no_change_count = 0
+                    old_outputs = outputs
+        if trace_file != "":
+            sim.flush_vcd_trace()
+            sim.stop_vcd_trace()
+        return outputs
+
+    def rtlsim_multi_io(self, sim, io_dict):
+        "Run rtlsim for this node, supports multiple i/o streams."
+
+        trace_file = self.get_nodeattr("rtlsim_trace")
+        if trace_file == "default":
+            trace_file = self.onnx_node.name + ".vcd"
+        num_out_values = self.get_number_output_values()
+        total_cycle_count = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file)
+        self.set_nodeattr("cycles_rtlsim", total_cycle_count)
+
+    def execute_node(self, context, graph):
+        """Executes single node using cppsim or rtlsim."""
+        mode = self.get_nodeattr("exec_mode")
+        if mode == "cppsim":
+            # save input(s)
+            self.dynamic_input_to_npy(context, 1)
+            # execute the precompiled model
+            self.exec_precompiled_singlenode_model()
+            # load output npy file
+            self.npy_to_dynamic_output(context)
+        elif mode == "rtlsim":
+            pass
+
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def generate_params(self, model, path):
+        """Function to generate parameters (i.e. weights and thresholds),
+        is member function of HLSCustomOp class but has to be filled
+        by every node."""
+        pass
+
+    @abstractmethod
+    def get_number_output_values(self):
+        """Function to get the number of expected output values,
+        is member function of HLSCustomOp class but has to be filled
+        by every node."""
+        pass
+
+    @abstractmethod
+    def global_includes(self):
+        """Function to set the global includes for c++ code that has to be generated
+        for cppsim or rtlsim, is member function of HLSCustomOp class but has to
+        be filled by every node."""
+        pass
+
+    @abstractmethod
+    def defines(self, var):
+        """Function to set the define commands for c++ code that has to be generated
+        for cppsim or rtlsim, is member function of HLSCustomOp class but has to
+        be filled by every node.
+
+        var: makes it possible to reuse the function for different c++ code generation.
+        I.e. if set to "ipgen" in StreamingFCLayer_Batch additional PRAGMA defines are
+        added."""
+        pass
+
+    @abstractmethod
+    def read_npy_data(self):
+        """Function to generate the commands for reading data from .npy file in c++,
+        is member function of HLSCustomOp class but has to be filled by every node."""
+        pass
+
+    @abstractmethod
+    def strm_decl(self):
+        """Function to generate the commands for the stream declaration in c++,
+        is member function of HLSCustomOp class but has to be filled
+        by every node."""
+        pass
+
+    @abstractmethod
+    def docompute(self):
+        """Function to generate the commands for the computational part of the
+        c++ code, is member function of HLSCustomOp class but has to be filled
+        by every node."""
+        pass
+
+    @abstractmethod
+    def dataoutstrm(self):
+        """Function to generate the commands for reading out data from c++ and convert
+        into npy format, is member function of HLSCustomOp class but has to be filled
+        by every node."""
+        pass
+
+    @abstractmethod
+    def save_as_npy(self):
+        """Function to generate the commands for saving data in .npy file in c++,
+        is member function of HLSCustomOp class but has to be filled by every node."""
+        pass
+
+    @abstractmethod
+    def blackboxfunction(self):
+        """Function to generate a blackbox function in c++ from which an IP block
+        will be generated, is member function of HLSCustomOp class but has to be filled
+        by every node."""
+        pass
+
+    @abstractmethod
+    def pragmas(self):
+        """Function to generate the pragma commands in c++, is member function of
+        HLSCustomOp class but has to be filled by every node."""
+        pass
+
+    def get_normal_input_shape(self):
+        """Returns normal input shape if implemented."""
+        raise Exception("get_normal_input_shape not implemented for this op")
+
+    def get_normal_output_shape(self):
+        """Returns normal output shape if implemented."""
+        raise Exception("get_normal_output_shape not implemented for this op")
+
+    def get_folded_input_shape(self):
+        """Returns folded input shape (according to synapse folding), if implemented."""
+        raise Exception("get_folded_input_shape not implemented for this op")
+
+    def get_folded_output_shape(self):
+        """Returns folded output shape (according to neuron folding), if implemented."""
+        raise Exception("get_folded_output_shape not implemented for this op")
+
+    def get_instream_width(self):
+        """Returns input stream width, if implemented."""
+        raise Exception("get_instream_width not implemented for this op")
+
+    def get_outstream_width(self):
+        """Returns output stream width, if implemented."""
+        raise Exception("get_outstream_width not implemented for this op")
+
+    def get_instream_width_padded(self):
+        """Returns input stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec."""
+        in_width = self.get_instream_width()
+        return roundup_to_integer_multiple(in_width, 8)
+
+    def get_outstream_width_padded(self):
+        """Returns output stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec."""
+        out_width = self.get_outstream_width()
+        return roundup_to_integer_multiple(out_width, 8)
+
+    def get_ap_int_max_w(self):
+        "Return the maximum width of any ap_int used in this module."
+        instream = self.get_instream_width()
+        outstream = self.get_outstream_width()
+        return max([instream, outstream])
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 67af0c5cb409c6deea9bacf247f803d119aa1b17..0ab8bf295927f233b5785f76a1d6894c7993f9ef 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -30,7 +30,7 @@ import numpy as np
 import math
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 
 
 # the IODMA inerfaces a memory-mapped AXI interface and an AXI stream
@@ -87,8 +87,8 @@ class IODMA(HLSCustomOp):
             "streamWidth": ("i", False, 32),
             # DMA-specific parameters
             "intfWidth": ("i", False, 32),
-            "burstMode": ("s", False, "increment"),
-            "direction": ("s", False, "in"),
+            "burstMode": ("s", False, "increment", {"wrap", "increment"}),
+            "direction": ("s", False, "in", {"in", "out"}),
             # shape describing input vecs per execution
             "numInputVectors": ("ints", False, [1]),
             # name of axi-mm interface
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 6e206d2058076802a48b69f4c69cccf744489f31..39fa87baa08cb43ea7cb4f3d2aa2159b07b8522b 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -31,7 +31,7 @@ import os
 import numpy as np
 
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 from finn.util.basic import roundup_to_integer_multiple
@@ -128,13 +128,6 @@ class LabelSelect_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index 4a2fa6889ae0ebb94976d50b0fc8362d01a63bea..edba084b5258de37198520257e438f90f8cc65e3 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -29,7 +29,7 @@
 import os
 import numpy as np
 
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
@@ -60,9 +60,9 @@ class Pool_Batch(HLSCustomOp):
             "KernelSize": ("i", True, 0),
             # Function:
             #  - MaxPool
-            #  - AvgPool (not yet supported, but HLSLIB does)
-            #  - AccPool (not yet supported, but HLSLIB does)
-            "Function": ("s", True, ""),
+            #  - QuantAvgPool
+            # TODO add support for AvgPool and AccPool
+            "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}),
             "OutImgDim": ("i", True, 0),
             # FINN DataTypes for inputs/outputs
             "InputDataType": ("s", True, ""),
@@ -185,14 +185,6 @@ class Pool_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index e80920551120e0e74aae217d9fe4e287e6cabd3d..e2d97a0eaa29604006790a542157639c5c776b22 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -30,7 +30,7 @@ import os
 import numpy as np
 import math
 
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
@@ -55,7 +55,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             # Toggle between hls or IPI implementation
             # hls - use the hls generated IP during stitching
             # vivado - use the AXI Infrastructure DWC
-            "impl_style": ("s", False, "hls"),
+            "impl_style": ("s", False, "hls", {"hls", "vivado"}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -186,14 +186,6 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 9d63a6866269ddf6c5c7cf54de00b6dfd11505e6..10e0fbbde4f485a9fc9febb21308c9b0c49da041 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -32,7 +32,7 @@ import numpy as np
 
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.basic import (
     interleave_matrix_outer_dim_from_partitions,
     roundup_to_integer_multiple,
@@ -68,7 +68,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", True, ""),
+            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
@@ -78,9 +78,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "accDataType": ("s", False, "INT32"),
             # use xnor-popcount for binary weights/inputs, thus treating them
             # as bipolar
-            "binaryXnorMode": ("i", False, 0),
+            "binaryXnorMode": ("i", False, 0, {0, 1}),
             # no-activation mode (produce accumulators)
-            "noActivation": ("i", False, 0),
+            "noActivation": ("i", False, 0, {0, 1}),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -90,13 +90,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # const -- embedded weights, default, long compile/synth times
             # decoupled -- streaming weights with weight streamer packaged inside IP
             # external -- streaming weights with external streamer
-            "mem_mode": ("s", False, "const"),
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
             # FPGA resource type for memories in decoupled mode
             # auto -- let Vivado decide
             # block -- use BRAM
             # distributed -- use LUTRAM
             # see also https://www.xilinx.com/support/answers/38070.html
-            "ram_style": ("s", False, "auto"),
+            "ram_style": ("s", False, "auto", {"auto", "block", "distributed"}),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -106,7 +106,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # always "flush" the accelerator by first passing a dummy input
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
-            "runtime_writeable_weights": ("i", False, 0),
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -159,13 +159,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
@@ -238,7 +231,27 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         D_in = self.get_nodeattr("MW")
         D_out = self.get_nodeattr("MH")
         omega = (D_in * D_out) / (Q * P)
-        return P * (math.ceil(omega / 512)) * (math.ceil((Q * W) / 36))
+        mem_width = Q * W * P
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle == "distributed") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            return 0
+        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
+        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
+        if mem_width == 1:
+            return math.ceil(omega / 16384)
+        elif mem_width == 2:
+            return math.ceil(omega / 8192)
+        elif mem_width <= 4:
+            return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4))
+        elif mem_width <= 9:
+            return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9))
+        elif mem_width <= 18 or omega > 512:
+            return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18))
+        else:
+            return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
@@ -246,6 +259,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         D_in = self.get_nodeattr("MW")
         D_out = self.get_nodeattr("MH")
         bram16_est = self.bram_estimation()
+        if bram16_est == 0:
+            return 1
         wbits = W * D_in * D_out
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
@@ -261,6 +276,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # TODO add in/out FIFO contributions
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
+        MW = self.get_nodeattr("MW")
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
         # determine tdt with input and weight data types
@@ -269,8 +285,55 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # parameters from experiments in paper mentioned above
         c0 = 300
         c1 = 1.1
+        c2 = 0
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle == "distributed") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
+
+        # multiplication
+        res_type = self.get_nodeattr("resType")
+        if res_type == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+        # adder tree
+        addertree_luts = (W + A) * (2 * Q - 1)
+        # accumulator
+        acc_bits = W + A + np.ceil(math.log(MW, 2))
+        acc_luts = acc_bits
+        # thresholds and threshold comparators
+        thr_luts = 0
+        comp_luts = 0
+        noact = self.get_nodeattr("noActivation")
+        if noact == 0:
+            odt = self.get_output_datatype()
+            B = odt.bitwidth()
+            thr_luts = (2 ** B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2 ** B - 1) * acc_bits
+
+        return int(
+            c0
+            + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
+            + c2
+        )
 
-        return c0 + c1 * (P * Q) * (W * A)
+    def dsp_estimation(self):
+        # multiplication
+        P = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        Q = self.get_nodeattr("SIMD")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        if res_type == "dsp":
+            mult_dsp = P * Q * np.ceil((W + A) / 48)  # TODO: more accurate modelling
+        else:
+            mult_dsp = 0
+        return int(mult_dsp)
 
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
@@ -934,6 +997,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def docompute(self):
         mem_mode = self.get_nodeattr("mem_mode")
+        map_to_hls_mult_style = {
+            "auto": "ap_resource_dflt()",
+            "lut": "ap_resource_lut()",
+            "dsp": "ap_resource_dsp()",
+        }
         tmpl_args = self.get_template_param_values()
         if self.calc_tmem() == 0:
             odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
@@ -950,7 +1018,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     tmpl_args["TDstI"],
                     tmpl_args["TWeightI"],
                     threshs,
-                    self.get_nodeattr("resType"),
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
                 )
             ]
         elif mem_mode == "decoupled" or mem_mode == "external":
@@ -968,7 +1036,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     tmpl_args["TWeightI"],
                     wdtype_hls_str,
                     threshs,
-                    self.get_nodeattr("resType"),
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
                 )
             ]
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 56a7e86854a903a608c253122880ce6ef2e68ef4..9063f018bdcf64c9664e92eeabec539ee2c721af 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -32,7 +32,7 @@ import subprocess
 import math
 import warnings
 
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
@@ -56,13 +56,18 @@ class StreamingFIFO(HLSCustomOp):
             # Toggle between hls or IPI implementation
             # rtl - use the hls generated IP during stitching
             # vivado - use the AXI Infrastructure FIFO
-            "impl_style": ("s", False, "rtl"),
+            "impl_style": ("s", False, "rtl", {"rtl", "vivado"}),
             # FPGA resource type for FIFOs when impl_style is vivado
             # auto -- let Vivado decide
             # block -- use BRAM
             # distributed -- use LUTRAM
             # ultra -- use URAM (on UltraScale+)
-            "ram_style": ("s", False, "auto"),
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
         }
         my_attrs.update(super().get_nodeattr_types())
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 53bcab993b25173c8620d7f4a6694a8efaf74c4d..7850a85ccf61c7e4a26c25b807d6613a1ad66c5a 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -29,8 +29,8 @@
 import os
 import numpy as np
 
-from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
@@ -138,14 +138,6 @@ class StreamingMaxPool_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index ccb065f62a8340b916bfa5f6cf96c23c65d19d12..8a944fe77dc938db4154bb0a2ffcff8fdaefbd72 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -34,7 +34,7 @@ import numpy as np
 
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.basic import (
     interleave_matrix_outer_dim_from_partitions,
     roundup_to_integer_multiple,
@@ -70,7 +70,7 @@ class Thresholding_Batch(HLSCustomOp):
             # number of steps in thresholding function
             "numSteps": ("i", True, 1),
             # string defining memory type
-            "ram_style": ("s", False, "distributed"),
+            "ram_style": ("s", False, "distributed", {"distributed", "block"}),
             # FINN DataTypes for inputs, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -88,7 +88,7 @@ class Thresholding_Batch(HLSCustomOp):
             # memory mode for the thresholds
             # const -- embedded thresholds, default
             # decoupled -- streaming thresholds with  streamer packaged inside IP
-            "mem_mode": ("s", False, "const"),
+            "mem_mode": ("s", False, "const", {"const", "decoupled"}),
             # (mem_mode = decoupled only) whether weights (thresholds) will be
             # writable through an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -98,7 +98,7 @@ class Thresholding_Batch(HLSCustomOp):
             # always "flush" the accelerator by first passing a dummy input
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
-            "runtime_writeable_weights": ("i", False, 0),
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -137,13 +137,6 @@ class Thresholding_Batch(HLSCustomOp):
 
     def verify_node(self):
         info_messages = []
-        # verify that "domain" is set to "finn"
-        domain_value = self.onnx_node.domain
-        if domain_value == "finn":
-            info_messages.append("Attribute domain is set correctly")
-        else:
-            info_messages.append('Attribute domain should be set to "finn"')
-
         # verify that "backend" is set to "fpgadataflow"
         backend_value = self.get_nodeattr("backend")
         if backend_value == "fpgadataflow":
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 38a139c279701ae7892f41b63c3c717a3e736691..bedaf0984c39ef7603e6829961d7a3efb6ff489f 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -26,7 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 
 
 class TLastMarker(HLSCustomOp):
@@ -47,14 +47,14 @@ class TLastMarker(HLSCustomOp):
             # whether static or dynamic (from AXI lite) number of iterations are used
             "DynIters": ("i", False, 1),
             # direction: whether to insert or remove TLAST
-            "Direction": ("s", False, "out"),
+            "Direction": ("s", False, "out", {"out", "in"}),
             # width of input-output data streams, in bits
             "StreamWidth": ("i", True, 0),
             # width of individual element in stream, in bits
             "ElemWidth": ("i", True, 0),
             # Protocol: external or internal
             # Vitis docs recommend using qdma_axis for external, ap_axiu for internal
-            "Protocol": ("s", False, "external"),
+            "Protocol": ("s", False, "external", {"external", "internal"}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
index c7c08d081a04ff72ae2a198e65091d042bd8d599..333884f361983e2a465715f3f4119c9c6384558e 100644
--- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -1,9 +1,10 @@
 import os
 import numpy as np
+import math
 
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
-from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.basic import interleave_matrix_outer_dim_from_partitions
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
@@ -24,14 +25,14 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             "Dim": ("i", True, 0),
             "Channels": ("i", True, 0),
             "Kernel": ("i", True, 0),
-            "resType": ("s", True, ""),
+            "resType": ("s", False, "auto", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
             # no-activation mode (produce accumulators)
-            "noActivation": ("i", False, 0),
+            "noActivation": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -408,6 +409,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         )
 
     def docompute(self):
+        map_to_hls_mult_style = {
+            "auto": "ap_resource_dflt()",
+            "lut": "ap_resource_lut()",
+            "dsp": "ap_resource_dsp()",
+        }
         tmpl_args = self.get_template_param_values()
         if self.calc_tmem() == 0:
             odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
@@ -423,7 +429,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                 tmpl_args["TDstI"],
                 tmpl_args["TWeightI"],
                 threshs,
-                self.get_nodeattr("resType"),
+                map_to_hls_mult_style[self.get_nodeattr("resType")],
             )
         ]
 
@@ -504,3 +510,99 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                     "complete dim=3"
                 )
             )
+
+    def bram_estimation(self):
+        """Calculates resource estimation for BRAM"""
+        # TODO add in/out FIFO contributions
+        P = self.get_nodeattr("PE")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
+        # since this is HLS memory, not using the full width of a BRAM
+        # assuming memories up to 128 deep get implemented in LUTs
+        if self.calc_wmem() <= 128:
+            return 0
+
+        if W == 1:
+            return math.ceil(omega / 16384) * P
+        elif W == 2:
+            return math.ceil(omega / 8192) * P
+        elif W <= 4:
+            return (math.ceil(omega / 4096)) * (math.ceil(W / 4)) * P
+        elif W <= 9:
+            return (math.ceil(omega / 2048)) * (math.ceil(W / 8)) * P
+        elif W <= 18 or omega > 512:
+            return (math.ceil(omega / 1024)) * (math.ceil(W / 16)) * P
+        else:
+            return (math.ceil(omega / 512)) * (math.ceil(W / 32)) * P
+
+    def bram_efficiency_estimation(self):
+        P = self.get_nodeattr("PE")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        bram16_est = self.bram_estimation()
+        if bram16_est == 0:
+            return 1
+        wbits = W * P * omega
+        bram16_est_capacity = bram16_est * 36 * 512
+        return wbits / bram16_est_capacity
+
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+        """
+        # TODO add in/out FIFO contributions
+        P = self.get_nodeattr("PE")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        # determine tdt with input and weight data types
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        # parameters from experiments in paper mentioned above
+        c0 = 300
+        c1 = 1.1
+        c2 = 0
+        if self.calc_wmem() <= 128:
+            c2 = P * W * math.ceil(self.calc_wmem() / 64)
+
+        # multiplication
+        res_type = self.get_nodeattr("resType")
+        if res_type == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+        # accumulator
+        k = self.get_nodeattr("Kernel")
+        acc_bits = W + A + math.ceil(math.log(k * k, 2))
+        acc_luts = acc_bits
+        # thresholds and threshold comparators
+        thr_luts = 0
+        comp_luts = 0
+        noact = self.get_nodeattr("noActivation")
+        if noact == 0:
+            odt = self.get_output_datatype()
+            B = odt.bitwidth()
+            thr_luts = (2 ** B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2 ** B - 1) * acc_bits
+
+        return int(c0 + c1 * (P * (mult_luts + acc_luts + thr_luts + comp_luts)) + c2)
+
+    def dsp_estimation(self):
+        # multiplication
+        P = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        if res_type == "dsp":
+            mult_dsp = P * np.ceil((W + A) / 48)  # TODO: more accurate modelling
+        else:
+            mult_dsp = 0
+        return int(mult_dsp)
diff --git a/src/finn/qnn-data/onnx/finn-hls-model/finn-hls-onnx-model.onnx b/src/finn/qnn-data/onnx/finn-hls-model/finn-hls-onnx-model.onnx
deleted file mode 100644
index c2db9153f4a0269025da64f54b491ee6d511dbdd..0000000000000000000000000000000000000000
--- a/src/finn/qnn-data/onnx/finn-hls-model/finn-hls-onnx-model.onnx
+++ /dev/null
@@ -1,207 +0,0 @@
-finn-hls-onnx-model:º
-R
-inp	memInStrm	memInStrm"FIFO*
-backend"fpgadataflow *
-depth€ :finn
-Ò
-	memInStrm
-weights0
-thresh0out1"StreamingFCLayer_Batch*
-
-MH€ *
-
-MWÀ *	
-PE  *
-SIMD@ *
-backend"fpgadataflow *!
-resDataType"Recast<XnorMul> *
-resType"ap_resource_lut() :finn
-L
-out1inter0inter0"FIFO*
-backend"fpgadataflow *
-depth :finn
-Ï
-inter0
-weights1
-thresh1out2"StreamingFCLayer_Batch*
-
-MH€ *
-
-MW€ *	
-PE@ *
-SIMD  *
-backend"fpgadataflow *!
-resDataType"Recast<XnorMul> *
-resType"ap_resource_lut() :finn
-L
-out2inter1inter1"FIFO*
-backend"fpgadataflow *
-depth :finn
-Ï
-inter1
-weights2
-thresh2out3"StreamingFCLayer_Batch*
-
-MH€ *
-
-MW€ *	
-PE  *
-SIMD@ *
-backend"fpgadataflow *!
-resDataType"Recast<XnorMul> *
-resType"ap_resource_lut() :finn
-L
-out3inter2inter2"FIFO*
-backend"fpgadataflow *
-depth :finn
-Î
-inter2
-weights3
-thresh3out4"StreamingFCLayer_Batch*	
-MH@ *
-
-MW€ *	
-PE *
-SIMD *
-backend"fpgadataflow *!
-resDataType"Recast<XnorMul> *
-resType"ap_resource_lut() :finn
-O
-out4outp
-memOutStrm"FIFO*
-backend"fpgadataflow *
-depth€ :finnfinn_hls_onnx_graphZ
-inp
-
-
-
-@b
-outp
-
-
-
-@j
-	memInStrm
-
-
-
-@j
-weights0
-
-@
- 
- j%
-thresh0
-
- 
- 
-
-
-j
-out1
-
-
- 
- j
-inter0
-
-
- 
- j
-weights1
-
- 
-@
-€j%
-thresh1
-
-
-@
-
-
-j
-out2
-
-
-
-@j
-inter1
-
-
-
-@j
-weights2
-
-@
- 
-€j%
-thresh2
-
- 
- 
-
-
-j
-out3
-
-
- 
- j
-inter2
-
-
- 
- j
-weights3
-
-
-
-€j%
-thresh3
-
-
-
-
-
-j
-out4
-
-
-
-@r
-inp
-
finn_datatypeBIPOLARr 
-outp
-
finn_datatypeBIPOLARr%
-	memInStrm
-
finn_datatypeBIPOLARr$
-weights0
-
finn_datatypeBIPOLARr#
-thresh0
-
finn_datatypeBIPOLARr 
-out1
-
finn_datatypeBIPOLARr"
-inter0
-
finn_datatypeBIPOLARr$
-weights1
-
finn_datatypeBIPOLARr#
-thresh1
-
finn_datatypeBIPOLARr 
-out2
-
finn_datatypeBIPOLARr"
-inter1
-
finn_datatypeBIPOLARr$
-weights2
-
finn_datatypeBIPOLARr#
-thresh2
-
finn_datatypeBIPOLARr 
-out3
-
finn_datatypeBIPOLARr"
-inter2
-
finn_datatypeBIPOLARr$
-weights3
-
finn_datatypeBIPOLARr#
-thresh3
-
finn_datatypeBIPOLARr 
-out4
-
finn_datatypeBIPOLARB
\ No newline at end of file
diff --git a/src/finn/qnn-data/onnx/finn-hls-model/tfc_w1_a1_after_conv_to_hls.onnx b/src/finn/qnn-data/onnx/finn-hls-model/tfc_w1_a1_after_conv_to_hls.onnx
deleted file mode 100644
index aada6f07e9d3910122d2eb357d8a8c1224e9fbab..0000000000000000000000000000000000000000
Binary files a/src/finn/qnn-data/onnx/finn-hls-model/tfc_w1_a1_after_conv_to_hls.onnx and /dev/null differ
diff --git a/src/finn/transformation/fpgadataflow/cleanup.py b/src/finn/transformation/fpgadataflow/cleanup.py
index f089317074eb2bded4675f6fd2e22fdaeb4b6a82..5dbe5f0517d07bef07e5ecff6e4c7afff0293d86 100644
--- a/src/finn/transformation/fpgadataflow/cleanup.py
+++ b/src/finn/transformation/fpgadataflow/cleanup.py
@@ -56,7 +56,7 @@ class CleanUp(Transformation):
             if is_fpgadataflow_node(node) is True:
                 try:
                     # lookup op_type in registry of CustomOps
-                    inst = registry.custom_op[op_type](node)
+                    inst = registry.getCustomOp(node)
                     # delete code_gen_dir from cppsim
                     code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim")
                     if os.path.isdir(code_gen_dir):
diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py
index e17feb4683189ad2f8174f0564a877f84870b51d..6321b3335907948fb49de966c80eb21637e0a6ec 100644
--- a/src/finn/transformation/fpgadataflow/compile_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py
@@ -52,7 +52,7 @@ class CompileCppSim(NodeLocalTransformation):
         if is_fpgadataflow_node(node) is True:
             try:
                 # lookup op_type in registry of CustomOps
-                inst = registry.custom_op[op_type](node)
+                inst = registry.getCustomOp(node)
                 # ensure that code is generated
                 assert (
                     inst.get_nodeattr("code_gen_dir_cppsim") != ""
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index f27ebc645dbee20ff97b64aa942e375250f60cbd..749cf6c91a975a2ffaffedefa77b2f3fcb793e32 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -105,7 +105,7 @@ class InferConvInpGen(Transformation):
                         "FMPadding_Batch",
                         [i2c_input],
                         [padding_out],
-                        domain="finn",
+                        domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
                         ImgDim=ifm_dim,
                         Padding=2 * pad,
@@ -121,7 +121,7 @@ class InferConvInpGen(Transformation):
                         "DownSampler",
                         [ConvInpGen_input],
                         [i2c_output],
-                        domain="finn",
+                        domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
                         ImgDim=ConvInpGen_idim,
                         NumChannels=ifm_ch,
@@ -136,7 +136,7 @@ class InferConvInpGen(Transformation):
                         "ConvolutionInputGenerator",
                         [ConvInpGen_input],
                         [i2c_output],
-                        domain="finn",
+                        domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
                         ConvKernelDim=k,
                         IFMChannels=ifm_ch,
@@ -187,7 +187,7 @@ class InferStreamingMaxPool(Transformation):
                         "StreamingMaxPool_Batch",
                         [mp_input],
                         [mp_output],
-                        domain="finn",
+                        domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
                         PoolDim=k,
                         NumChannels=ifm_ch,
@@ -314,7 +314,7 @@ class InferPool_Batch(Transformation):
                     "Im2Col",
                     [inp_trans_out],
                     [im2col_out],
-                    domain="finn",
+                    domain="finn.custom_op.general",
                     stride=stride,
                     kernel_size=k,
                     pad_amount=pad,
@@ -331,7 +331,7 @@ class InferPool_Batch(Transformation):
                     "Pool_Batch",
                     [im2col_out],
                     [pool_output],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     InputDataType=idt.name,
                     OutputDataType=odt.name,
@@ -440,9 +440,8 @@ class InferBinaryStreamingFCLayer(Transformation):
                         "StreamingFCLayer_Batch",
                         [mm_input, mm_weight, mt_thres],
                         [mt_output],
-                        domain="finn",
+                        domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
-                        resType="ap_resource_lut()",
                         MW=mw,
                         MH=mh,
                         SIMD=simd,
@@ -471,9 +470,8 @@ class InferBinaryStreamingFCLayer(Transformation):
                         "StreamingFCLayer_Batch",
                         [mm_input, mm_weight],
                         [mm_output],
-                        domain="finn",
+                        domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
-                        resType="ap_resource_lut()",
                         MW=mw,
                         MH=mh,
                         SIMD=simd,
@@ -575,9 +573,8 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             "StreamingFCLayer_Batch",
                             [mm_input, mm_weight, mt_thres],
                             [mt_output],
-                            domain="finn",
+                            domain="finn.custom_op.fpgadataflow",
                             backend="fpgadataflow",
-                            resType="ap_resource_lut()",
                             MW=mw,
                             MH=mh,
                             SIMD=simd,
@@ -606,9 +603,8 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             "StreamingFCLayer_Batch",
                             [mm_input, mm_weight],
                             [mm_output],
-                            domain="finn",
+                            domain="finn.custom_op.fpgadataflow",
                             backend="fpgadataflow",
-                            resType="ap_resource_lut()",
                             MW=mw,
                             MH=mh,
                             SIMD=simd,
@@ -726,9 +722,9 @@ class InferVVAU(Transformation):
                             "Vector_Vector_Activate_Batch",
                             [mm_input, mm_weight, mt_thres],
                             [mt_output],
-                            domain="finn",
+                            domain="finn.custom_op.fpgadataflow",
                             backend="fpgadataflow",
-                            resType="ap_resource_lut()",
+                            resType="lut",
                             PE=pe,
                             Dim=mm_in_shape[1],
                             Channels=channels,
@@ -754,9 +750,9 @@ class InferVVAU(Transformation):
                             "Vector_Vector_Activate_Batch",
                             [mm_input, mm_weight],
                             [mm_output],
-                            domain="finn",
+                            domain="finn.custom_op.fpgadataflow",
                             backend="fpgadataflow",
-                            resType="ap_resource_lut()",
+                            resType="lut",
                             PE=pe,
                             Dim=mm_in_shape[1],
                             Channels=channels,
@@ -842,7 +838,7 @@ class InferThresholdingLayer(Transformation):
                     "Thresholding_Batch",
                     [thl_input, thl_threshold],
                     [thl_output],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     NumChannels=ifc,
                     PE=pe,
@@ -935,7 +931,7 @@ class InferAddStreamsLayer(Transformation):
                     "AddStreams_Batch",
                     [in0, in1],
                     [result],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     NumChannels=num_channels,
                     PE=pe,
@@ -995,7 +991,7 @@ class InferDuplicateStreamsLayer(Transformation):
                     "DuplicateStreams_Batch",
                     [output_tensor],
                     out_tensor_clones,
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     NumChannels=num_ch,
                     PE=pe,
@@ -1160,7 +1156,7 @@ class InferChannelwiseLinearLayer(Transformation):
                     "ChannelwiseOp_Batch",
                     [ll_input, ll_const],
                     [ll_output],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     Func=func,
                     NumChannels=ch,
@@ -1221,7 +1217,7 @@ class InferLabelSelectLayer(Transformation):
                     "LabelSelect_Batch",
                     [fc_input],
                     [idx_output],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     Labels=num_labels,
                     PE=pe,
@@ -1297,7 +1293,7 @@ class InferGlobalAccPoolLayer(Transformation):
                     "GlobalAccPool_Batch",
                     [in0],
                     [pool_out],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     NumChannels=num_ch,
                     PE=pe,
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index 90a92d11ce621897e3e6c687f57b1cdf77d08fba..6df9e6d1e62270b13f31560a99109c9b108f8025 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -126,7 +126,7 @@ class CreateDataflowPartition(Transformation):
                     [df_out],
                     # use the model attribute to mark the df model
                     model=df_model_filename,
-                    domain="finn",
+                    domain="finn.custom_op.general",
                 )
                 non_df_model.graph.node.insert(df_start_ind, df_node)
                 model = non_df_model
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index f7643673a0ba326ab77e4379d524fc831fbbc9ca..cbd353e4ad9099d13f10deadb4c99c290713d370 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -32,7 +32,7 @@ import subprocess
 import json
 
 from finn.transformation.base import Transformation
-from finn.util.basic import get_by_name, make_build_dir
+from finn.util.basic import get_by_name, make_build_dir, is_finn_op
 from finn.custom_op.registry import getCustomOp
 from finn.util.basic import get_num_default_workers
 import multiprocessing as mp
@@ -223,7 +223,7 @@ class CreateStitchedIP(Transformation):
         ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
-            assert node.domain == "finn", 'Node domain is not set to "finn"'
+            assert is_finn_op(node.domain), "Found non-FINN node"
             backend_attribute = get_by_name(node.attribute, "backend")
             assert backend_attribute is not None, "Backend node attribute is not set."
             backend_value = backend_attribute.s.decode("UTF-8")
diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
index e79d70544c5e8d2b9060e354d7713b8405ae9c7f..bbd012a715e49b61c19daad65f8de889112f92a7 100644
--- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py
+++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
@@ -56,7 +56,7 @@ class HLSSynthIP(NodeLocalTransformation):
         if is_fpgadataflow_node(node) is True:
             try:
                 # lookup op_type in registry of CustomOps
-                inst = registry.custom_op[op_type](node)
+                inst = registry.getCustomOp(node)
                 # ensure that code is generated
                 assert (
                     inst.get_nodeattr("code_gen_dir_ipgen") != ""
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 195a005ff87b43c6b64017354895693cd811a48e..e26e92391edd8ac420e89c72fb34c5554c601967 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -4,6 +4,7 @@ from onnx import helper as oh
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.fpgadataflow import is_fpgadataflow_node
+import warnings
 
 
 def _is_dwc_node(node):
@@ -40,48 +41,59 @@ class InsertDWC(Transformation):
         for n in graph.node:
             node_ind += 1
             if _suitable_node(n):
-                n_output = n.output[0]
-                consumer = model.find_consumer(n_output)
-                if _suitable_node(consumer) is True:
-                    n0 = getCustomOp(n)
-                    n1 = getCustomOp(consumer)
-                    n0_out_shape = n0.get_folded_output_shape()
-                    n1_in_shape = n1.get_folded_input_shape()
-                    if n0_out_shape[-1] != n1_in_shape[-1]:
-                        graph_modified = True
-                        # determine dwc inwidth
-                        dwc_in_width = n0.get_outstream_width()
-                        # determine dwc outwidth
-                        dwc_out_width = n1.get_instream_width()
-
-                        # determine shape for dwc
-                        dwc_shape = n0.get_normal_output_shape()
-
-                        # determine dtype for dwc
-                        dtype = n0.get_output_datatype()
-
-                        dwc_output_tensor = oh.make_tensor_value_info(
-                            model.make_new_valueinfo_name(),
-                            TensorProto.FLOAT,
-                            dwc_shape,
+                for n_output in n.output:
+                    consumers = model.find_consumers(n_output)
+                    if consumers is None:
+                        continue
+                    if len(consumers) > 1:
+                        warnings.warn(
+                            n.name
+                            + ": HLS node with fan-out higher than 1 cannot be stitched"
                         )
-                        graph.value_info.append(dwc_output_tensor)
-
-                        dwc_node = oh.make_node(
-                            "StreamingDataWidthConverter_Batch",
-                            [n_output],
-                            [dwc_output_tensor.name],
-                            domain="finn",
-                            backend="fpgadataflow",
-                            shape=dwc_shape,
-                            inWidth=dwc_in_width,
-                            outWidth=dwc_out_width,
-                            dataType=str(dtype.name),
-                        )
-                        # insert dwc
-                        graph.node.insert(node_ind + 1, dwc_node)
 
-                        # set dwc output tensor as new input tensor of second node
-                        consumer.input[0] = dwc_output_tensor.name
+                    consumer = consumers[0]
+                    if _suitable_node(consumer) is True:
+                        n0 = getCustomOp(n)
+                        n1 = getCustomOp(consumer)
+                        n0_out_shape = n0.get_folded_output_shape()
+                        n1_in_shape = n1.get_folded_input_shape()
+                        if n0_out_shape[-1] != n1_in_shape[-1]:
+                            graph_modified = True
+                            # determine dwc inwidth
+                            dwc_in_width = n0.get_outstream_width()
+                            # determine dwc outwidth
+                            dwc_out_width = n1.get_instream_width()
+
+                            # determine shape for dwc
+                            dwc_shape = n0.get_normal_output_shape()
+
+                            # determine dtype for dwc
+                            dtype = n0.get_output_datatype()
+
+                            dwc_output_tensor = oh.make_tensor_value_info(
+                                model.make_new_valueinfo_name(),
+                                TensorProto.FLOAT,
+                                dwc_shape,
+                            )
+                            graph.value_info.append(dwc_output_tensor)
+
+                            dwc_node = oh.make_node(
+                                "StreamingDataWidthConverter_Batch",
+                                [n_output],
+                                [dwc_output_tensor.name],
+                                domain="finn.custom_op.fpgadataflow",
+                                backend="fpgadataflow",
+                                shape=dwc_shape,
+                                inWidth=dwc_in_width,
+                                outWidth=dwc_out_width,
+                                dataType=str(dtype.name),
+                            )
+                            # insert dwc
+                            graph.node.insert(node_ind + 1, dwc_node)
+
+                            # set dwc output tensor as new input tensor of second node
+                            for idx, inp in enumerate(consumer.input):
+                                if inp == n_output:
+                                    consumer.input[idx] = dwc_output_tensor.name
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index a3056aaa15a5f00cdc7b33f5dba83820c76dfa10..def6babf82f8fb4bc290daa19efb4aeec074541c 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -116,7 +116,7 @@ class InsertFIFO(Transformation):
                                 "StreamingFIFO",
                                 [n_output],
                                 [fifo_output_tensor.name],
-                                domain="finn",
+                                domain="finn.custom_op.fpgadataflow",
                                 backend="fpgadataflow",
                                 depth=fifo_depth,
                                 folded_shape=fld_shape,
@@ -164,7 +164,7 @@ class InsertFIFO(Transformation):
                     "StreamingFIFO",
                     [n_input],
                     [fifo_output_tensor.name],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     depth=fifo_depth,
                     folded_shape=fld_shape,
@@ -210,7 +210,7 @@ class InsertFIFO(Transformation):
                     "StreamingFIFO",
                     [fifo_input_tensor.name],
                     [graph_out_name],
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                     depth=fifo_depth,
                     folded_shape=fld_shape,
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 1196035b22cf21c2de4901dc544875ebc80525d4..fe53bd39639462b8cebcdf5febe3b11e7eda96dc 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -139,7 +139,7 @@ class InsertIODMA(Transformation):
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
                     direction="out",
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                 )
                 model.graph.node.append(dma_node)
@@ -172,7 +172,7 @@ class InsertIODMA(Transformation):
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
                     direction="in",
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                 )
                 model.graph.node.insert(0, dma_node)
@@ -212,7 +212,7 @@ class InsertIODMA(Transformation):
                     streamWidth=streamWidth,
                     direction="in",
                     burstMode="wrap",
-                    domain="finn",
+                    domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                 )
                 fc_node.input[1] = fc_node_in.name
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 8ffb083217bb3a7e379112b3da102487c0cd50c2..3ce9824b14a54f502c90650e7b3b75e9cdaab77f 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -84,7 +84,7 @@ class InsertTLastMarker(Transformation):
                 DynIters=(1 if self.dyniters else 0),
                 Direction="out",
                 Protocol=("external" if self.external else "internal"),
-                domain="finn",
+                domain="finn.custom_op.fpgadataflow",
                 backend="fpgadataflow",
             )
             model.graph.node.append(tlast_node)
@@ -159,7 +159,7 @@ class InsertTLastMarker(Transformation):
                         DynIters=(1 if self.dyniters else 0),
                         Direction="in",
                         Protocol=("external" if self.external else "internal"),
-                        domain="finn",
+                        domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
                     )
                     model.graph.node.insert(insert_idx, tlast_node)
diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
index 26354bdf70e10bfcddfbaf732a214865c6feb8f5..653ec02ff306bf35d5fd3f7265404e61641077ac 100644
--- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
@@ -44,7 +44,7 @@ def _codegen_single_node(node, model):
     op_type = node.op_type
     try:
         # lookup op_type in registry of CustomOps
-        inst = registry.custom_op[op_type](node)
+        inst = registry.getCustomOp(node)
         # get the path of the code generation directory
         code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim")
         # ensure that there is a directory
diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py
index 53cb0af163b853c0a0352d8562cca66b3ecf6068..4ed5e80aa7baa585f83314ec42233d5885dff32d 100644
--- a/src/finn/transformation/fpgadataflow/prepare_ip.py
+++ b/src/finn/transformation/fpgadataflow/prepare_ip.py
@@ -41,7 +41,7 @@ def _codegen_single_node(node, model, fpgapart, clk):
     op_type = node.op_type
     try:
         # lookup op_type in registry of CustomOps
-        inst = registry.custom_op[op_type](node)
+        inst = registry.getCustomOp(node)
         # get the path of the code generation directory
         code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
         # ensure that there is a directory
diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
index d2ec5561a349d5fc83f02870c1a682dba8433e43..eaa85b9102b55bf8ecdf3a9f284f87468581e113 100644
--- a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
@@ -65,7 +65,7 @@ class PrepareRTLSim(NodeLocalTransformation):
         if is_fpgadataflow_node(node) is True:
             try:
                 # lookup op_type in registry of CustomOps
-                inst = registry.custom_op[op_type](node)
+                inst = registry.getCustomOp(node)
                 inst.prepare_rtlsim()
                 # ensure that executable path is now set
                 assert (
diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
index c577704129fa564f5e0e1e256623ff10125cf5ac..cc7c305b3ec94482e64235a1b1cf4eee543c46e1 100644
--- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
+++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
@@ -41,11 +41,10 @@ class ReplaceVerilogRelPaths(Transformation):
 
     def apply(self, model):
         for node in model.graph.node:
-            op_type = node.op_type
             if is_fpgadataflow_node(node) is True:
                 try:
                     # lookup op_type in registry of CustomOps
-                    inst = registry.custom_op[op_type](node)
+                    inst = registry.getCustomOp(node)
                     # find the IP gen dir
                     ipgen_path = inst.get_nodeattr("ipgen_path")
                     if ipgen_path is not None and os.path.isdir(ipgen_path):
diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py
index 6a76031f4c76831f514b77aee6cd3c560b3b9910..4677e59f7b35fec38aeaae65485ed16ba1e18f06 100644
--- a/src/finn/transformation/fpgadataflow/set_exec_mode.py
+++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py
@@ -45,7 +45,7 @@ class SetExecMode(Transformation):
             if is_fpgadataflow_node(node) is True:
                 try:
                     # lookup op_type in registry of CustomOps
-                    inst = registry.custom_op[op_type](node)
+                    inst = registry.getCustomOp(node)
                     # set sim_mode accordingly to argument mode
                     inst.set_nodeattr("exec_mode", self.mode)
                     # ensure that sim_mode is now set
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
index a07eaf142293487237b3f2b93460ba492eb5368d..cb8deaeec4b79d3c47d7705ff8f9bf72a085dfc0 100644
--- a/src/finn/transformation/move_reshape.py
+++ b/src/finn/transformation/move_reshape.py
@@ -1,10 +1,10 @@
 from finn.transformation.base import Transformation
-from finn.util.basic import get_by_name
+from finn.util.basic import get_by_name, is_finn_op
 
 
 def _is_fpgadataflow_node(node):
     if node is not None:
-        if node.domain == "finn":
+        if is_finn_op(node.domain):
             n_backend = get_by_name(node.attribute, "backend")
             if n_backend is None:
                 return False
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index bae3c9f22f4e5b2a525f15d1d948e42a4087953a..08a01171364c6f9c1ecc36b9f12f7447ad24e56c 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -584,7 +584,7 @@ class MakeMaxPoolNHWC(Transformation):
                     perms = list(get_by_name(consumer.attribute, "perm").ints)
                     if perms == [0, 2, 3, 1]:
                         n.op_type = "MaxPoolNHWC"
-                        n.domain = "finn"
+                        n.domain = "finn.custom_op.general"
                         start_name = n.input[0]
                         mid_name = consumer.input[0]
                         end_name = consumer.output[0]
diff --git a/src/finn/transformation/streamline/sign_to_thres.py b/src/finn/transformation/streamline/sign_to_thres.py
index 4e35012ceb4f84284ff2a96a60e4a9bd58a65cce..13f2e8524af7ce2d3457d0637f1c6d02733f504b 100644
--- a/src/finn/transformation/streamline/sign_to_thres.py
+++ b/src/finn/transformation/streamline/sign_to_thres.py
@@ -60,7 +60,7 @@ class ConvertSignToThres(Transformation):
                     "MultiThreshold",
                     [sign_in_name, thres_param_name],
                     [sign_out_name],
-                    domain="finn",
+                    domain="finn.custom_op.general",
                     out_scale=2.0,
                     out_bias=-1.0,
                     out_dtype="BIPOLAR",
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c5d7b1b59916edfc8730992535f3ddb57c4d60
--- /dev/null
+++ b/src/finn/util/create.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+
+
+def hls_random_mlp_maker(layer_spec):
+    """Create an MLP of given specification using HLSCustomOp instances.
+    Generate random weights/thresholds of appropriate size."""
+    ret = []
+    for lyr in layer_spec:
+        idt = lyr["idt"]
+        wdt = lyr["wdt"]
+        mw = lyr["mw"]
+        mh = lyr["mh"]
+        act = lyr["act"]
+        lyr["W"] = gen_finn_dt_tensor(wdt, (mw, mh))
+        if act is None:
+            # no activation, produce accumulators
+            T = None
+            tdt = None
+            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+                odt = DataType.UINT32
+            else:
+                odt = DataType.INT32
+        else:
+            odt = act
+            (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+            n_steps = act.get_num_possible_values() - 1
+            T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+            # provide non-decreasing thresholds
+            T = np.sort(T, axis=1)
+            # generate thresholds for activation
+            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+                tdt = DataType.UINT32
+                # bias thresholds to be positive
+                T = np.ceil((T + mw) / 2)
+                assert (T >= 0).all()
+            else:
+                tdt = DataType.INT32
+        lyr["T"] = T
+        lyr["tdt"] = tdt
+        lyr["odt"] = odt
+        ret.append(lyr)
+
+    return hls_mlp_maker(ret)
+
+
+def hls_mlp_maker(layer_spec):
+    """Create an MLP of given specification using HLSCustomOp instances."""
+
+    current_in_name = ""
+    current_out_name = ""
+    i = 0
+
+    graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[])
+
+    model = helper.make_model(graph, producer_name="finn")
+    model = ModelWrapper(model)
+
+    for lyr in layer_spec:
+        current_W_name = "W_%d" % i
+        current_T_name = "T_%d" % i
+        current_in_name = "act_%d" % i
+        current_out_name = "act_%d" % (i + 1)
+
+        W = lyr["W"]
+        (mw, mh) = W.shape
+        T = lyr["T"]
+        pe = lyr["pe"]
+        simd = lyr["simd"]
+        wdt = lyr["wdt"]
+        idt = lyr["idt"]
+        tdt = lyr["tdt"]
+        odt = lyr["odt"]
+
+        if i == 0:
+            global_in = helper.make_tensor_value_info(
+                current_in_name, TensorProto.FLOAT, [1, mw]
+            )
+            model.graph.input.append(global_in)
+
+        if i == len(layer_spec) - 1:
+            global_out = helper.make_tensor_value_info(
+                current_out_name, TensorProto.FLOAT, [1, mh]
+            )
+            model.graph.output.append(global_out)
+
+        # there are two ways to implement bipolar weights and inputs for
+        # StreamingFC:
+        # - specify their datatypes as such
+        # - specify their datatypes as BINARY and use binaryXnorMode
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            # we'll internally convert weights/inputs to binary and specify the
+            # datatypes as such, and also set the binaryXnorMode attribute to 1
+            export_wdt = DataType.BINARY
+            export_idt = DataType.BINARY
+            binary_xnor_mode = 1
+        else:
+            export_wdt = wdt
+            export_idt = idt
+            binary_xnor_mode = 0
+
+        if T is not None:
+            no_act = 0
+            node_inp_list = [current_in_name, current_W_name, current_T_name]
+            if odt == DataType.BIPOLAR:
+                actval = 0
+            else:
+                actval = odt.min()
+        else:
+            # no thresholds
+            node_inp_list = [current_in_name, current_W_name]
+            actval = 0
+            no_act = 1
+        FCLayer_node = helper.make_node(
+            "StreamingFCLayer_Batch",
+            node_inp_list,
+            [current_out_name],
+            domain="finn.custom_op.fpgadataflow",
+            backend="fpgadataflow",
+            MW=mw,
+            MH=mh,
+            SIMD=simd,
+            PE=pe,
+            inputDataType=export_idt.name,
+            weightDataType=export_wdt.name,
+            outputDataType=odt.name,
+            ActVal=actval,
+            binaryXnorMode=binary_xnor_mode,
+            noActivation=no_act,
+        )
+
+        model.graph.node.append(FCLayer_node)
+        model.set_tensor_datatype(current_in_name, idt)
+        model.set_tensor_datatype(current_out_name, odt)
+        model.set_tensor_datatype(current_W_name, wdt)
+        if binary_xnor_mode:
+            # convert bipolar to binary
+            model.set_initializer(current_W_name, (W + 1) / 2)
+        else:
+            model.set_initializer(current_W_name, W)
+        if T is not None:
+            model.set_tensor_datatype(current_T_name, tdt)
+            model.set_initializer(current_T_name, T)
+        i += 1
+
+    return model
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index 24933759830535dfcec768d47a6020b4f3e2de35..cf3e064804216e192909eae75f01880554f03d9f 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -51,11 +51,10 @@ def test_code_gen_trafo():
         "StreamingFCLayer_Batch",
         node_inp_list,
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         code_gen_dir="",
         executable_path="",
-        resType="ap_resource_lut()",
         MW=mw,
         MH=mh,
         SIMD=simd,
diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py
index 65894e02e490f6931e5b03a9aa67b8f22e32583a..a12c69285b7b335f075d8ffd7ba27e039ebc6f8c 100644
--- a/tests/fpgadataflow/test_compilation_trafo.py
+++ b/tests/fpgadataflow/test_compilation_trafo.py
@@ -53,11 +53,10 @@ def test_compilation_trafo():
         "StreamingFCLayer_Batch",
         node_inp_list,
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         code_gen_dir="",
         executable_path="",
-        resType="ap_resource_lut()",
         MW=mw,
         MH=mh,
         SIMD=simd,
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index aaffa3f7ed28116a9c1de9dd3b9dacba19954ee1..9d350a9342e3de56cbbb5b3fc4abec69bfc254dc 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -47,7 +47,7 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.custom_op.registry import getCustomOp
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index 86409feffd120b1baeeee471415e93f29d9e655a..e8f3c3ae3290b5bdc23e46f7e9991222fdfac000 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -90,7 +90,7 @@ def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, id
         "QuantAvgPool2d",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.general",
         stride=stride,
         kernel=k,
         ibits=idt.bitwidth(),
diff --git a/tests/fpgadataflow/test_create_dataflow_partition.py b/tests/fpgadataflow/test_create_dataflow_partition.py
deleted file mode 100644
index 6732b92ae0865e390002bd3c65dfefe3890610e2..0000000000000000000000000000000000000000
--- a/tests/fpgadataflow/test_create_dataflow_partition.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import os.path
-from pkgutil import get_data
-
-
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.registry import getCustomOp
-from finn.transformation.fpgadataflow.create_dataflow_partition import (
-    CreateDataflowPartition,
-)
-from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
-from finn.util.basic import make_build_dir
-from finn.util.test import load_test_checkpoint_or_skip
-
-build_dir = make_build_dir("test_dataflow_partition_")
-
-
-def test_dataflow_partition_create():
-    # load the onnx model
-    raw_m = get_data(
-        "finn.qnn-data", "onnx/finn-hls-model/tfc_w1_a1_after_conv_to_hls.onnx"
-    )
-    model = ModelWrapper(raw_m)
-    model = model.transform(CreateDataflowPartition())
-    assert model.graph.node[2].op_type == "StreamingDataflowPartition"
-    sdp_node = getCustomOp(model.graph.node[2])
-    assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
-    assert os.path.isfile(sdp_node.get_nodeattr("model"))
-    model.save(build_dir + "/test_dataflow_partition_create.onnx")
-
-
-def test_dataflow_partition_tlastmarker():
-    model = load_test_checkpoint_or_skip(
-        build_dir + "/test_dataflow_partition_create.onnx"
-    )
-    model_path = getCustomOp(model.graph.node[2]).get_nodeattr("model")
-    model = ModelWrapper(model_path)
-    model = model.transform(InsertTLastMarker())
-    assert model.graph.node[-1].op_type == "TLastMarker"
-    assert model.graph.node[-1].domain == "finn"
-    tl_node = getCustomOp(model.graph.node[-1])
-    assert tl_node.get_nodeattr("NumIters") == 1
-    assert tl_node.get_nodeattr("StreamWidth") == 320
-    assert tl_node.get_nodeattr("ElemWidth") == 32
-    model.save(build_dir + "/test_dataflow_partition_tlastmarker.onnx")
-    model = model.transform(InsertTLastMarker())
-    model.save(build_dir + "/test_dataflow_partition_tlastmarker2.onnx")
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
index f269a1ed7247503f561425b97115694503522171..7c608fc3863ab72d1097f49b793af73664b2be48 100644
--- a/tests/fpgadataflow/test_depthwise_convolution.py
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -43,7 +43,7 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
 import finn.core.onnx_exec as oxe
-from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
 from finn.custom_op.registry import getCustomOp
 
@@ -70,7 +70,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
         tdt = DataType.INT32
         thresh_node = oh.make_node(
             "MultiThreshold",
-            domain="finn",
+            domain="finn.custom_op.general",
             inputs=["outp", "T"],
             outputs=["out_act"],
             data_layout="NHWC",
@@ -93,7 +93,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
 
     im2col_node = oh.make_node(
         "Im2Col",
-        domain="finn",
+        domain="finn.custom_op.general",
         inputs=["inp"],
         outputs=["im2col_out"],
         kernel_size=k,
diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
index e4191c75d6249d22b36e41fed50c5f7896f13c22..0fa156e23b4a01270297e4e8e1fdc13a75eb5a59 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
@@ -55,7 +55,7 @@ def make_addstreams_modelwrapper(ch, pe, idt):
         "AddStreams_Batch",
         ["inp1", "inp2"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=ch,
         PE=pe,
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
index 0e0ce7d542f605cc6af5df13b45d670cfcafa5a9..e45dfe07c3abc0ce218dee0563055acb4458ccd0 100644
--- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -61,7 +61,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
         "ChannelwiseOp_Batch",
         node_inp_list,
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=NumChannels,
         Func=func,
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 020a2a545dadaf32c469789c90d0ea530688812c..0e2e60534bcc871592128fdbbd5ca52b3cc0fe4f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -60,7 +60,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
         "Im2Col",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.general",
         backend="fpgadataflow",
         stride=stride,
         kernel_size=k,
@@ -96,7 +96,7 @@ def make_single_slidingwindow_modelwrapper(
         "ConvolutionInputGenerator",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         ConvKernelDim=k,
         IFMChannels=ifm_ch,
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 47401ddb9546d5b32a5d36c6731981aabe0ca7cd..12505fdf456aa55f881fb5f3d2d609080cc97074 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -58,7 +58,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt):
         "DuplicateStreams_Batch",
         ["inp"],
         ["outp0", "outp1"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=ch,
         PE=pe,
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index 90b3145805f0c1ba59c7225b121b14b124ffe878..34930e672f3ff9816d3328da102b1bc1daa8a3b1 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -50,7 +50,7 @@ def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype):
         "StreamingDataWidthConverter_Batch",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         shape=Shape,
         inWidth=INWidth,
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index feff580002c317a3a8754dba2b6a9f291e408ac5..00f1ba5d59288b1a463fadbd684ff872269d6970 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -33,11 +33,11 @@ from onnx import TensorProto, helper
 
 from finn.custom_op.registry import getCustomOp
 import finn.core.onnx_exec as oxe
-import finn.custom_op.xnorpopcount as xp
+import finn.custom_op.general.xnorpopcount as xp
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.multithreshold import multithreshold
+from finn.custom_op.general.multithreshold import multithreshold
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -88,9 +88,8 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non
         "StreamingFCLayer_Batch",
         node_inp_list,
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=mw,
         MH=mh,
         SIMD=simd,
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index 77c518966c15ae002b6e88c244c1ee9e853c29aa..1f1c9936139df4160bd08a0e168d1f4b7e639077 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -55,7 +55,7 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
         "StreamingFIFO",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         depth=Depth,
         folded_shape=fld_shape,
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 8b38b2520c2239be822093da70fb29f6545c0b43..b2835d578b03ee689330d53a9a7b233c9b9f4222 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -69,7 +69,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
         "FMPadding_Batch",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         ImgDim=idim,
         Padding=padding,
diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
index 191e00022a0b0ab11fcf4d1a476442cbd824408d..7fca91925a63a5da4294adb002a3cc97831a88ca 100644
--- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
@@ -54,7 +54,7 @@ def make_accpool_modelwrapper(ch, pe, idim, idt):
         "GlobalAccPool_Batch",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=ch,
         PE=pe,
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 2e9d25778147b1aa774c56f877c35d094c62e2bc..306844c7ef3828d8483d3b0006491864f1525e21 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -88,9 +88,8 @@ def create_one_fc_model(mem_mode="const"):
         "StreamingFCLayer_Batch",
         ["inp", "w0"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=m,
         MH=m,
         SIMD=simd,
@@ -143,9 +142,8 @@ def create_two_fc_model(mem_mode="decoupled"):
         "StreamingFCLayer_Batch",
         ["inp", "w0"],
         ["mid"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=m,
         MH=m,
         SIMD=simd,
@@ -163,9 +161,8 @@ def create_two_fc_model(mem_mode="decoupled"):
         "StreamingFCLayer_Batch",
         ["mid", "w1"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=m,
         MH=m,
         SIMD=simd,
@@ -263,23 +260,23 @@ def test_fpgadataflow_ipstitch_rtlsim(mem_mode):
         "m_axis_0_tlast",
         "m_axis_0_tready",
         "m_axis_0_tvalid",
-        "s_axi_control_araddr",
-        "s_axi_control_arready",
-        "s_axi_control_arvalid",
-        "s_axi_control_awaddr",
-        "s_axi_control_awready",
-        "s_axi_control_awvalid",
-        "s_axi_control_bready",
-        "s_axi_control_bresp",
-        "s_axi_control_bvalid",
-        "s_axi_control_rdata",
-        "s_axi_control_rready",
-        "s_axi_control_rresp",
-        "s_axi_control_rvalid",
-        "s_axi_control_wdata",
-        "s_axi_control_wready",
-        "s_axi_control_wstrb",
-        "s_axi_control_wvalid",
+        "s_axi_control_0_araddr",
+        "s_axi_control_0_arready",
+        "s_axi_control_0_arvalid",
+        "s_axi_control_0_awaddr",
+        "s_axi_control_0_awready",
+        "s_axi_control_0_awvalid",
+        "s_axi_control_0_bready",
+        "s_axi_control_0_bresp",
+        "s_axi_control_0_bvalid",
+        "s_axi_control_0_rdata",
+        "s_axi_control_0_rready",
+        "s_axi_control_0_rresp",
+        "s_axi_control_0_rvalid",
+        "s_axi_control_0_wdata",
+        "s_axi_control_0_wready",
+        "s_axi_control_0_wstrb",
+        "s_axi_control_0_wvalid",
     ]
     assert sorted(dir(sim.io)) == sorted(exp_io)
     model.set_metadata_prop("exec_mode", "rtlsim")
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index dae91b94120e94eb86bbc4b958decd581f36e671..5d496dbb33d21c9092fb2076cac75b3ccbbaa1e9 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -53,7 +53,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt):
         "LabelSelect_Batch",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         Labels=labels,
         PE=pe,
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 398a17132a2ef6c92e600102ff5c0b71a1f65aaa..06ebd90000e7466b2781d3284c5a0a0e56733dea 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -28,7 +28,10 @@
 
 from onnx import TensorProto, helper
 
-from finn.analysis.fpgadataflow.res_estimation import res_estimation
+from finn.analysis.fpgadataflow.res_estimation import (
+    res_estimation,
+    res_estimation_complete,
+)
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.general import GiveUniqueNodeNames
@@ -53,7 +56,7 @@ def test_res_estimate():
     pe = 1
     idt = DataType.INT2
     wdt = DataType.INT2
-    odt = DataType.INT32
+    odt = DataType.INT2
     actval = odt.min()
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, mw])
@@ -64,9 +67,8 @@ def test_res_estimate():
         "StreamingFCLayer_Batch",
         node_inp_list,
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        resType="ap_resource_lut()",
         MW=mw,
         MH=mh,
         SIMD=simd,
@@ -92,10 +94,29 @@ def test_res_estimate():
     model = model.transform(GiveUniqueNodeNames())
     prod_resource_estimation = model.analysis(res_estimation)
     expect_resource_estimation = {
-        "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, 'BRAM_efficiency': 0.001736111111111111, "LUT": 304.4}
+        "StreamingFCLayer_Batch_0": {
+            "BRAM_18K": 0,
+            "BRAM_efficiency": 1,
+            "LUT": 357,
+            "DSP": 0,
+            "URAM": 0,
+        }
     }
 
     assert check_two_dict_for_equality(
         prod_resource_estimation, expect_resource_estimation
     ), """The produced output of
-    the resource estimation analysis pass is not equal to the expected one"""
+    the res_estimation analysis pass is not equal to the expected one"""
+
+    prod_resource_estimation = model.analysis(res_estimation_complete)
+    expect_resource_estimation = {
+        "StreamingFCLayer_Batch_0": [
+            {"BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 352, "DSP": 1, "URAM": 0},
+            {"BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 357, "DSP": 0, "URAM": 0},
+        ]
+    }
+
+    assert check_two_dict_for_equality(
+        prod_resource_estimation, expect_resource_estimation
+    ), """The produced output of
+    the res_estimation_complete analysis pass is not equal to the expected one"""
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 8461efd15576fc04906b7f48b2629ad83835de38..5d46f4c3db35c159458dfc9e0eb8aae8ee89cb20 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -35,7 +35,7 @@ import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.multithreshold import multithreshold
+from finn.custom_op.general.multithreshold import multithreshold
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -67,7 +67,7 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode):
         "Thresholding_Batch",
         node_inp_list,
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=NumChannels,
         PE=pe,
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index d61edc86dd6b5669c334e6b7f78ea9a8550cae93..ff88536f477e80e5c92a2c352f0af81488997c7f 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -59,7 +59,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
         "MaxPoolNHWC",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.general",
         kernel_shape=[k, k],
         strides=[k, k],
         pads=[0, 0, 0, 0],
@@ -90,7 +90,7 @@ def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
         "StreamingMaxPool_Batch",
         ["inp"],
         ["outp"],
-        domain="finn",
+        domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         PoolDim=k,
         NumChannels=ifm_ch,
diff --git a/tests/transformation/streamline/test_move_chw_add_past_conv.py b/tests/transformation/streamline/test_move_chw_add_past_conv.py
index b626f7e5b8564739ec383aaddfc262d642bf47cc..fc64a04e40036eae7057c15f4e628155bd563e51 100644
--- a/tests/transformation/streamline/test_move_chw_add_past_conv.py
+++ b/tests/transformation/streamline/test_move_chw_add_past_conv.py
@@ -34,7 +34,7 @@ from onnx import helper, TensorProto
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline.reorder import MoveAddPastConv
-from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.general.im2col import compute_conv_output_dim
 import finn.core.onnx_exec as oxe
 
 
diff --git a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
index 2fc19debf8d6fc89d15e3d731f1e54daa491c321..7c49baf8cd9d5b85b3b76f3513d42483d3bbeb0c 100644
--- a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
+++ b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
@@ -52,7 +52,7 @@ def test_move_maxpool_past_multithreshold():
             "MultiThreshold",
             ["t1", "thres1"],
             ["t2"],
-            domain="finn",
+            domain="finn.custom_op.general",
             out_dtype="BIPOLAR",
             out_bias=-1.0,
             out_scale=1.0,
@@ -64,7 +64,7 @@ def test_move_maxpool_past_multithreshold():
             "MultiThreshold",
             ["t3", "thres2"],
             ["top_out"],
-            domain="finn",
+            domain="finn.custom_op.general",
             out_dtype="UINT4",
         )
     ]
diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
index 1ae8fbfe89986d58d3d71f5f8735a98469d9d1e3..5e96d15867b087fbb5f4f1b467aea34cb33e3ff4 100644
--- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py
+++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
@@ -1,7 +1,7 @@
 import pytest
 
 from onnx import helper, TensorProto
-from finn.custom_op.im2col import compute_conv_output_dim
+from finn.custom_op.general.im2col import compute_conv_output_dim
 import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py
index d59aba996201f8c2fc67cf6e40497b5d43611d39..f9259908a2b4e4d716e3fb9ae7ec28cd9ec85d03 100644
--- a/tests/transformation/streamline/test_round_thresholds.py
+++ b/tests/transformation/streamline/test_round_thresholds.py
@@ -40,7 +40,7 @@ def test_round_thresholds():
     thresholds = helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, [4, 1])
     out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4])
     node_def = helper.make_node(
-        "MultiThreshold", ["v", "thresholds"], ["out"], domain="finn"
+        "MultiThreshold", ["v", "thresholds"], ["out"], domain="finn.custom_op.general"
     )
     graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out])
     model_def = helper.make_model(graph_def)
diff --git a/tests/util/test_create.py b/tests/util/test_create.py
new file mode 100644
index 0000000000000000000000000000000000000000..42a288b74ecda9746296519b1b86563c75b2752e
--- /dev/null
+++ b/tests/util/test_create.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import finn.util.create as create
+from finn.core.datatype import DataType
+
+
+@pytest.mark.parametrize("bitwidth", [DataType.BIPOLAR, DataType.INT2, DataType.INT4])
+def test_hls_random_mlp_maker(bitwidth):
+    w = bitwidth
+    a = bitwidth
+    layer_spec = [
+        {
+            "mw": 185,
+            "mh": 100,
+            "simd": 185,
+            "pe": 100,
+            "idt": DataType.BIPOLAR,
+            "wdt": w,
+            "act": a,
+        },
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
+        {
+            "mw": 100,
+            "mh": 1,
+            "simd": 100,
+            "pe": 1,
+            "idt": a,
+            "wdt": w,
+            "act": DataType.BIPOLAR,
+        },
+    ]
+
+    ret = create.hls_random_mlp_maker(layer_spec)
+    assert len(ret.graph.node) == 5
+    # ret.save("mlp-%s.onnx" % str(bitwidth))