diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index f8e15f34fb4da3dc4ee353a29d26866b68879144..db49dceb2d06670dfc43059d3a4fa6160a8ded58 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -49,13 +49,14 @@ RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 RUN rm requirements.txt
-RUN pip install jupyter
-RUN pip install matplotlib
-RUN pip install pytest-dependency
-RUN pip install sphinx
-RUN pip install sphinx_rtd_theme
-RUN pip install pytest-xdist
-RUN pip install pytest-parallel
+RUN pip install jupyter==1.0.0
+RUN pip install matplotlib==3.3.1 --ignore-installed certifi
+RUN pip install pytest-dependency==0.5.1
+RUN pip install sphinx==3.1.2
+RUN pip install sphinx_rtd_theme==0.5.0
+RUN pip install pytest-xdist==2.0.0
+RUN pip install pytest-parallel==0.1.0
+RUN pip install netron==4.4.7
 
 # switch user
 RUN groupadd -g $GID $GNAME
@@ -80,19 +81,6 @@ RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
 # oh-my-xilinx
 RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
-# netron
-RUN git clone https://github.com/lutzroeder/netron.git /workspace/netron
-
-# build and install netron
-USER root
-RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
-RUN apt-get install -y nodejs
-WORKDIR /workspace/netron
-RUN git checkout 376e9d33733a3eacfe3c432808fd46e6cd1460cb
-RUN npm install
-RUN python setup.py build
-RUN pip install /workspace/netron
-USER $UNAME
 
 # for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host
 # at /workspace/finn -- see run-docker.sh for an example of how to do this.
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
index 02e014cd3cc7bb88eebd02f03ff599913079152b..b06feccdc578a59c8ef00531871e1211c2a407e5 100755
--- a/docker/quicktest.sh
+++ b/docker/quicktest.sh
@@ -16,6 +16,11 @@ elif [ $1 = "rtlsim" ]; then
 elif [ $1 = "end2end" ]; then
   echo "Running end2end test suite with no parallelism"
   python setup.py test --addopts "-k end2end"
+elif [ $1 = "full" ]; then
+  echo "Running full test suite, each step with appropriate parallelism"
+  $0 main;
+  $0 rtlsim;
+  $0 end2end;
 else
   echo "Unrecognized argument to quicktest.sh"
 fi
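
For reference, the new `full` mode simply re-invokes the script once per suite, so running `./docker/quicktest.sh full` from the repo root inside the FINN container executes `main`, `rtlsim` and `end2end` back to back, each with its own parallelism settings.
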
diff --git a/requirements.txt b/requirements.txt
index b15d86ed89f7b0e76b772ce42aba6481937310b0..4aa1cbe3484a3447851879d7da9ce9d48b066592 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,11 @@
-bitstring
-docrep
-future
+bitstring==3.1.7
+docrep==0.2.7
+future==0.18.2
 numpy==1.18.0
 onnx==1.6.0
 onnxruntime==1.2.0
-pre-commit
-pyverilator
-scipy
-sphinx
-toposort
-vcdvcd
-wget
+pre-commit==2.6.0
+scipy==1.5.2
+toposort==1.5
+vcdvcd==1.0.5
+wget==3.2
diff --git a/src/finn/core/datatype.py b/src/finn/core/datatype.py
index 222d11a8872f9be757fd60fbfa5f8abea683311a..df895a1ad446d6b2cc3ebb24f1179944f4cfe9ab 100644
--- a/src/finn/core/datatype.py
+++ b/src/finn/core/datatype.py
@@ -50,17 +50,69 @@ class DataType(Enum):
     UINT2 = auto()
     UINT3 = auto()
     UINT4 = auto()
+    UINT5 = auto()
+    UINT6 = auto()
+    UINT7 = auto()
     UINT8 = auto()
+    UINT9 = auto()
+    UINT10 = auto()
+    UINT11 = auto()
+    UINT12 = auto()
+    UINT13 = auto()
+    UINT14 = auto()
+    UINT15 = auto()
     UINT16 = auto()
+    UINT17 = auto()
+    UINT18 = auto()
+    UINT19 = auto()
+    UINT20 = auto()
+    UINT21 = auto()
+    UINT22 = auto()
+    UINT23 = auto()
+    UINT24 = auto()
+    UINT25 = auto()
+    UINT26 = auto()
+    UINT27 = auto()
+    UINT28 = auto()
+    UINT29 = auto()
+    UINT30 = auto()
+    UINT31 = auto()
     UINT32 = auto()
+    UINT64 = auto()
     BIPOLAR = auto()
     TERNARY = auto()
     INT2 = auto()
     INT3 = auto()
     INT4 = auto()
+    INT5 = auto()
+    INT6 = auto()
+    INT7 = auto()
     INT8 = auto()
+    INT9 = auto()
+    INT10 = auto()
+    INT11 = auto()
+    INT12 = auto()
+    INT13 = auto()
+    INT14 = auto()
+    INT15 = auto()
     INT16 = auto()
+    INT17 = auto()
+    INT18 = auto()
+    INT19 = auto()
+    INT20 = auto()
+    INT21 = auto()
+    INT22 = auto()
+    INT23 = auto()
+    INT24 = auto()
+    INT25 = auto()
+    INT26 = auto()
+    INT27 = auto()
+    INT28 = auto()
+    INT29 = auto()
+    INT30 = auto()
+    INT31 = auto()
     INT32 = auto()
+    INT64 = auto()
     FLOAT32 = auto()
 
     def bitwidth(self):
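
With the enum filled in for every integer width up to 32 bits (plus 64), narrow accumulator types can now be named directly. A minimal sketch of the lookups the new code relies on, assuming a working FINN environment (expected outputs in comments):

```python
from finn.core.datatype import DataType

adt = DataType.INT17
print(adt.bitwidth())        # 17
print(adt.min(), adt.max())  # -65536 65535
# name-based lookup, as used when updating accDataType node attributes
assert DataType["UINT24"].bitwidth() == 24
# narrowest type able to hold a given value
print(DataType.get_smallest_possible(-129).name)  # INT9
```
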
diff --git a/src/finn/core/modelwrapper.py b/src/finn/core/modelwrapper.py
index 646add188c5d475cf37ccd33cf24d29d61754ae1..98b234592ebe0c704fafd1eed980325d8566e7e2 100644
--- a/src/finn/core/modelwrapper.py
+++ b/src/finn/core/modelwrapper.py
@@ -36,6 +36,11 @@ from onnx import TensorProto
 import finn.util.basic as util
 import finn.util.onnx as onnxutil
 from finn.core.datatype import DataType
+from finn.transformation.general import (
+    RemoveUnusedTensors,
+    RemoveStaticGraphInputs,
+    SortGraph,
+)
 
 
 class ModelWrapper:
@@ -87,7 +92,7 @@ class ModelWrapper:
         """Runs given anaylsis_fxn on this model and return resulting dict."""
         return analysis_fxn(self)
 
-    def transform(self, transformation, make_deepcopy=True):
+    def transform(self, transformation, make_deepcopy=True, cleanup=True):
         """Applies given Transformation repeatedly until no more changes can be made
         and returns a transformed ModelWrapper instance.
 
@@ -101,6 +106,22 @@ class ModelWrapper:
             (transformed_model, model_was_changed) = transformation.apply(
                 transformed_model
             )
+        if cleanup:
+            transformed_model.cleanup()
+        return transformed_model
+
+    def cleanup(self):
+        "Run cleanup transformations on the model."
+        transformed_model = self
+        cleanup_transforms = [
+            RemoveUnusedTensors(),
+            RemoveStaticGraphInputs(),
+            SortGraph(),
+        ]
+        for trn in cleanup_transforms:
+            transformed_model = transformed_model.transform(
+                trn, cleanup=False, make_deepcopy=False
+            )
         return transformed_model
 
     def check_compatibility(self):
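
With the new `cleanup` flag, every `transform()` call now tidies the graph by default. A small usage sketch, assuming a model checkpoint on disk (filename hypothetical):

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.infer_shapes import InferShapes

model = ModelWrapper("model.onnx")  # hypothetical input file
# cleanup (RemoveUnusedTensors, RemoveStaticGraphInputs, SortGraph)
# now runs automatically after each transform...
model = model.transform(InferShapes())
# ...unless explicitly disabled, e.g. to keep static graph inputs around
model = model.transform(InferShapes(), cleanup=False)
# cleanup can also be invoked on its own
model = model.cleanup()
```
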
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 87f52eeea591ba42bf5374df3c93bcc3e4f8e944..ea6922123a1334a7ea0d0568e09c043e06490f38 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -36,6 +36,7 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.util.basic import (
     interleave_matrix_outer_dim_from_partitions,
     roundup_to_integer_multiple,
+    calculate_matvec_accumulator_range,
 )
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
@@ -72,6 +73,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
+            # FINN DataType for accumulator -- auto-computed and updated
+            "accDataType": ("s", False, "INT32"),
             # use xnor-popcount for binary weights/inputs, thus treating them
             # as bipolar
             "binaryXnorMode": ("i", False, 0),
@@ -424,6 +427,51 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         ret = np.flip(ret, axis=-1)
         return ret
 
+    def minimize_accumulator_width(self, model):
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if len(self.onnx_node.input) > 2:
+            thresholds = model.get_initializer(self.onnx_node.input[2])
+        else:
+            thresholds = None
+        idt = self.get_input_datatype()
+        # calculate minimum and maximum values of accumulator
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        if thresholds is not None:
+            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+            # set threshold datatype (and accumulator datatype implicitly)
+            min_threshold = thresholds.min()
+            max_threshold = thresholds.max()
+            # get range required by threshold values
+            tdt_min = min(acc_min, min_threshold)
+            tdt_max = max(acc_max, max_threshold)
+            if tdt_min < 0:
+                if abs(tdt_min) > tdt_max:
+                    tdt = DataType.get_smallest_possible(tdt_min)
+                else:
+                    tdt = DataType.get_smallest_possible(0 - tdt_max - 1)
+            else:
+                tdt = DataType.get_smallest_possible(tdt_max)
+            assert np.vectorize(tdt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+            self.set_nodeattr("accDataType", tdt.name)
+        else:
+            if acc_min < 0:
+                if abs(acc_min) > acc_max:
+                    adt = DataType.get_smallest_possible(acc_min)
+                else:
+                    adt = DataType.get_smallest_possible(0 - acc_max - 1)
+            else:
+                adt = DataType.get_smallest_possible(acc_max)
+            # ensure a bitwidth divisible by 8 in case this is the last node
+            bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+            new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+            adt = DataType[new_adt_name]
+            self.set_nodeattr("accDataType", adt.name)
+            # for no-activation nodes, output dt = acc dt
+            self.set_nodeattr("outputDataType", adt.name)
+        return DataType[self.get_nodeattr("accDataType")]
+
     def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         """Convert the original numpy weight matrix orig_weight_matrix into
         a form suitable for passing to the hlslib call:
@@ -573,7 +621,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             thresholds = model.get_initializer(self.onnx_node.input[2])
             if thresholds is not None:
                 threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-                tdt = DataType.INT32
                 # use UINT32 threshold export for bipolar times bipolar
                 inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
                 wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
@@ -583,11 +630,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
                 inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
                 wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
-                if inp_is_bipolar and wt_is_bipolar:
-                    tdt = DataType.UINT32
+                # get computed threshold datatype from attribute
+                tdt = DataType[self.get_nodeattr("accDataType")]
+
                 assert np.vectorize(tdt.allowed)(
                     threshold_tensor
-                ).all(), "Thresholds are not int"
+                ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
                 thresholds_hls_code = numpy_to_hls_code(
                     threshold_tensor, tdt, "thresholds", False, True
                 )
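
The type-selection logic in `minimize_accumulator_width` can be exercised standalone. A sketch with toy weights (all values illustrative), mirroring the no-threshold branch above:

```python
import numpy as np
from finn.core.datatype import DataType
from finn.util.basic import calculate_matvec_accumulator_range

# toy 4-input, 2-output weight matrix driven by INT4 inputs
W = np.array([[3, -2], [1, 4], [-5, 2], [2, -1]])
acc_min, acc_max = calculate_matvec_accumulator_range(W, DataType.INT4)
# acc_min/acc_max bound every possible dot-product result, so the
# accumulator type only needs to cover [acc_min, acc_max]
if acc_min < 0:
    if abs(acc_min) > acc_max:
        adt = DataType.get_smallest_possible(acc_min)
    else:
        adt = DataType.get_smallest_possible(0 - acc_max - 1)
else:
    adt = DataType.get_smallest_possible(acc_max)
print(acc_min, acc_max, adt.name)  # -140 160 INT9
```
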
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 379ebd92d86d54c6bc621c7f89b01eacba2b5d3f..562bab0f18990096f7364b3a4e2bcbbbf4ce2b58 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -283,10 +283,25 @@ class Thresholding_Batch(HLSCustomOp):
         thresholds = model.get_initializer(self.onnx_node.input[1])
 
         threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-        tdt = DataType.INT32
+
+        min_threshold = thresholds.min()
+        max_threshold = thresholds.max()
+        min_input = self.get_input_datatype().min()
+        max_input = self.get_input_datatype().max()
+        # get range required by threshold values
+        tdt_min = min(min_input, min_threshold)
+        tdt_max = max(max_input, max_threshold)
+        if tdt_min < 0:
+            if abs(tdt_min) > tdt_max:
+                tdt = DataType.get_smallest_possible(tdt_min)
+            else:
+                tdt = DataType.get_smallest_possible(0 - tdt_max - 1)
+        else:
+            tdt = DataType.get_smallest_possible(tdt_max)
         assert np.vectorize(tdt.allowed)(
             threshold_tensor
-        ).all(), "Thresholds are not int"
+        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+
         thresholds_hls_code = numpy_to_hls_code(
             threshold_tensor, tdt, "thresholds", False, True
         )
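
The `0 - tdt_max - 1` form matters in the corner case where the positive bound is an exact power of two; a short demonstration, assuming a FINN environment:

```python
from finn.core.datatype import DataType

tdt_max = 128  # e.g. a threshold sitting exactly at +2**7
# get_smallest_possible(-128) returns INT8, whose max is only 127
narrow = DataType.get_smallest_possible(0 - tdt_max)
safe = DataType.get_smallest_possible(0 - tdt_max - 1)
print(narrow.name, narrow.max())  # INT8 127 -- cannot hold +128
print(safe.name, safe.max())      # INT9 255 -- covers +128
```
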
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index e6dca0e4b05f943c971bc0f97af03f5038fd0dab..88f5fa926f73d5cb1919a02c83153cb8d1894711 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -40,6 +40,9 @@ from finn.transformation.general import SortGraph
 import finn.core.data_layout as DataLayout
 from finn.util.onnx import nchw_to_nhwc
 from finn.util.basic import get_by_name
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
 
 
 class InferConvInpGen(Transformation):
@@ -489,6 +492,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                     graph.node.remove(n)
                     graph_modified = True
         if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
@@ -623,6 +627,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
                         graph.node.remove(n)
                         graph_modified = True
         if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
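
Since accumulator minimization is now folded into the conversion transforms, callers get it for free. A usage sketch (checkpoint name hypothetical, constructor arguments elided):

```python
from finn.core.modelwrapper import ModelWrapper
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

model = ModelWrapper("streamlined_model.onnx")  # hypothetical checkpoint
# MinimizeAccumulatorWidth now runs automatically whenever the conversion
# changed the graph, before shapes and datatypes are re-inferred
model = model.transform(to_hls.InferQuantizedStreamingFCLayer())
```
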
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index 5ec4ab14d65d63523856a6bb107bf75c1ca5a261..fb8b4358abd772d13c355f797649dc3b51975b4d 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -112,6 +112,7 @@ class CreateDataflowPartition(Transformation):
                     "dataflow_partition" + str(target_partition_id) + "_"
                 )
                 df_model_filename = df_model_dir + "/df_model.onnx"
+                df_model.cleanup()
                 df_model.save(df_model_filename)
                 # remove all dataflow nodes from the non-dataflow model
                 # keep track of where the dataflow part starts
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 1558d7399fe5399053c3f347cd06c4e0d76753e7..095327be0d3c36f201bcf343d8aea61aa069b8e1 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -294,11 +294,12 @@ class ZynqBuild(Transformation):
         # Build each kernel individually
         sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition")
         for sdp_node in sdp_nodes:
+            prefix = sdp_node.name + "_"
             sdp_node = getCustomOp(sdp_node)
             dataflow_model_filename = sdp_node.get_nodeattr("model")
             kernel_model = ModelWrapper(dataflow_model_filename)
             kernel_model = kernel_model.transform(InsertFIFO())
-            kernel_model = kernel_model.transform(GiveUniqueNodeNames())
+            kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix))
             kernel_model.save(dataflow_model_filename)
             kernel_model = kernel_model.transform(
                 PrepareIP(self.fpga_part, self.period_ns)
diff --git a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c54a5efbd3b28f0fbfd074b512929edab234e78
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+class MinimizeAccumulatorWidth(Transformation):
+    """For relevant nodes, call the accumulator width minimization
+    functions to save on resources. May alter tensor DataType for
+    certain nodes if they produce an accumulator as result."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        for node in model.graph.node:
+            if is_fpgadataflow_node(node):
+                inst = getCustomOp(node)
+                if hasattr(inst, "minimize_accumulator_width"):
+                    inst.minimize_accumulator_width(model)
+        return (model, False)
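
The transformation can also be applied on its own, e.g. after editing weights. A minimal sketch (filename hypothetical):

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.fpgadataflow.minimize_accumulator_width import (
    MinimizeAccumulatorWidth,
)

model = ModelWrapper("dataflow_model.onnx")  # hypothetical checkpoint
# shrinks accDataType on every fpgadataflow node that supports it
model = model.transform(MinimizeAccumulatorWidth())
```
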
diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py
index 4303eb17f39a9949f5729e895e449bbb6a633033..8ad59d2baf3015cfebffeff88a059f48d9428371 100644
--- a/src/finn/transformation/general.py
+++ b/src/finn/transformation/general.py
@@ -81,14 +81,19 @@ class RemoveStaticGraphInputs(Transformation):
 
 
 class GiveUniqueNodeNames(Transformation):
-    """Give unique names to each node in the graph using enumeration."""
+    """Give unique names to each node in the graph using enumeration, starting
+    with given prefix (if specified in the constructor)."""
+
+    def __init__(self, prefix=""):
+        super().__init__()
+        self.prefix = prefix
 
     def apply(self, model):
         optype_count = {}
         for n in model.graph.node:
             if n.op_type not in optype_count.keys():
                 optype_count[n.op_type] = 0
-            n.name = "%s_%d" % (n.op_type, optype_count[n.op_type])
+            n.name = "%s%s_%d" % (self.prefix, n.op_type, optype_count[n.op_type])
             optype_count[n.op_type] += 1
         # return model_was_changed = False as single iteration is always enough
         return (model, False)
@@ -189,6 +194,9 @@ class SortGraph(Transformation):
     # Probably this is faster than copying initializers and more robust in general
 
     def apply(self, model):
+        if len(model.graph.node) == 1:
+            # single-node graph, nothing to sort
+            return (model, False)
         # Gather graph structure
         graph_dependencies = {}
         node_list = [
@@ -214,7 +222,7 @@ class SortGraph(Transformation):
         for new_idx, sorted_idx in enumerate(sorted_node_indexes):
             model.graph.node.insert(new_idx, node_list[sorted_idx])
 
-        return model, False
+        return (model, False)
 
 
 class ConvertSubToAdd(Transformation):
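
A sketch of the prefix behavior, as ZynqBuild now uses it to keep node names unique across stitched partitions (filename and resulting names illustrative):

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.general import GiveUniqueNodeNames

model = ModelWrapper("kernel.onnx")  # hypothetical partition model
# without a prefix: MatMul_0, MatMul_1, ...
model = model.transform(GiveUniqueNodeNames())
# with a prefix, as ZynqBuild passes per partition:
# StreamingDataflowPartition_0_MatMul_0, ...
model = model.transform(GiveUniqueNodeNames("StreamingDataflowPartition_0_"))
```
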
diff --git a/src/finn/transformation/merge_onnx_models.py b/src/finn/transformation/merge_onnx_models.py
index 5dc6127ed189311c72a119932394aca4745e3608..ceacab197150fe6d32e3a9eda268aed186b1a8bc 100644
--- a/src/finn/transformation/merge_onnx_models.py
+++ b/src/finn/transformation/merge_onnx_models.py
@@ -31,12 +31,12 @@ from onnx import helper
 
 from finn.transformation import Transformation
 from finn.core.modelwrapper import ModelWrapper
-import finn.util.basic as util
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.general import (
     GiveReadableTensorNames,
+    GiveRandomTensorNames,
     GiveUniqueNodeNames,
     GiveUniqueParameterTensors,
 )
@@ -59,6 +59,9 @@ class MergeONNXModels(Transformation):
         graph_modified = False
         pre_model = self.pre_model
         post_model = copy.deepcopy(model)
+        # to avoid mix-ups, start by giving all tensors random names
+        pre_model = pre_model.transform(GiveRandomTensorNames())
+        post_model = post_model.transform(GiveRandomTensorNames())
 
         # check for dynamic outputs of pre model
         dyn_outp = []
@@ -94,27 +97,6 @@ class MergeONNXModels(Transformation):
         for n in post_model.graph.node:
             n.name = ""
 
-        # randomize all tensor names
-        names1 = pre_model.get_all_tensor_names()
-        names2 = post_model.get_all_tensor_names()
-        used_names = names1 + names2
-
-        # pre_model
-        for tensor_name in names1:
-            new_name = util.random_string()
-            while new_name in used_names:
-                new_name = util.random_string()
-            pre_model.rename_tensor(tensor_name, new_name)
-            used_names.append(new_name)
-
-        # post_model
-        for tensor in names2:
-            new_name = util.random_string()
-            while new_name in used_names:
-                new_name = util.random_string()
-            post_model.rename_tensor(tensor_name, new_name)
-            used_names.append(new_name)
-
         # check if models can be merged
         output_model_a = dyn_outp[0].name
         input_model_b = dyn_inp[0].name
@@ -132,83 +117,43 @@ class MergeONNXModels(Transformation):
         # extract information for new model
 
         # nodes
-        node_list_a = pre_model.graph.node
-        node_list_b = post_model.graph.node
-
-        node_list = node_list_a
-        for node in node_list_b:
-            node_list.append(node)
+        node_pre = [node for node in pre_model.graph.node]
+        node_post = [node for node in post_model.graph.node]
+        node_new = node_pre + node_post
 
         # in and output
         inp = pre_model.graph.input[0]
         outp = post_model.graph.output[0]
 
+        vi_pre = [x for x in pre_model.graph.value_info]
+        out_pre = [x for x in pre_model.graph.output]
+        qa_pre = [x for x in pre_model.graph.quantization_annotation]
+        init_pre = [x for x in pre_model.graph.initializer]
+
+        vi_post = [x for x in post_model.graph.value_info]
+        qa_post = [x for x in post_model.graph.quantization_annotation]
+        init_post = [x for x in post_model.graph.initializer]
+
+        vi_new = vi_pre + vi_post + out_pre
+        qa_new = qa_pre + qa_post
+        init_new = init_pre + init_post
+
         # create new graph and model
         new_graph = helper.make_graph(
-            nodes=node_list,
+            nodes=node_new,
             name="fuse-graph",
             inputs=[inp],
             outputs=[outp],
-            value_info=[],
+            value_info=vi_new,
         )
 
         new_model = helper.make_model(new_graph, producer_name="fuse_model")
         new_model = ModelWrapper(new_model)
 
-        # add value info from both models to new model
-        # pre model
-        vi_pre = [x for x in pre_model.graph.input]
-        vi_pre += [x for x in pre_model.graph.output]
-        vi_pre += [x for x in pre_model.graph.value_info]
-        for vi in vi_pre:
-            # preserve intializers, quantization/sparsity annotation, etc.
-            # initializer
-            init_val = pre_model.get_initializer(vi.name)
-            if init_val is not None:
-                new_model.set_initializer(vi.name, init_val)
-            # FINN datatype
-            dtype = pre_model.get_tensor_datatype(vi.name)
-            new_model.set_tensor_datatype(vi.name, dtype)
-            # data layout
-            data_layout = pre_model.get_tensor_layout(vi.name)
-            if data_layout is not None:
-                new_model.set_tensor_layout(vi.name, data_layout)
-            # sparsity
-            sparsity = pre_model.get_tensor_sparsity(vi.name)
-            if sparsity is not None:
-                new_model.set_tensor_sparsity(vi.name, sparsity)
-            # graph input should not be part of graph.value_info, so don't insert
-            # if current vi == inp, but the quantization annotation is preserved
-            if vi == inp:
-                continue
-            new_model.graph.value_info.append(vi)
-
-        # post model
-        vi_model = [x for x in post_model.graph.input]
-        vi_model += [x for x in post_model.graph.output]
-        vi_model += [x for x in post_model.graph.value_info]
-        for vi in vi_model:
-            # preserve intializers, quantization/sparsity annotation, etc.
-            # initializer
-            init_val = post_model.get_initializer(vi.name)
-            if init_val is not None:
-                new_model.set_initializer(vi.name, init_val)
-            # FINN datatype
-            dtype = post_model.get_tensor_datatype(vi.name)
-            new_model.set_tensor_datatype(vi.name, dtype)
-            # data layout
-            data_layout = post_model.get_tensor_layout(vi.name)
-            if data_layout is not None:
-                new_model.set_tensor_layout(vi.name, data_layout)
-            # sparsity
-            sparsity = post_model.get_tensor_sparsity(vi.name)
-            if sparsity is not None:
-                new_model.set_tensor_sparsity(vi.name, sparsity)
-            # graph output should not be part of graph.value_info, so don't insert
-            # if current vi == outp, but the quantization annotation is preserved
-            if vi == outp:
-                continue
-            new_model.graph.value_info.append(vi)
+        for i in init_new:
+            new_model.graph.initializer.append(i)
+        for qa in qa_new:
+            new_model.graph.quantization_annotation.append(qa)
 
         # tidy-up new model
         model = new_model
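
Usage is unchanged; only the internals were simplified. A sketch, assuming the transformation is constructed with the pre-processing model (as the `pre_model` attribute above suggests; checkpoint names hypothetical):

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.merge_onnx_models import MergeONNXModels

pre_model = ModelWrapper("preproc.onnx")   # hypothetical checkpoints
main_model = ModelWrapper("network.onnx")
# prepend pre_model to main_model; all tensors are renamed randomly up
# front, so the two name spaces cannot collide during the merge
merged = main_model.transform(MergeONNXModels(pre_model))
```
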
diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py
index c33281d85449c173a4631297fd1d67ac0aed8c81..8626ef40619b067c6672c9017ddcb747998c3f2c 100644
--- a/src/finn/transformation/streamline/round_thresholds.py
+++ b/src/finn/transformation/streamline/round_thresholds.py
@@ -51,10 +51,20 @@ class RoundAndClipThresholds(Transformation):
                     model.set_tensor_datatype(n.input[1], idtype)
                     graph_modified = True
                 if idtype.is_integer() and not idtype.signed() and (Tnew < 0).any():
-                    # clip any negative thresholds
+                    # clip any negative thresholds if input is unsigned
                     Tnew = np.clip(Tnew, 0, None)
                     model.set_initializer(n.input[1], Tnew)
                     # use same datatype as inputs for thresholds
                     model.set_tensor_datatype(n.input[1], idtype)
                     graph_modified = True
+                if idtype.is_integer() and (
+                    (Tnew < (idtype.min() - 1)).any()
+                    or (Tnew > (idtype.max() + 1)).any()
+                ):
+                    # clip thresholds to the input range, extended by 1 on each side
+                    Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1)
+                    model.set_initializer(n.input[1], Tnew)
+                    # use same datatype as inputs for thresholds
+                    model.set_tensor_datatype(n.input[1], idtype)
+                    graph_modified = True
         return (model, graph_modified)
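
A numpy sketch of the new clipping on a toy threshold row (values illustrative):

```python
import numpy as np
from finn.core.datatype import DataType

idt = DataType.UINT4                # inputs span [0, 15]
T = np.array([[-3.0, 7.0, 250.0]])  # toy thresholds
# negative thresholds are clipped to 0 for unsigned inputs, and anything
# beyond the input range is clipped to the range extended by 1
T = np.clip(T, 0, None)
T = np.clip(T, idt.min() - 1, idt.max() + 1)
print(T)  # [[ 0.  7. 16.]]
```
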
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 6c92e9b2765b1c2be6f95ee148964bccfb3cd7be..cc759bebb1b856a84e25978d442e460332092d23 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -156,13 +156,19 @@ def make_build_dir(prefix=""):
 
 
 def get_by_name(container, name, name_field="name"):
-    """Return item from container by .name field if it exists, None otherwise"""
+    """Return item from container by .name field if it exists, None otherwise.
+    Raises an Exception if multiple items are found, since duplicate names
+    violate the ONNX standard."""
     names = [getattr(x, name_field) for x in container]
-    try:
-        ind = names.index(name)
-        return container[ind]
-    except ValueError:
+
+    inds = [i for i, e in enumerate(names) if e == name]
+    if len(inds) > 1:
+        raise Exception("Found multiple get_by_name matches, undefined behavior")
+    elif len(inds) == 0:
         return None
+    else:
+        ind = inds[0]
+        return container[ind]
 
 
 def remove_by_name(container, name, name_field="name"):
@@ -259,6 +265,33 @@ def pad_tensor_to_multiple_of(ndarray, pad_to_dims, val=0, distr_pad=False):
     return ret
 
 
+def calculate_matvec_accumulator_range(matrix, vec_dt):
+    """Calculate the minimum and maximum possible result (accumulator) values
+    for a dot product x * A, given matrix A of dims (MW, MH), and vector (1, MW)
+    with datatype vec_dt. Returns (acc_min, acc_max).
+    """
+    min_weight = matrix.min()
+    max_weight = matrix.max()
+    receptive_field_elems = matrix.shape[0]
+    min_input = vec_dt.min()
+    max_input = vec_dt.max()
+    # calculate minimum and maximum values of accumulator
+    # assume inputs span the whole range of the input datatype
+    acc_min = receptive_field_elems * min(
+        min_weight * max_input,
+        min_weight * min_input,
+        max_weight * max_input,
+        max_weight * min_input,
+    )
+    acc_max = receptive_field_elems * max(
+        min_weight * max_input,
+        min_weight * min_input,
+        max_weight * max_input,
+        max_weight * min_input,
+    )
+    return (acc_min, acc_max)
+
+
 def gen_finn_dt_tensor(finn_dt, tensor_shape):
     """Generates random tensor in given shape and with given FINN DataType."""
     if type(tensor_shape) == list:
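
The stricter `get_by_name` can be demonstrated directly on an ONNX attribute list; a minimal sketch:

```python
from onnx import helper
from finn.util.basic import get_by_name

attrs = [helper.make_attribute("PE", 4), helper.make_attribute("SIMD", 8)]
print(get_by_name(attrs, "PE").i)     # 4
print(get_by_name(attrs, "missing"))  # None
# duplicate names now raise instead of silently returning the first match
dup = attrs + [helper.make_attribute("PE", 2)]
try:
    get_by_name(dup, "PE")
except Exception as e:
    print(e)  # Found multiple get_by_name matches, undefined behavior
```
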
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index ebca224389550929cebd542cf4201cf62481a169..f931f91c89f738899ff9e6584be81a3b2d542227 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -78,6 +78,7 @@ from finn.transformation.fpgadataflow.annotate_resources import AnnotateResource
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.core.throughput_test import throughput_test_rtlsim
+import warnings
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -317,6 +318,10 @@ def test_end2end_cnv_w1a1_synth_pynq_project():
     )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_cnv_w1a1_synth.onnx")
 
 
diff --git a/tests/end2end/test_end2end_cnv_w2a2.py b/tests/end2end/test_end2end_cnv_w2a2.py
index 2e34990007677ce1b8e0a9ae4a1781d4527ee040..239094a3c931c16b3afe8d1874345e4dc90334ef 100644
--- a/tests/end2end/test_end2end_cnv_w2a2.py
+++ b/tests/end2end/test_end2end_cnv_w2a2.py
@@ -77,6 +77,7 @@ from finn.transformation.fpgadataflow.annotate_resources import AnnotateResource
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.core.throughput_test import throughput_test_rtlsim
+import warnings
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -315,6 +316,10 @@ def test_end2end_cnv_w2a2_synth_pynq_project():
     )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_cnv_w2a2_synth.onnx")
 
 
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py
index b827cbb1c31cc84de9fa5d4df4d6b23e02a02a5f..1a3cc4f1bb9232809e864bb0c784498534f63631 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -79,6 +79,7 @@ from finn.transformation.fpgadataflow.annotate_resources import AnnotateResource
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.core.throughput_test import throughput_test_rtlsim
 import finn.util.vcd as vcd
+import warnings
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -241,11 +242,11 @@ def test_end2end_tfc_w1a1_throughput_test_rtlsim():
     # run through IP-stitched rtlsim with increasing batch sizes and
     # check the number of cycles it takes to execute
     ret = throughput_test_rtlsim(model, 1)
-    assert ret["cycles"] == 205
+    assert np.isclose(ret["cycles"], 205, atol=5)
     ret = throughput_test_rtlsim(model, 10)
-    assert ret["cycles"] == 844
+    assert np.isclose(ret["cycles"], 844, atol=10)
     ret = throughput_test_rtlsim(model, 100)
-    assert ret["cycles"] == 7234
+    assert np.isclose(ret["cycles"], 7234, atol=100)
 
 
 @pytest.mark.vivado
@@ -314,6 +315,10 @@ def test_end2end_tfc_w1a1_synth_pynq_project():
     )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_tfc_w1a1_synth.onnx")
 
 
diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py
index 755650e3d4da6947a93495fd5bbe0464cf485193..0f066cb06c53ce118d0a357fce0999299d7f3305 100644
--- a/tests/end2end/test_end2end_tfc_w1a2.py
+++ b/tests/end2end/test_end2end_tfc_w1a2.py
@@ -74,6 +74,7 @@ from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+import warnings
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -270,6 +271,10 @@ def test_end2end_tfc_w1a2_synth_pynq_project():
     )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_tfc_w1a2_synth.onnx")
 
 
diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py
index 4b2dd9ef01850897d95ede1214f87e9aa5b79f63..6eb613fc877b6e6801140f2a03c3a9509c08c0cb 100644
--- a/tests/end2end/test_end2end_tfc_w2a2.py
+++ b/tests/end2end/test_end2end_tfc_w2a2.py
@@ -74,6 +74,7 @@ from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+import warnings
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -270,6 +271,10 @@ def test_end2end_tfc_w2a2_synth_pynq_project():
     )
     model = model.transform(SynthPYNQProject())
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_tfc_w2a2_synth.onnx")
 
 
diff --git a/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py b/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py
index 25cafcfd4c552fb368cbaca2d1d2714cf2d14011..a272fadc12f095034693e555e4d791e9e73262ab 100644
--- a/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_zynqbuild_end2end_cnv_w1a1.py
@@ -63,6 +63,7 @@ from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+import warnings
 
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
@@ -178,6 +179,10 @@ def test_end2end_zynqbuild_cnv_w1a1_build():
     )
     model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns))
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_zynqbuild_cnv_w1a1_build.onnx")
 
 
diff --git a/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py b/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py
index ff2af70731d9248dd2593db5be9e465fa86157dd..8b298d5644d6d6cda038e8ca1757be7538ba9804 100644
--- a/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_zynqbuild_end2end_tfc_w1a1.py
@@ -64,6 +64,7 @@ from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+import warnings
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -160,6 +161,10 @@ def test_end2end_zynqbuild_tfc_w1a1_build():
     )
     model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns))
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_zynqbuild_tfc_w1a1_build.onnx")
 
 
diff --git a/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py b/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py
index 7b28090854adbbcb6f400f73c2b6f6557f540e5e..bdb24d82dd639abe52aac9688b0b98430f72cabd 100644
--- a/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py
+++ b/tests/end2end/test_zynqbuild_end2end_tfc_w2a2.py
@@ -58,6 +58,7 @@ from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+import warnings
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -149,6 +150,10 @@ def test_end2end_zynqbuild_tfc_w2a2_build():
     )
     model = model.transform(ZynqBuild(test_pynq_board, target_clk_ns))
     model = model.transform(AnnotateResources("synth"))
+    warnings.warn(
+        "Post-synthesis resources (excluding shell): "
+        + model.get_metadata_prop("res_total_synth")
+    )
     model.save(build_dir + "/end2end_zynqbuild_tfc_w2a2_build.onnx")
 
 
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
index d77065ad9396d0cc8dd57a39ed823fffcb30ee47..bd600c6c57d00d5fc03152f75b9f2f8c6beeeb2c 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
@@ -89,7 +89,6 @@ def test_convert_to_hls_layers_tfc_w1a1():
     assert fc3.op_type == "StreamingFCLayer_Batch"
     assert model.get_tensor_shape(fc3.input[0]) == [1, 64]
     assert model.get_tensor_shape(fc3.input[1]) == [64, 10]
-    os.remove(export_onnx_path)
 
     fc0w = getCustomOp(fc0)
     fc0w.set_nodeattr("SIMD", 784)
@@ -123,6 +122,7 @@ def test_convert_to_hls_layers_tfc_w1a1():
     # do forward pass in PyTorch/Brevitas
     expected = tfc.forward(input_tensor).detach().numpy()
     assert np.isclose(produced, expected, atol=1e-3).all()
+    os.remove(export_onnx_path)
 
 
 @pytest.mark.vivado