diff --git a/.gitignore b/.gitignore
index d7ee7e014a0c175a8a88060f2aa320efeb501ddc..0c1bbd84fe24be46446a7d714dd708d601813e53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,3 +84,10 @@ MANIFEST
 
 # PYNQ board files
 /board_files/
+
+# datasets for testing
+/dataset/
+/data/
+
+# Google Drive key for dashboard
+/gdrive-key/
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 27028a0fa3e64d0396ec8e69ab2ad725eccca75f..fac168d55edd565b1cf84c4d9b556c51feb4e526 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -59,6 +59,7 @@ RUN apt update; apt install nano
 RUN pip install pytest-dependency
 RUN pip install pytest-xdist
 RUN pip install pytest-parallel
+RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
 
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index a84cd7be48578d72cd931672298d5695a5fc8268..89cf2c6747b5adbf89f4dc8563e817965cec3394 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -57,6 +57,7 @@ RUN pip install sphinx_rtd_theme==0.5.0
 RUN pip install pytest-xdist==2.0.0
 RUN pip install pytest-parallel==0.1.0
 RUN pip install netron
+RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
 
 # switch user
 RUN groupadd -g $GID $GNAME
diff --git a/requirements.txt b/requirements.txt
index 4aa1cbe3484a3447851879d7da9ce9d48b066592..ba7bc716b741911820e67f1455aeca4c05e6e005 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 bitstring==3.1.7
 docrep==0.2.7
 future==0.18.2
+gspread==3.6.0
 numpy==1.18.0
 onnx==1.6.0
 onnxruntime==1.2.0
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 15e2a69cd3f61c59287264b76b2229a7da3a3734..85b52c0f33baac609b4dad4df59f8442f737ffc2 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -42,7 +42,7 @@ import finn.analysis.topology as ta
 from finn.util.basic import sanitize_quant_values, get_sanitize_quant_tensors
 
 
-def execute_node(node, context, graph):
+def execute_node(node, context, graph, return_full_exec_context=False):
     """Executes a single node by using onnxruntime, with custom function or
     if dataflow partition by using remote execution or rtlsim.
 
@@ -59,16 +59,21 @@ def execute_node(node, context, graph):
         if old_iname != new_iname:
             inp_ctx[new_iname] = inp_ctx[old_iname]
             del inp_ctx[old_iname]
-        ret = execute_onnx(model, inp_ctx, False)
+        ret = execute_onnx(model, inp_ctx, return_full_exec_context)
         # if the model was in ip-stitched rtlsim mode, may get annotation
         # for numbet of elapsed cycles, save again
         if model.get_metadata_prop("exec_mode") == "rtlsim":
             model.save(sdp_node.get_nodeattr("model"))
         # output may have been renamed in partition
-        assert len(ret) == 1
+        assert len(model.graph.output) == 1
         node_oname = node.output[0]
         model_oname = model.graph.output[0].name
         context[node_oname] = ret[model_oname]
+        # prefix and insert exec context entries
+        if return_full_exec_context:
+            for tname in ret.keys():
+                if tname != model_oname:
+                    context[node.name + "_" + tname] = ret[tname]
     else:
         if node.domain == "finn":
 
@@ -198,7 +203,7 @@ def execute_onnx(
                 execution_context = sanitize_quant_values(
                     model, node.input, execution_context
                 )
-            execute_node(node, execution_context, graph)
+            execute_node(node, execution_context, graph, return_full_exec_context)
             if get_sanitize_quant_tensors() != 0:
                 # round output values to quantization annotation
                 execution_context = sanitize_quant_values(
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index c6598a30ec3c1b50bdd5c532cefc071d422c40ab..6e206d2058076802a48b69f4c69cccf744489f31 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -34,6 +34,7 @@ from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.util.basic import roundup_to_integer_multiple
 
 
 class LabelSelect_Batch(HLSCustomOp):
@@ -46,6 +47,10 @@ class LabelSelect_Batch(HLSCustomOp):
             # If not provided compute min size
             labels = self.get_nodeattr("Labels")
             odt = DataType.get_smallest_possible(labels - 1)
+            # ensure a datatype divisible by 8-bits in case this is the last node
+            bw = roundup_to_integer_multiple(odt.bitwidth(), 8)
+            new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw))
+            odt = DataType[new_odt_name]
             odt_name = odt.name
             self.set_nodeattr("outputDataType", odt_name)
 
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 562bab0f18990096f7364b3a4e2bcbbbf4ce2b58..2429bf6190f822fb4a6c988fcbb34152d5a338e0 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -73,6 +73,8 @@ class Thresholding_Batch(HLSCustomOp):
             # [4] is four vectors (like a FC layer with batch=4)
             # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
             "numInputVectors": ("ints", False, [1]),
+            # initialization value for the thresholding accumulator
+            "ActVal": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -321,7 +323,7 @@ class Thresholding_Batch(HLSCustomOp):
                 threshold_tensor.shape[-1],
                 tdt_hls,
                 odt_hls,
-                export_odt.min(),
+                self.get_nodeattr("ActVal"),
                 "std::less_equal<%s>" % tdt_hls,
             )
         )
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index c6edd6104e48e88a9233777a29e41b60fbb588ca..d4d5b006493b8db1da0184e98ba35493d3e6ccbd 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -820,7 +820,19 @@ class InferThresholdingLayer(Transformation):
                 assert ifc % pe == 0, "Requirement IFC divisable by PE is violated."
 
                 odt = model.get_tensor_datatype(thl_output)
-                # create and insert new StreamingFCLayer node
+                scale = getCustomOp(node).get_nodeattr("out_scale")
+                assert (
+                    scale == 1.0
+                ), "MultiThreshold out_scale must be equal to 1.0 for HLS conversion."
+                actval = getCustomOp(node).get_nodeattr("out_bias")
+                assert (
+                    int(actval) == actval
+                ), "MultiThreshold out_bias must be integer for HLS conversion."
+                actval = int(actval)
+                assert (not odt.signed()) or (
+                    actval < 0
+                ), "Signed output requres actval < 0"
+                # create and insert new Thresholding_Batch node
                 new_node = helper.make_node(
                     "Thresholding_Batch",
                     [thl_input, thl_threshold],
@@ -832,6 +844,7 @@ class InferThresholdingLayer(Transformation):
                     inputDataType=idt.name,
                     outputDataType=odt.name,
                     numInputVectors=list(thl_in_shape[:-1]),
+                    ActVal=actval,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 334507affba51e948bc5a907af3003152821a3f9..0def25d8429f5d3f6c02a9db656650bc1baba6ee 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -54,7 +54,7 @@ class CreateStitchedIP(Transformation):
     The packaged block design IP can be found under the ip subdirectory.
     """
 
-    def __init__(self, fpgapart, clk_ns=10.0, ip_name="finn_design", vitis=False):
+    def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False):
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 0e50213ee6feee5f45c18f87cb31a5faf5fb1c50..813b40698d1beec54e6ba3fa5344a8d0bb715a00 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -124,6 +124,13 @@ class MakePYNQDriver(Transformation):
 
         with open(driver_py, "w") as f:
             f.write(driver)
+
+        # add validate.py to run full top-1 test (only for suitable networks)
+        validate_py = pynq_driver_dir + "/validate.py"
+        validate_src = templates.pynq_validation_template
+        with open(validate_py, "w") as f:
+            f.write(validate_src)
+
         # copy all the dependencies into the driver folder
         shutil.copytree(
             get_finn_root() + "/src/finn/util", pynq_driver_dir + "/finn/util"
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 66580c70d23a2d2b19bfe8d94fefcd39a3208bcb..2b3789dc21cb4fe3f62d6b2a2ea0888329c9db66 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -436,3 +436,55 @@ open_project $VITIS_PROJ_PATH$/_x/link/vivado/vpl/prj/prj.xpr
 open_run impl_1
 report_utilization -hierarchical -hierarchical_depth 5 -file $VITIS_PROJ_PATH$/synth_report.xml -format xml
 """
+
+pynq_validation_template = """
+import argparse
+from driver import FINNAccelDriver
+import numpy as np
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(description='Validate top-1 accuracy for FINN accelerator')
+  parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=100)
+  parser.add_argument('--dataset', help='dataset to use (mnist or cifar10)', required=True)
+  # parse arguments
+  args = parser.parse_args()
+  bsize = args.batchsize
+  dataset = args.dataset
+
+  if dataset == "mnist":
+    from dataset_loading import mnist
+    trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data("/tmp", download=True, one_hot=False)
+  elif dataset == "cifar10":
+    from dataset_loading import cifar
+    trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data("/tmp", download=True, one_hot=False)
+  else:
+    raise Exception("Unrecognized dataset")
+
+  test_imgs = testx
+  test_labels = testy
+
+  ok = 0
+  nok = 0
+  total = test_imgs.shape[0]
+  driver = FINNAccelDriver(bsize, "resizer.bit", "zynq-iodma")
+
+  n_batches = int(total / bsize)
+
+  test_imgs = test_imgs.reshape(n_batches, bsize, -1)
+  test_labels = test_labels.reshape(n_batches, bsize)
+
+  for i in range(n_batches):
+    ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
+    exp = test_labels[i]
+    driver.copy_input_data_to_device(ibuf_normal)
+    driver.execute()
+    obuf_normal = np.empty_like(driver.obuf_packed_device)
+    driver.copy_output_data_from_device(obuf_normal)
+    ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2)  # always [n_wrong, n_right]
+    nok += ret[0]
+    ok += ret[1]
+    print("batch %d / %d : total OK %d NOK %d" % (i, n_batches, ok, nok))
+
+  acc = 100.0 * ok / (total)
+  print("Final accuracy: %f" % acc)
+"""
diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py
index e7a6b88239a1735d5379e165333f8356ae6f88a1..d07162fa049bd016e91b8c5b01ea56eda6267655 100644
--- a/src/finn/transformation/infer_data_layouts.py
+++ b/src/finn/transformation/infer_data_layouts.py
@@ -75,6 +75,17 @@ def _infer_node_data_layout(model, node):
             inp_layout = model.get_tensor_layout(node.input[0])
             out_layout = [inp_layout[i] for i in perm]
             model.set_tensor_layout(node.output[0], out_layout)
+        elif node.op_type == "Unsqueeze":
+            inp_layout = model.get_tensor_layout(node.input[0])
+            # add dummy dimension at the output
+            out_layout = inp_layout + ["x"]
+            model.set_tensor_layout(node.output[0], out_layout)
+        elif node.op_type == "Squeeze":
+            inp_layout = model.get_tensor_layout(node.input[0])
+            assert inp_layout[-1] == "x"
+            # remove dummy dimension
+            out_layout = inp_layout[:-1]
+            model.set_tensor_layout(node.output[0], out_layout)
         else:
             # try to guess based on number of output dims
             for o in node.output:
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 8398a277443530e84632d26fbfca6d90ea4b0b9e..0f2c5525d91263b44002677b505087d38408333a 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -424,8 +424,9 @@ class AbsorbTransposeIntoFlatten(Transformation):
         return (model, graph_modified)
 
 
-class AbsorbScalarMulIntoTopK(Transformation):
-    """Absorb a mul node into a suceeding topk node if the mul is scalar."""
+class AbsorbScalarMulAddIntoTopK(Transformation):
+    """Remove mul/add node prior to topk node if the op is scalar. Note that
+    the TopK output probabilities will change, but the indices won't."""
 
     def apply(self, model):
         graph = model.graph
@@ -435,14 +436,17 @@ class AbsorbScalarMulIntoTopK(Transformation):
             node_ind += 1
             if n.op_type == "TopK":
                 prod = model.find_producer(n.input[0])
-                if prod is not None and prod.op_type == "Mul":
+                if prod is not None and (prod.op_type in ["Mul", "Add"]):
                     prod_input = prod.input[0]
                     param_name = prod.input[1]
                     A = model.get_initializer(param_name)
                     if A is None:
                         warnings.warn("Param is not constant, skipping")
                         continue
-                    if all(x == 1 for x in A.shape) and A > 0:
+                    is_scalar = all(x == 1 for x in A.shape)
+                    is_scalar_pos_mul = is_scalar and (prod.op_type == "Mul") and A > 0
+                    is_scalar_add = is_scalar and (prod.op_type == "Add")
+                    if is_scalar_pos_mul or is_scalar_add:
                         # if the mul is scalar and positive, we can just delete the
                         # mul node and rewire the top k node. Because the top k node
                         # works with probabilities and their relation to each other
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index b47f269dd6f2671c3d98c9316954483c0e72f14f..f4c1dc1306b67e5807c25cfb08c961729dbfbdf6 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -533,7 +533,7 @@ class MoveScalarLinearPastInvariants(Transformation):
                 if prod0 is None:
                     continue
 
-                if prod0.op_type == "Mul" or prod0.op_type == "Add":
+                if prod0.op_type in ["Mul", "Add", "Div"]:
                     # check if second input of producer is an initializer
                     init0 = model.get_initializer(prod0.input[1])
                     # if either initializer is None, skip
diff --git a/src/finn/util/gdrive.py b/src/finn/util/gdrive.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d9b89e354e42849a82b563fe391b9f6e603f4e
--- /dev/null
+++ b/src/finn/util/gdrive.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import gspread
+import os
+import warnings
+from datetime import datetime
+
+
+def upload_to_end2end_dashboard(data_dict):  # adds data_dict as one sheet row
+    gdrive_key = "/workspace/finn/gdrive-key/service_account.json"
+    if not os.path.isfile(gdrive_key):  # best-effort: warn and do nothing
+        warnings.warn("Google Drive key not found, skipping dashboard upload")
+        return
+    gc = gspread.service_account(filename=gdrive_key)
+    spreadsheet = gc.open("finn-end2end-dashboard")
+    worksheet = spreadsheet.get_worksheet(0)  # first worksheet of the file
+    keys = list(data_dict.keys())
+    vals = list(data_dict.values())
+    # row 1 of that worksheet must equal data_dict's keys, in the same order
+    existing_keys = worksheet.row_values(1)
+    if existing_keys != keys:
+        # otherwise start a timestamped worksheet, placed first (index=0)
+        dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        worksheet = spreadsheet.add_worksheet(
+            title="Dashboard " + dtstr, rows=10, cols=len(keys), index=0
+        )
+        # write the keys into row 1 as the new header
+        worksheet.update("A1:1", [keys])
+        # freeze and make header bold
+        worksheet.freeze(rows=1)
+        worksheet.format("A1:1", {"textFormat": {"bold": True}})
+    # insert an empty row 2, then fill it, so the latest entry follows the header
+    worksheet.insert_row([], index=2)
+    worksheet.update("A2:2", [vals])
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index 3cd4248c5fbf438ac7dd7974adb38d251d389a07..32c6a0a3a3bb19b95590181dbe447e82cf9966a2 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -77,6 +77,11 @@ def get_test_model_untrained(netname, wbits, abits):
     return get_test_model(netname, wbits, abits, pretrained=False)
 
 
+def get_topk(vec, k):
+    "Indices of the k largest entries of vec (flattened first), largest first."
+    return vec.flatten().argsort()[::-1][:k]
+
+
 def soft_verify_topk(invec, idxvec, k):
     """Check that the topK indices provided actually point to the topK largest
     values in the input vector"""
@@ -140,7 +145,6 @@ def get_example_input(topology):
     elif topology == "cnv":
         fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
         input_tensor = np.load(fn)["arr_0"].astype(np.float32)
-        input_tensor = input_tensor / 255
         return input_tensor
     else:
         raise Exception("Unknown topology, can't return example input")
@@ -158,7 +162,7 @@ def get_trained_network_and_ishape(topology, wbits, abits):
     return (model, ishape)
 
 
-def execute_parent(parent_path, child_path, input_tensor_npy):
+def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=False):
     """Execute parent model containing a single StreamingDataflowPartition by
     replacing it with the model at child_path and return result."""
 
@@ -169,5 +173,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy):
     sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", child_path)
     ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
-    y = ret[oname]
-    return y
+    if return_full_ctx:
+        return ret
+    else:
+        return ret[oname]
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 29ecb2c7e49444cecade6d3321aaba3b9add4b9c..4eed1a260974e4f842e9e93756caff135c5fbdde 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -63,12 +63,16 @@ from finn.util.test import (
     get_example_input,
     get_trained_network_and_ishape,
     execute_parent,
+    get_topk,
 )
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
 from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+from finn.transformation.streamline.reorder import (
+    MakeMaxPoolNHWC,
+    MoveScalarLinearPastInvariants,
+)
 import warnings
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
@@ -84,6 +88,15 @@ from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.core.modelwrapper import ModelWrapper
 from scipy.stats import linregress
 from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim
+from finn.util.pytorch import ToTensor
+from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.transformation.insert_topk import InsertTopK
+from finn.core.datatype import DataType
+from dataset_loading import mnist, cifar
+from datetime import datetime
+import subprocess
+from finn.util.gdrive import upload_to_end2end_dashboard
+from collections import OrderedDict
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 target_clk_ns = 10
@@ -95,6 +108,24 @@ def get_checkpoint_name(topology, wbits, abits, step):
     return build_dir + "/end2end_%s_w%da%d_%s.onnx" % (topology, wbits, abits, step)
 
 
+def get_dashboard_data(topology, wbits, abits):
+    stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits)
+    if not os.path.isfile(stats_file):
+        return OrderedDict()  # nothing recorded yet for this network
+    with open(stats_file, "r") as f:
+        stats_dict_txt = f.read()
+    # NOTE(review): eval() executes the file text; keep this file trusted
+    return eval(stats_dict_txt)
+
+
+def update_dashboard_data(topology, wbits, abits, key, val):
+    out_path = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits)
+    entries = get_dashboard_data(topology, wbits, abits)  # empty if no file yet
+    entries[key] = val
+    with open(out_path, "w") as out_file:
+        out_file.write(str(entries))  # stored as the dict's str() form
+
+
 def fold_tfc(model):
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
@@ -111,6 +142,10 @@ def fold_tfc(model):
         fcl_inst.set_nodeattr("inFIFODepth", ififo)
         fcl_inst.set_nodeattr("outFIFODepth", ofifo)
         fcl_inst.set_nodeattr("ram_style", ramstyle)
+    # set parallelism for input quantizer to be same as first layer's SIMD
+    inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
+    inp_qnt = getCustomOp(inp_qnt_node)
+    inp_qnt.set_nodeattr("PE", 49)
     return model
 
 
@@ -187,14 +222,71 @@ def get_folding_function(topology, wbits, abits):
         raise Exception("Unknown topology/quantization combo for predefined folding")
 
 
-def get_golden_io_pair(topology, wbits, abits):
+def get_golden_io_pair(topology, wbits, abits, preproc=ToTensor(), return_topk=None):
     (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
     input_tensor_npy = get_example_input(topology)
     input_tensor_torch = torch.from_numpy(input_tensor_npy).float()
+    if preproc is not None:
+        input_tensor_torch = preproc.forward(input_tensor_torch).detach()
     output_tensor_npy = model.forward(input_tensor_torch).detach().numpy()
+    if return_topk is not None:
+        output_tensor_npy = get_topk(output_tensor_npy, k=return_topk)
     return (input_tensor_npy, output_tensor_npy)
 
 
+def measure_top1_accuracy(model_chkpt, dataset, parent_chkpt=None):
+    if dataset == "cifar10":  # only the test split (testx/testy) is used below
+        trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(
+            "/workspace/finn/dataset", download=True, one_hot=False
+        )
+    elif dataset == "mnist":
+        trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(
+            "/workspace/finn/dataset", download=True, one_hot=False
+        )
+    else:
+        raise Exception("Unrecognized dataset")
+    # move from dataset_loader layout to ONNX layout: NHWC -> NCHW
+    testx = testx.transpose(0, 3, 1, 2)
+    model = ModelWrapper(model_chkpt)
+    iname = model.graph.input[0].name
+    oname = model.graph.output[0].name
+    if parent_chkpt is None:  # standalone: reshape inputs to the model's input
+        ishape = model.get_tensor_shape(iname)
+    else:  # with a parent: use the parent graph's input shape instead
+        parent_model = ModelWrapper(parent_chkpt)
+        parent_iname = parent_model.graph.input[0].name
+        ishape = parent_model.get_tensor_shape(parent_iname)
+    ok = 0
+    nok = 0
+    n_batches = testx.shape[0]  # inference runs one test sample at a time
+    for i in range(n_batches):
+        tdata = testx[i].reshape(ishape).astype(np.float32)
+        exp = testy[i].item()  # expected label
+        if parent_chkpt is not None:
+            y = execute_parent(parent_chkpt, model_chkpt, tdata)
+        else:
+            y = execute_onnx(model, {iname: tdata}, False)[oname]
+        ret = y.item()  # predicted label; needs a single-element output
+        if ret == exp:
+            ok += 1
+        else:
+            nok += 1
+        if i % 10 == 0:  # progress report every 10th sample
+            print("%d : OK %d NOK %d " % (i, ok, nok))
+    acc_top1 = ok * 100.0 / (ok + nok)  # top-1 accuracy in percent
+    warnings.warn("Final OK %d NOK %d top-1 %f" % (ok, nok, acc_top1))
+    return acc_top1
+
+
+def topology2dataset(topology):
+    "Map a topology name to its dataset: *fc* -> mnist, *cnv* -> cifar10."
+    # substring match, tried in this order; first hit wins
+    for tag, dataset in (("fc", "mnist"), ("cnv", "cifar10")):
+        if tag in topology:
+            return dataset
+    raise Exception("Unrecognized topology")
+
+
 @pytest.mark.parametrize("wbits", [1, 2])
 @pytest.mark.parametrize("abits", [1, 2])
 @pytest.mark.parametrize("topology", ["tfc", "cnv"])
@@ -205,6 +297,15 @@ class TestEnd2End:
         (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
         chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
         bo.export_finn_onnx(model, ishape, chkpt_name)
+        nname = "%s_w%da%d" % (topology, wbits, abits)
+        update_dashboard_data(topology, wbits, abits, "network", nname)
+        dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        update_dashboard_data(topology, wbits, abits, "datetime", dtstr)
+        finn_commit = subprocess.check_output(
+            ["git", "rev-parse", "HEAD"], cwd="/workspace/finn"
+        )
+        finn_commit = finn_commit.decode("utf-8").strip()
+        update_dashboard_data(topology, wbits, abits, "finn-commit", finn_commit)
         assert os.path.isfile(chkpt_name)
 
     def test_import_and_tidy(self, topology, wbits, abits):
@@ -216,11 +317,43 @@ class TestEnd2End:
         model = model.transform(GiveReadableTensorNames())
         model = model.transform(InferDataTypes())
         model = model.transform(RemoveStaticGraphInputs())
-        model.save(get_checkpoint_name(topology, wbits, abits, "import_and_tidy"))
+        chkpt = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
+        model.save(chkpt)
 
-    def test_streamline(self, topology, wbits, abits):
+    def test_add_pre_and_postproc(self, topology, wbits, abits):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        global_inp_name = model.graph.input[0].name
+        ishape = model.get_tensor_shape(global_inp_name)
+        # preprocessing: torchvision's ToTensor divides uint8 inputs by 255
+        totensor_pyt = ToTensor()
+        chkpt_preproc_name = get_checkpoint_name(topology, wbits, abits, "preproc")
+        bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)
+        assert os.path.isfile(chkpt_preproc_name)
+        # join preprocessing and core model
+        pre_model = ModelWrapper(chkpt_preproc_name)
+        model = model.transform(MergeONNXModels(pre_model))
+        # add input quantization annotation: UINT8 for all BNN-PYNQ models
+        global_inp_name = model.graph.input[0].name
+        model.set_tensor_datatype(global_inp_name, DataType.UINT8)
+        # postprocessing: insert Top-1 node at the end
+        model = model.transform(InsertTopK(k=1))
+        chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        # tidy-up again
+        model = model.transform(InferShapes())
+        model = model.transform(FoldConstants())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveReadableTensorNames())
+        model = model.transform(InferDataTypes())
+        model = model.transform(RemoveStaticGraphInputs())
+        model.save(chkpt_name)
+        assert os.path.isfile(chkpt_name)
+
+    def test_streamline(self, topology, wbits, abits):
+        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        # move past any reshapes to be able to streamline input scaling
+        model = model.transform(MoveScalarLinearPastInvariants())
         model = model.transform(Streamline())
         if "fc" not in topology:
             model = model.transform(LowerConvsToMatMul())
@@ -228,6 +361,9 @@ class TestEnd2End:
             model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
         model = model.transform(ConvertBipolarMatMulToXnorPopcount())
         model = model.transform(Streamline())
+        # absorb final add-mul nodes into TopK
+        model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
+        model = model.transform(InferDataLayouts())
         model = model.transform(RemoveUnusedTensors())
         model.save(get_checkpoint_name(topology, wbits, abits, "streamline"))
 
@@ -238,11 +374,17 @@ class TestEnd2End:
         model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
         # needed for non-bipolar MatMul layers
         model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+        # TopK to LabelSelect
+        model = model.transform(to_hls.InferLabelSelectLayer())
+        # input quantization (if any) to standalone thresholding
+        model = model.transform(to_hls.InferThresholdingLayer())
         # needed for convolutions
         if "fc" not in topology:
             model = model.transform(to_hls.InferConvInpGen())
             model = model.transform(to_hls.InferStreamingMaxPool())
             model = model.transform(RemoveCNVtoFCFlatten())
+        # get rid of Transpose -> Transpose identity seq
+        model = model.transform(absorb.AbsorbConsecutiveTransposes())
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(InferDataLayouts())
         model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers"))
@@ -285,7 +427,7 @@ class TestEnd2End:
         model.save(cppsim_chkpt)
         parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
-            topology, wbits, abits
+            topology, wbits, abits, return_topk=1
         )
         y = execute_parent(parent_chkpt, cppsim_chkpt, input_tensor_npy)
         assert np.isclose(y, output_tensor_npy).all()
@@ -294,6 +436,8 @@ class TestEnd2End:
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
     def test_ipgen(self, topology, wbits, abits, kind):
+        if kind == "alveo" and ("VITIS_PATH" not in os.environ):
+            pytest.skip("VITIS_PATH not set")
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
@@ -304,7 +448,7 @@ class TestEnd2End:
 
     @pytest.mark.slow
     @pytest.mark.vivado
-    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    @pytest.mark.parametrize("kind", ["zynq"])
     def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
@@ -326,23 +470,32 @@ class TestEnd2End:
                 "rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits)
             )
             os.environ["RTLSIM_TRACE_DEPTH"] = "3"
-        rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind)
+        rtlsim_chkpt = get_checkpoint_name(
+            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+        )
         model.save(rtlsim_chkpt)
         parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
-            topology, wbits, abits
+            topology, wbits, abits, return_topk=1
         )
         y = execute_parent(parent_chkpt, rtlsim_chkpt, input_tensor_npy)
         model = ModelWrapper(rtlsim_chkpt)
         perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim")
-        warnings.warn("Estimated & rtlsim performance: " + str(perf))
+        # warnings.warn("Estimated & rtlsim performance: " + str(perf))
+        # for (k, v) in perf.items():
+        #    update_dashboard_data(topology, wbits, abits, k, v)
+        update_dashboard_data(
+            topology, wbits, abits, "cycles_rtlsim", perf["cycles_rtlsim"]
+        )
         assert np.isclose(y, output_tensor_npy).all()
 
     @pytest.mark.slow
     @pytest.mark.vivado
-    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    @pytest.mark.parametrize("kind", ["zynq"])
     def test_throughput_rtlsim(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind)
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         n_nodes = len(model.graph.node)
         perf_est = model.analysis(dataflow_performance)
@@ -354,6 +507,25 @@ class TestEnd2End:
         est_cycles = latency + cycles_per_sample_est * batchsize
         assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15
 
+    @pytest.mark.slow
+    @pytest.mark.vivado
+    @pytest.mark.parametrize("kind", ["zynq"])
+    def test_validate_top1(self, topology, wbits, abits, kind):
+        if "TEST_END2END_VALIDATE_TOP1" not in os.environ:
+            pytest.skip("TEST_END2END_VALIDATE_TOP1 not set")
+        prepostproc_chkpt = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        streamline_chkpt = get_checkpoint_name(topology, wbits, abits, "streamline")
+        parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
+        cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim")
+        rtlsim_chkpt = get_checkpoint_name(
+            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+        )
+        dataset = topology2dataset(topology)
+        assert measure_top1_accuracy(prepostproc_chkpt, dataset) > 80
+        assert measure_top1_accuracy(streamline_chkpt, dataset) > 80
+        assert measure_top1_accuracy(cppsim_chkpt, dataset, parent_chkpt) > 80
+        assert measure_top1_accuracy(rtlsim_chkpt, dataset, parent_chkpt) > 80
+
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.vitis
@@ -366,14 +538,10 @@ class TestEnd2End:
         cfg = get_build_env(kind, target_clk_ns)
         model = model.transform(cfg["build_fxn"])
         model = model.transform(AnnotateResources("synth"))
-        warnings.warn(
-            "Post-synthesis resources (excluding shell): "
-            + model.get_metadata_prop("res_total_synth")
-        )
-        warnings.warn(
-            "Post-synthesis resources (all inclusive): "
-            + model.get_metadata_prop("res_total_top_synth")
-        )
+        synth_dct = eval(model.get_metadata_prop("res_total_top_synth"))
+        for (k, v) in synth_dct.items():
+            update_dashboard_data(topology, wbits, abits, k, v)
+        update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
         model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind))
 
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
@@ -396,14 +564,14 @@ class TestEnd2End:
         model.save(get_checkpoint_name(topology, wbits, abits, "deploy_" + kind))
 
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_run_on_pynq(self, topology, wbits, abits, kind):
+    def test_run_on_hw(self, topology, wbits, abits, kind):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "deploy_" + kind)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)  # NOQA
         cfg = get_build_env(kind, target_clk_ns)
         if cfg["ip"] == "":
             pytest.skip("PYNQ board IP address not specified")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
-            topology, wbits, abits
+            topology, wbits, abits, return_topk=1
         )
         parent_model = load_test_checkpoint_or_skip(
             get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
@@ -412,9 +580,7 @@ class TestEnd2End:
         oname = parent_model.graph.output[0].name
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
         sdp_node = getCustomOp(sdp_node)
-        sdp_chkpt = get_checkpoint_name(topology, wbits, abits, "deploy")
-        load_test_checkpoint_or_skip(sdp_chkpt)
-        sdp_node.set_nodeattr("model", sdp_chkpt)
+        sdp_node.set_nodeattr("model", prev_chkpt_name)
         ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
         y = ret[oname]
         assert np.isclose(y, output_tensor_npy).all()
@@ -465,3 +631,21 @@ class TestEnd2End:
             )
         ret_str += "\n" + "-----------------------------"
         warnings.warn(ret_str)
+        largest_bsize = bsize_range[-1]
+        update_dashboard_data(
+            topology, wbits, abits, "fclk[mhz]", ret[largest_bsize]["fclk[mhz]"]
+        )
+        update_dashboard_data(
+            topology,
+            wbits,
+            abits,
+            "throughput[images/s]",
+            ret[largest_bsize]["throughput[images/s]"],
+        )
+
+    def test_upload_results_to_dashboard(self, topology, wbits, abits):
+        dashboard_data = get_dashboard_data(topology, wbits, abits)
+        if len(dashboard_data.keys()) > 0:
+            upload_to_end2end_dashboard(dashboard_data)
+        else:
+            pytest.skip("No data to upload to dashboard")
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
index 3c8da5de1d8629b3692646e1aa18120ffcc30b99..86875d2ac7f37e697c5de198e15aa3045a9e3d42 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -52,7 +52,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.streamline.absorb import (
-    AbsorbScalarMulIntoTopK,
+    AbsorbScalarMulAddIntoTopK,
     AbsorbConsecutiveTransposes,
 )
 from finn.transformation.streamline.collapse_repeated import (
@@ -192,7 +192,7 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt):
     model = model.transform(to_hls.InferGlobalAccPoolLayer())
     model = model.transform(MoveScalarLinearPastInvariants())
     model = model.transform(InsertTopK())
-    model = model.transform(AbsorbScalarMulIntoTopK())
+    model = model.transform(AbsorbScalarMulAddIntoTopK())
     model = model.transform(InferDataTypes())
     model = model.transform(to_hls.InferLabelSelectLayer())
     model = model.transform(AbsorbConsecutiveTransposes())
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index ad69b89d8a159f6eb423c0739bdb4c0dd5103792..75fa625ff00ad6d367e2d6c94d98705f391fb9be 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -48,7 +48,7 @@ from finn.custom_op.registry import getCustomOp
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
-def make_single_thresholding_modelwrapper(T, pe, idt, odt):
+def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
     NumChannels = T.shape[0]
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels])
@@ -66,6 +66,7 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt):
         PE=pe,
         inputDataType=idt.name,
         outputDataType=odt.name,
+        ActVal=actval,
     )
     graph = helper.make_graph(
         nodes=[Thresholding_node],
@@ -112,7 +113,12 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)
 
-    model = make_single_thresholding_modelwrapper(T, pe, idt, odt)
+    if odt == DataType.BIPOLAR:
+        actval = 0
+    else:
+        actval = odt.min()
+
+    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval)
 
     if exec_mode == "cppsim":
         model = model.transform(PrepareCppSim())
diff --git a/tests/transformation/test_absorb_mul_into_topk.py b/tests/transformation/test_absorb_mul_into_topk.py
index 1394220f7c336ccea8fe9c494734c4175bf2e847..d0a089f9e5f894a5da635672eb58af1d8ddef3ef 100644
--- a/tests/transformation/test_absorb_mul_into_topk.py
+++ b/tests/transformation/test_absorb_mul_into_topk.py
@@ -35,7 +35,7 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
 from finn.transformation.insert_topk import InsertTopK
-from finn.transformation.streamline.absorb import AbsorbScalarMulIntoTopK
+from finn.transformation.streamline.absorb import AbsorbScalarMulAddIntoTopK
 import finn.core.onnx_exec as oxe
 
 # parameter to indicate if mul parameter is negative or positive
@@ -49,20 +49,24 @@ def test_absorb_mul_into_topk(mul_positive, scalar):
         shape = [1, 1, 1, 1000]
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 1, 1, 1000])
     a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, shape)
+    b0 = helper.make_tensor_value_info("b0", TensorProto.FLOAT, [1, 1, 1, 1000])
+    c0 = helper.make_tensor_value_info("c0", TensorProto.FLOAT, shape)
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, 1000])
 
-    mul_node = helper.make_node("Mul", ["inp", "a0"], ["outp"])
+    mul_node = helper.make_node("Mul", ["inp", "a0"], ["b0"])
+    add_node = helper.make_node("Add", ["b0", "c0"], ["outp"])
     mul_graph = helper.make_graph(
-        nodes=[mul_node],
+        nodes=[mul_node, add_node],
         name="mul-graph",
         inputs=[inp],
         outputs=[outp],
-        value_info=[a0],
+        value_info=[a0, b0, c0],
     )
 
     model = helper.make_model(mul_graph, producer_name="mul_model")
     model = ModelWrapper(model)
     # initialize values
+    # for mul
     if mul_positive is True:
         a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype(
             np.float32
@@ -72,12 +76,17 @@ def test_absorb_mul_into_topk(mul_positive, scalar):
             np.float32
         )
     model.set_initializer("a0", a0_values)
+    # for add
+    c0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(
+        np.float32
+    )
+    model.set_initializer("c0", c0_values)
     model = model.transform(InsertTopK())
     model = model.transform(InferShapes())
     model = model.transform(InferDataTypes())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
-    model_transformed = model.transform(AbsorbScalarMulIntoTopK())
+    model_transformed = model.transform(AbsorbScalarMulAddIntoTopK())
 
     # compare execution results
     inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype(
@@ -100,9 +109,5 @@ def test_absorb_mul_into_topk(mul_positive, scalar):
 
         # check for new order
         assert model.graph != model_transformed.graph
-        assert len(model.graph.node) - 1 == len(model_transformed.graph.node)
+        assert len(model.graph.node) - 2 == len(model_transformed.graph.node)
         assert model_transformed.graph.node[0].op_type == "TopK"
-
-    else:
-        assert (y_values == y_tr_values).all()
-        assert model.graph == model_transformed.graph