diff --git a/finn-rtllib/checksum/checksum.cpp b/custom_hls/checksum.cpp
similarity index 100%
rename from finn-rtllib/checksum/checksum.cpp
rename to custom_hls/checksum.cpp
diff --git a/finn-rtllib/checksum/checksum.hpp b/custom_hls/checksum.hpp
similarity index 99%
rename from finn-rtllib/checksum/checksum.hpp
rename to custom_hls/checksum.hpp
index 35f6271d6e154508e17d68c410895056bc4409ae..bf580f31a6228ffd446221ff5c7cd5f29e439837 100644
--- a/finn-rtllib/checksum/checksum.hpp
+++ b/custom_hls/checksum.hpp
@@ -43,7 +43,7 @@
  *	  The provided DefaultSubwordSlicer assumes an `ap_(u)int`-like word
  *	  type with a member `width` and a range-based slicing operator. It
  *	  further assumes a little-endian arrangement of subwords within words
- *	  for the canonical subword stream order. 
+ *	  for the canonical subword stream order.
  *	- Subwords wider than 23 bits are folded using bitwise XOR across
  *	  slices of 23 bits starting from the LSB.
  *	- The folded subword values are weighted according to their position
diff --git a/finn-rtllib/checksum/checksum_tb.sv b/custom_hls/checksum_tb.sv
similarity index 100%
rename from finn-rtllib/checksum/checksum_tb.sv
rename to custom_hls/checksum_tb.sv
diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py
index 0c264fcce6a6bdaaf70ff8ed8d0daa2784dfd9f6..22f9a92bd8cd856561b21659b3f828135b0cd08f 100644
--- a/src/finn/custom_op/fpgadataflow/checksum.py
+++ b/src/finn/custom_op/fpgadataflow/checksum.py
@@ -124,6 +124,9 @@ class checksum(HLSCustomOp):
 
         return normal_ishape
 
+    def get_ap_int_max_w(self):
+        return max(super().get_ap_int_max_w(), 32)
+
     def get_normal_output_shape(self):
         # same shape as input
         return self.get_normal_input_shape()
@@ -132,11 +135,17 @@ class checksum(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
+    def npy_to_dynamic_output(self, context):
+        super().npy_to_dynamic_output(context)
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        output_checksum = np.load("{}/output_checksum.npy".format(code_gen_dir))
+        context[node.output[1]] = output_checksum
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
         inp = context[node.input[0]]
-        exp_shape = self.get_normal_input_shape()
 
         # TODO ensure codegen dir exists
         if mode == "cppsim":
@@ -152,9 +161,9 @@ class checksum(HLSCustomOp):
             )
 
         if mode == "cppsim":
-            output = inp
-            output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
-            context[node.output[0]] = output
+            self.dynamic_input_to_npy(context, 1)
+            self.exec_precompiled_singlenode_model()
+            self.npy_to_dynamic_output(context)
         elif mode == "rtlsim":
             # create a npy file for the input of the node
             assert (
@@ -221,10 +230,30 @@ class checksum(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] = my_defines
 
     def read_npy_data(self):
-        pass
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is not reversed for the input (false flag below)
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
 
     def strm_decl(self):
-        pass
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append("ap_uint<32> chk;")
 
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
@@ -232,10 +261,39 @@ class checksum(HLSCustomOp):
         ]
 
     def dataoutstrm(self):
-        pass
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = tuple(self.get_folded_output_shape())
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            ),
+            "std::vector<unsigned int> checksum(1);",
+            "checksum[0] = chk;",
+            'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");'
+            % code_gen_dir,
+        ]
 
     def save_as_npy(self):
-        pass
+        self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index ed12d2f1af7fcbba019d8896bedfb67ef03847b0..030d1834ffbb59bda1f9473af60aeb1a181af4dc 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -32,6 +32,7 @@ import os
 import subprocess
 from abc import abstractmethod
 
+from finn.core.datatype import DataType
 from finn.custom_op.base import CustomOp
 from finn.util.basic import (
     CppBuilder,
@@ -433,10 +434,22 @@ Found no codegen dir for this node, did you run the prepare_cppsim transformatio
         # assuming dynamic inputs start from 0
         for in_ind in range(count):
             current_input_name = node.input[in_ind]
-            # make copy before saving array
-            input_array = context[current_input_name].copy()
+            input_array = context[current_input_name]
+            if in_ind == 0:
+                expected_inp_shape = self.get_folded_input_shape()
+                idt = self.get_input_datatype()
+            else:
+                expected_inp_shape = self.get_folded_input_shape(in_ind)
+                idt = self.get_input_datatype(in_ind)
+            reshaped_input = input_array.reshape(expected_inp_shape)
+            if idt == DataType["BIPOLAR"]:
+                # store bipolar activations as binary
+                reshaped_input = (reshaped_input + 1) / 2
+            # make copy before saving the array
+            reshaped_input = reshaped_input.copy()
             np.save(
-                os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), input_array
+                os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                reshaped_input,
             )
 
     def npy_to_dynamic_output(self, context):
@@ -445,7 +458,8 @@ Found no codegen dir for this node, did you run the prepare_cppsim transformatio
         node = self.onnx_node
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         output = np.load("{}/output.npy".format(code_gen_dir))
-        context[node.output[0]] = output
+        exp_shape = self.get_normal_output_shape()
+        context[node.output[0]] = output.reshape(exp_shape)
 
     def npy_to_dynamic_outputs(self, context, npy_list):
         """Reads the output from .npy files generated from cppsim and places
@@ -456,7 +470,11 @@ Found no codegen dir for this node, did you run the prepare_cppsim transformatio
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         for i in range(len(npy_list)):
             output = np.load("{}/{}".format(code_gen_dir, npy_list[i]))
-            context[node.output[i]] = output
+            if i == 0:
+                exp_shape = self.get_normal_output_shape()
+            else:
+                exp_shape = self.get_normal_output_shape(i)
+            context[node.output[i]] = output.reshape(exp_shape)
 
     def exec_precompiled_singlenode_model(self):
         """Executes precompiled executable."""
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 975da7b6d2a85bd58879a92ac5a8dc1efb4dabe6..e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -90,13 +90,11 @@ set config_bnnlibdir "$::env(FINN_ROOT)/deps/finn-hlslib"
 puts "finn-hlslib dir: $config_bnnlibdir"
 set config_customhlsdir "$::env(FINN_ROOT)/custom_hls"
 puts "custom HLS dir: $config_customhlsdir"
-set config_customrtldir "$::env(FINN_ROOT)/finn-rtllib/checksum"
-puts "custom RTL dir: $config_customrtldir"
 set config_toplevelfxn "$TOPFXN$"
 set config_clkperiod $CLKPERIOD$
 
 open_project $config_proj_name
-add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++14 -I$config_bnnlibdir -I$config_customhlsdir -I$config_customrtldir"
+add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++14 -I$config_bnnlibdir -I$config_customhlsdir"
 
 set_top $config_toplevelfxn
 open_solution sol1
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 707289d393e2486780aed2c4af336dd3bafd37a6..3acfc7d8b004733131ee997f69aa4ac2aac88577 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -545,12 +545,10 @@ class Thresholding_Batch(HLSCustomOp):
                 out = context[node.output[0]]
                 out = 2 * out - 1
                 context[node.output[0]] = out
+            oshape = self.get_normal_output_shape()
             assert (
-                context[node.output[0]].shape == self.get_folded_output_shape()
+                context[node.output[0]].shape == oshape
             ), """Output shape is not as expected"""
-            # reshape output to have expected shape
-            oshape = self.get_normal_output_shape()
-            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
             nbits = self.get_instream_width()
@@ -691,9 +689,12 @@ class Thresholding_Batch(HLSCustomOp):
                 )
             ]
         elif mem_mode == "decoupled":
+            # note that numReps is set to 1 in the invocation below, since
+            # - for cppsim the repetition comes from the threshold stream reader+input
+            # - for synth the unit runs continuously anyway (ap_ctrl_none)
             self.code_gen_dict["$DOCOMPUTE$"] = [
                 """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
-                (in0, out, weights, numReps);""".format(
+                (in0, out, weights, 1);""".format(
                     "Thresholding_Stream_Batch",
                     total_spatial_size,
                     tmpl_args["TSrcI"],
diff --git a/src/finn/transformation/fpgadataflow/insert_hook.py b/src/finn/transformation/fpgadataflow/insert_hook.py
index 22050c008b2991671b8483404e1a6e8772de691e..c1fce40c574eb58b67e728b78d31454f0c709b78 100644
--- a/src/finn/transformation/fpgadataflow/insert_hook.py
+++ b/src/finn/transformation/fpgadataflow/insert_hook.py
@@ -84,6 +84,9 @@ class InsertHook(Transformation):
                             )
                             # insert checksum node
                             graph.node.insert(node_ind + 1, chk_node)
+                            # insert newly-created tensors
+                            graph.value_info.append(chk_otensor)
+                            graph.value_info.append(chk_result)
 
                             # set chk output tensor as new input tensor of second node
                             if len(consumers) == 1: