diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index cf74d6730be6634b273dd0578c59539c564aac62..57661efec4bdc33ed2c73fd94b7b2f43b1256749 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -6,6 +6,7 @@ import numpy as np
 
 from finn.backend.fpgadataflow.utils import numpy_to_hls_code
 from finn.core.datatype import DataType
+from finn.core.utils import interleave_matrix_outer_dim_from_partitions
 from finn.custom_op.fpgadataflow import HLSCustomOp
 
 
@@ -85,6 +86,63 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             ret["TDstI"] = "Slice<%s>" % out_hls_str
         return ret
 
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0 and MW % SIMD == 0
+        * for bipolar {-1,+1} weights, convert to binary {0, 1}
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD) and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = mw * mh // (pe * simd)
+        assert orig_weight_matrix.shape == (mw, mh)
+        assert mw % simd == 0
+        assert mh % pe == 0
+        ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType.BIPOLAR:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        return ret
+
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0
+        * for bipolar weights&inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        tmem = mh // pe
+        assert mh % pe == 0
+        assert orig_thres_matrix.ndim == 2
+        n_thres_steps = orig_thres_matrix.shape[1]
+        inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
+        wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+        if inp_is_bipolar and wt_is_bipolar:
+            assert (orig_thres_matrix >= 0).all()
+        ret = orig_thres_matrix
+        # ensure channels = mh , duplicating if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (mh, 1))
+        assert ret.shape[0] == mh
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        assert ret.shape[0] == pe
+        assert ret.shape[1] == tmem
+        assert ret.shape[2] == n_thres_steps
+        return ret
+
     def execute_node(self, context, graph):
         node = self.onnx_node
         # make temporary directory for generated files
@@ -101,22 +159,38 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # the second input are the weights
             # the third input are the thresholds
             if in_ind == 0:
-                np.save(
-                    os.path.join(self.tmp_dir, "input_{}.npy".format(in_ind)),
-                    context[inputs],
-                )
+                simd = self.get_nodeattr("SIMD")
+                sf = int(self.get_nodeattr("MW") / simd)
+                assert context[inputs].shape == (1, sf, simd)
+                assert str(context[inputs].dtype) == "float32"
+                if self.get_input_datatype() == DataType.BIPOLAR:
+                    # store bipolar activations as binary
+                    np.save(
+                        os.path.join(self.tmp_dir, "input_{}.npy".format(in_ind)),
+                        (context[inputs] + 1) / 2,
+                    )
+                else:
+                    np.save(
+                        os.path.join(self.tmp_dir, "input_{}.npy".format(in_ind)),
+                        context[inputs],
+                    )
                 temp_files.append("{}/input_{}.npy".format(self.tmp_dir, in_ind))
             elif in_ind == 1:
                 weights = context[inputs]
-                # transpose and expand the weights to get the right shape
-                # for the code generation
-                weights = np.expand_dims(weights, 0)
-                weights = numpy_to_hls_code(
-                    weights, self.get_weight_datatype(), "weights", True, True
+                # convert weights into hlslib-compatible format
+                weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType.BIPOLAR:
+                    export_wdt = DataType.BINARY
+                weight_hls_code = numpy_to_hls_code(
+                    weight_tensor, export_wdt, "weights", True, True
                 )
-
                 # write weights into params.h
                 f_weights = open("{}/params.h".format(self.tmp_dir), "w")
+                # TODO fix this for non-1-bit weights, needs FixedPointWeights
+                assert export_wdt.bitwidth() == 1
                 f_weights.write(
                     "static BinaryWeights<{},{},{}> weights = ".format(
                         self.get_nodeattr("SIMD"),
@@ -124,28 +198,41 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                         self.get_nodeattr("WMEM"),
                     )
                 )
-                f_weights.write(weights)
+                f_weights.write(weight_hls_code)
                 f_weights.close()
                 temp_files.append("{}/params.h".format(self.tmp_dir))
 
-            else:
+            elif in_ind == 2:
                 thresholds = context[inputs]
-                thresholds = np.expand_dims(thresholds, 0)
-                thresholds = numpy_to_hls_code(
-                    thresholds, DataType.UINT32, "thresholds", True, True
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                tdt = DataType.INT32
+                # use UINT32 threshold export for bipolar times bipolar
+                inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
+                wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+                if inp_is_bipolar and wt_is_bipolar:
+                    tdt = DataType.UINT32
+                thresholds_hls_code = numpy_to_hls_code(
+                    threshold_tensor, tdt, "thresholds", False, True
                 )
-
                 # write weights into thresh.h
                 f_thresh = open("{}/thresh.h".format(self.tmp_dir), "w")
+                tdt_hls = tdt.get_hls_datatype_str()
+                odt_hls = self.get_output_datatype().get_hls_datatype_str()
                 f_thresh.write(
-                    """static ThresholdsActivation<{},{},1,ap_uint<16>,
-                    ap_uint<1>> threshs = """.format(
-                        self.get_nodeattr("TMEM"), self.get_nodeattr("PE")
+                    "static ThresholdsActivation<{},{},{},{},{},{}> threshs = ".format(
+                        self.get_nodeattr("TMEM"),
+                        self.get_nodeattr("PE"),
+                        threshold_tensor.shape[-1],
+                        tdt_hls,
+                        odt_hls,
+                        self.get_nodeattr("ActVal"),
                     )
                 )
-                f_thresh.write(thresholds)
+                f_thresh.write(thresholds_hls_code)
                 f_thresh.close()
                 temp_files.append("{}/thresh.h".format(self.tmp_dir))
+            else:
+                raise Exception("Unexpected input found for StreamingFCLayer")
 
             in_ind += 1