diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index ee8d88789e16a6fde77850e742258dfc089c8659..1ed8875e886ea78511f1992d95be4417b3af80df 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -15,7 +15,7 @@ gecho () {
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 635f37d5695a56d7c22f2287030ccb7331ab347b..097ec336ff24cd826e6530c42b7cdb1108971fa1 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -363,7 +363,7 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         # get desired function
         func = self.get_nodeattr("Func")
         if func == "cmp_le":
-            func_str = "std::less_equal"
+            func_str = "comp::less_equal"
         elif func == "cmp_ge":
             func_str = "std::greater_equal"
         elif func == "add":
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 118d668225d3d8f628d51645e8822436e806d097..8868002c9e2cb8726eeb573e104140e3e1a61d27 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -643,6 +643,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # ensure all thresholds are integer
             assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
+        # workaround: vivado_hls gives incorrect results if the first threshold is 0
+        if ret[0][0] == 0:
+            ret = np.copy(ret)
+            ret[0][0] = 1
+            warnings.warn(
+                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
+            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
@@ -846,7 +853,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
-                        "std::less_equal<%s>" % tdt_hls,
+                        "comp::less_equal<%s>" % tdt_hls,
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 48ba434195118b18e6cbe4506522c7cc03ccb758..40221ce3b303fc9c1ec1851c7260c82dc3f0b40a 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -372,7 +372,7 @@ void Thresholding_Stream_Batch(hls::stream<TI> &in,
   // alternatively: number of vertical matrix chunks
   unsigned const NF = NumChannels / PE;
-  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, std::less_equal<TT>> internal_thr;
+  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, comp::less_equal<TT>> internal_thr;
   #pragma HLS ARRAY_PARTITION variable=internal_thr.m_thresholds complete dim=0
   // everything merged into a common iteration space (one "big" loop instead
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index ede3a2e5854ef62370001df218009cea50996ae3..30374a7d97f4d2189e142a9b7b6e44a5abbb46b0 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -334,6 +334,13 @@ class Thresholding_Batch(HLSCustomOp):
             np.mod(orig_thres_matrix, 1), 0
         ).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
+        # workaround: vivado_hls gives incorrect results if the first threshold is 0
+        if ret[0][0] == 0:
+            ret = np.copy(ret)
+            ret[0][0] = 1
+            warnings.warn(
+                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
+            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
@@ -394,7 +401,7 @@ class Thresholding_Batch(HLSCustomOp):
-                    "std::less_equal<%s>" % tdt_hls,
+                    "comp::less_equal<%s>" % tdt_hls,
diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
index c7e6466bc5eb3dd745341a91b1b839609b381315..9a897d9fa16064017dfc02f500d2360ae8431b4a 100644
--- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -278,6 +278,13 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
         ret = orig_thres_matrix
+        # workaround: vivado_hls gives incorrect results if the first threshold is 0
+        if ret[0][0] == 0:
+            ret = np.copy(ret)
+            ret[0][0] = 1
+            warnings.warn(
+                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
+            )
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         assert (
@@ -352,7 +359,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
-                        "std::less_equal<%s>" % tdt_hls,
+                        "comp::less_equal<%s>" % tdt_hls,
@@ -450,11 +457,13 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def defines(self, var):
         dim = self.get_nodeattr("Dim")
         numReps = 1 * dim * dim
+        kernel = self.get_nodeattr("Kernel")
+        innerProdDim = kernel * kernel
         self.code_gen_dict["$DEFINES$"] = [
-            """#define Channels1 {}\n #define Kernel1 {}\n
+            """#define Channels1 {}\n #define InnerProdDim {}\n
             #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
-                self.get_nodeattr("Kernel"),
+                innerProdDim,
@@ -499,7 +508,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             threshs = "threshs"
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<Channels1, Kernel1, SIMD1, PE1, 1, {}, {}, {}>
+            """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
             (in0, out, weights, {}, numReps, {});""".format(
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index 498b8b551cd6bf7e14cbd1754c812191651ca890..0a34751786170a03361d6a17a24c7250c5ce49fd 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -159,6 +159,7 @@ def get_trained_network_and_ishape(topology, wbits, abits):
     topology_to_ishape = {
         "tfc": (1, 1, 28, 28),
+        "lfc": (1, 1, 28, 28),
         "cnv": (1, 3, 32, 32),
     ishape = topology_to_ishape[topology]
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 8ed544e8d0bea98ade57eb08d121afb516279e51..5f54eeacf6b68c019e37762bad9677264e6c234d 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -150,6 +150,27 @@ def fold_tfc(model):
     return model
+def fold_lfc(model):
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # one (PE, SIMD, ram_style) tuple per StreamingFCLayer_Batch node
+    config = [
+        (32, 49, "block"),
+        (64, 32, "auto"),
+        (32, 64, "auto"),
+        (10, 8, "distributed"),
+    ]
+    for fcl, (pe, simd, ramstyle) in zip(fc_layers, config):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+    # set input quantizer PE equal to the first FC layer's SIMD above (49)
+    inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
+    inp_qnt = getCustomOp(inp_qnt_node)
+    inp_qnt.set_nodeattr("PE", 49)
+    return model
 def fold_cnv_large(model):
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # each tuple is (PE, SIMD) for a layer
@@ -208,6 +229,8 @@ def fold_cnv_small(model):
 def get_folding_function(topology, wbits, abits):
     if "tfc" in topology:
         return fold_tfc
+    elif "lfc" in topology:
+        return fold_lfc
     elif "cnv" in topology:
         if wbits == 1 and abits == 1:
             return fold_cnv_large
@@ -284,11 +307,13 @@ def topology2dataset(topology):
 @pytest.mark.parametrize("wbits", [1, 2])
 @pytest.mark.parametrize("abits", [1, 2])
-@pytest.mark.parametrize("topology", ["tfc", "cnv"])
+@pytest.mark.parametrize("topology", ["lfc", "tfc", "cnv"])
 class TestEnd2End:
     def test_export(self, topology, wbits, abits):
         if wbits > abits:
             pytest.skip("No wbits > abits end2end network configs for now")
+        if topology == "lfc" and wbits > 1:
+            pytest.skip("Skipping non-existing lfc configs")
         (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
         chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
         bo.export_finn_onnx(model, ishape, chkpt_name)
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 5d46f4c3db35c159458dfc9e0eb8aae8ee89cb20..bbc7e8227d80fb9d064f484dafe91ecdcdc47144 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -122,6 +122,10 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     odt = act
     n_steps = act.get_num_possible_values() - 1
     T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
+    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
+    # threshold of first channel is zero, while using BIPOLAR output)
+    if act == DataType.BIPOLAR:
+        T[0][0] = 0
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)