diff --git a/fetch-repos.sh b/fetch-repos.sh
index 86a2176c7549ae1debcae6365137ac30374de7cb..1e01a058ff368dfe227fdf33bd2d2866aa4f13c9 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c"
 BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="4ddfa00b07275a3f1de1c13409e6acb489115fe2"
+HLSLIB_COMMIT="c17aa478ae574971d115afa9fa4d9c215857d1ac"
 OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 40f625093b62c6f18282066d018a08ed2e587c81..d6285a6f699c774d18998c4c7426ac52362e9dec 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -709,13 +709,6 @@ class MatrixVectorActivation(HLSCustomOp):
             # ensure all thresholds are integer
             assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
-        # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0 and n_thres_steps == 1:
-            ret = np.copy(ret)
-            ret[0][0] = 1
-            warnings.warn(
-                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
-            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index ce8c31ee9a6cf335dadacff95cf3dbd5cd7590f7..292f70941a3c18a594d904eb4922dc4bce550d82 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -319,13 +319,6 @@ class Thresholding_Batch(HLSCustomOp):
             np.mod(orig_thres_matrix, 1), 0
         ).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
-        # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0 and n_thres_steps == 1:
-            ret = np.copy(ret)
-            ret[0][0] = 1
-            warnings.warn(
-                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
-            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index 10ee30f89ab6313a4e5863890483e1da3f76e511..a2dd3c75dc3c02faab2465e8ac5c70474560bba5 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -56,6 +56,7 @@ class VectorVectorActivation(HLSCustomOp):
     def get_nodeattr_types(self):
         my_attrs = {
             "PE": ("i", True, 0),
+            "SIMD": ("i", False, 1),
             "Dim": ("ints", True, []),  # [H, W]
             "Channels": ("i", True, 0),
             "Kernel": ("ints", True, []),  # [H, W]
@@ -198,7 +199,8 @@ class VectorVectorActivation(HLSCustomOp):
         ch = self.get_nodeattr("Channels")
         k_h, k_w = self.get_nodeattr("Kernel")
         pe = self.get_nodeattr("PE")
-        wmem = k_h * k_w * ch // pe
+        simd = self.get_nodeattr("SIMD")
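+        # weight memory depth is folded by both PE (over channels) and SIMD (over the kernel window)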
+        wmem = (k_h * k_w * ch // pe) // simd
         return wmem
 
     def calc_tmem(self):
@@ -250,7 +252,9 @@ class VectorVectorActivation(HLSCustomOp):
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
-        in_width = i_bits * self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        in_width = i_bits * simd * pe
         return in_width
 
     def get_outstream_width(self, ind=0):
@@ -260,15 +264,21 @@ class VectorVectorActivation(HLSCustomOp):
 
     def get_folded_input_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("Kernel")
-        sf = k_h * k_w
         dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
+        simd = self.get_nodeattr("SIMD")
         pe = self.get_nodeattr("PE")
+        kernel_2 = k_h * k_w
+        assert (
+            kernel_2 % simd == 0
+        ), "Requirement kernel (k_h * k_w) divisable by SIMD is violated."
+        sf = kernel_2 // simd
+        assert ch % pe == 0, "Requirement Channels divisible by PE is violated."
         nf = ch // pe
 
         if ind == 0:
             # calculate shape of input 0
-            folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe])
+            folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe])
         elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
             # calculate shape of input 1 (weights)
             folded_input_shape = tuple([1, sf * nf, pe])
@@ -304,6 +314,7 @@ class VectorVectorActivation(HLSCustomOp):
 
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
         ch = self.get_nodeattr("Channels")
         dim_h, dim_w = self.get_nodeattr("Dim")
         k_h, k_w = self.get_nodeattr("Kernel")
@@ -311,7 +322,7 @@ class VectorVectorActivation(HLSCustomOp):
         batch_size = 1
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
-        exp_cycles = ((ch * k_h * k_w) / pe) * batch_size * (dim_h * dim_w) / mmv
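+        # each output pixel needs (ch / pe) * (k_h * k_w / simd) cycles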
+        exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv
         return int(exp_cycles)
 
     def get_template_param_values(self):
@@ -355,6 +366,7 @@ class VectorVectorActivation(HLSCustomOp):
 
     def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
         pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
         ch = self.get_nodeattr("Channels")
         k_h, k_w = self.get_nodeattr("Kernel")
         wmem = self.calc_wmem()
@@ -366,10 +378,13 @@ class VectorVectorActivation(HLSCustomOp):
         ), """Weights matrix doesn't
         have expected shape (channels, 1, kernel_size, kernel_size)"""
         ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
         ret = ret.reshape(ch, k_h * k_w)
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        ret = ret.reshape(1, pe, wmem, 1)
+        ret = ret.reshape(1, pe, wmem, simd)
         return ret
 
     def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
@@ -403,13 +418,6 @@ class VectorVectorActivation(HLSCustomOp):
             # ensure all thresholds are integer
             assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
-        # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0 and n_thres_steps == 1:
-            ret = np.copy(ret)
-            ret[0][0] = 1
-            warnings.warn(
-                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
-            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (ch, 1))
@@ -460,7 +468,8 @@ class VectorVectorActivation(HLSCustomOp):
             f_weights = open(weight_file_name, "w")
             if export_wdt.bitwidth() != 1:
                 f_weights.write(
-                    "const FixedPointWeights<1,{},{},{}> weights = ".format(
+                    "const FixedPointWeights<{},{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
                         export_wdt.get_hls_datatype_str(),
                         self.get_nodeattr("PE"),
                         self.calc_wmem(),
@@ -468,7 +477,8 @@ class VectorVectorActivation(HLSCustomOp):
                 )
             else:
                 f_weights.write(
-                    "const BinaryWeights<1,{},{}> weights = ".format(
+                    "const BinaryWeights<{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
                         self.get_nodeattr("PE"),
                         self.calc_wmem(),
                     )
@@ -485,7 +495,7 @@ class VectorVectorActivation(HLSCustomOp):
             weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
             # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
             pe = self.get_nodeattr("PE")
-            simd = 1
+            simd = self.get_nodeattr("SIMD")
             # simd_flipped
             weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
                 1, -1, pe * simd
@@ -664,6 +674,12 @@ class VectorVectorActivation(HLSCustomOp):
                 not float32 as expected."""
                 expected_inp_shape = self.get_folded_input_shape()
                 reshaped_input = context[inputs].reshape(expected_inp_shape)
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType["BINARY"]
+                else:
+                    export_idt = self.get_input_datatype()
                 # make copy before saving the array
                 reshaped_input = reshaped_input.copy()
                 np.save(
@@ -679,14 +695,20 @@ class VectorVectorActivation(HLSCustomOp):
             super().exec_precompiled_singlenode_model()
             # load output npy file
             super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
             assert (
                 context[node.output[0]].shape == self.get_normal_output_shape()
             ), "cppsim did not produce expected output shape"
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
             nbits = self.get_instream_width()
-            idt = self.get_input_datatype()
-            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits)
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
 
@@ -754,9 +776,10 @@ class VectorVectorActivation(HLSCustomOp):
 
         self.code_gen_dict["$DEFINES$"] = [
             """#define Channels1 {}\n #define InnerProdDim {}\n
-            #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
+            #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format(
                 self.get_nodeattr("Channels"),
                 innerProdDim,
+                self.get_nodeattr("SIMD"),
                 self.get_nodeattr("PE"),
                 numReps,
             )
@@ -770,6 +793,9 @@ class VectorVectorActivation(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -867,6 +893,9 @@ class VectorVectorActivation(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -1108,7 +1137,7 @@ class VectorVectorActivation(HLSCustomOp):
 
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
-        Q = 1
+        Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
         omega = self.calc_wmem()
@@ -1117,7 +1146,7 @@ class VectorVectorActivation(HLSCustomOp):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle != "ultra")
-            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "const")
             or (mmode == "external")
         ):
             return 0
@@ -1129,9 +1158,11 @@ class VectorVectorActivation(HLSCustomOp):
         """Calculates resource estimation for BRAM"""
         # TODO add in/out FIFO contributions
         P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
         omega = self.calc_wmem()
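+        # width of one weight memory word: SIMD * PE weights of W bits each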
+        mem_width = Q * W * P
         # assuming SDP mode RAMB18s (see UG573 Table 1-10)
         # since this is HLS memory, not using the full width of a BRAM
         # assuming memories up to 128 deep get implemented in LUTs
@@ -1139,23 +1170,24 @@ class VectorVectorActivation(HLSCustomOp):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
+            or (mstyle == "auto" and self.calc_wmem() <= 128)
             or (mmode == "const" and self.calc_wmem() <= 128)
             or (mmode == "external")
         ):
             return 0
 
-        if W == 1:
-            return math.ceil(omega / 16384) * P
-        elif W == 2:
-            return math.ceil(omega / 8192) * P
-        elif W <= 4:
-            return (math.ceil(omega / 4096)) * (math.ceil(W / 4)) * P
-        elif W <= 9:
-            return (math.ceil(omega / 2048)) * (math.ceil(W / 8)) * P
-        elif W <= 18 or omega > 512:
-            return (math.ceil(omega / 1024)) * (math.ceil(W / 16)) * P
+        if mem_width == 1:
+            return math.ceil(omega / 16384)
+        elif mem_width == 2:
+            return math.ceil(omega / 8192)
+        elif mem_width <= 4:
+            return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4))
+        elif mem_width <= 9:
+            return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8))
+        elif mem_width <= 18 or omega > 512:
+            return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16))
         else:
-            return (math.ceil(omega / 512)) * (math.ceil(W / 32)) * P
+            return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32))
 
     def bram_efficiency_estimation(self):
         P = self.get_nodeattr("PE")
@@ -1179,6 +1211,7 @@ class VectorVectorActivation(HLSCustomOp):
         """
         # TODO add in/out FIFO contributions
         P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
         # determine tdt with input and weight data types
@@ -1193,14 +1226,16 @@ class VectorVectorActivation(HLSCustomOp):
         if (mmode == "decoupled" and mstyle == "distributed") or (
             mmode == "const" and self.calc_wmem() <= 128
         ):
-            c2 = (P * W) * math.ceil(self.calc_wmem() / 64)
+            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
 
         # multiplication
         res_type = self.get_nodeattr("resType")
         if res_type == "dsp":
             mult_luts = 0
         else:
-            mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A)
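+            # Q (= SIMD) multipliers operate in parallel within each PE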
+            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+        # adder tree
+        addertree_luts = (W + A) * (2 * Q - 1)
         # accumulator
         acc_datatype = self.get_accumulator_datatype()
         acc_bits = acc_datatype.bitwidth()
@@ -1223,10 +1258,14 @@ class VectorVectorActivation(HLSCustomOp):
         if noact == 0:
             odt = self.get_output_datatype()
             B = odt.bitwidth()
-            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
             comp_luts = (2**B - 1) * acc_bits
 
-        return int(c0 + c1 * (P * (mult_luts + acc_luts + thr_luts + comp_luts)) + c2)
+        return int(
+            c0
+            + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
+            + c2
+        )
 
     def dsp_estimation(self):
         # multiplication
@@ -1248,9 +1287,10 @@ class VectorVectorActivation(HLSCustomOp):
             self.get_nodeattr("mem_mode") == "decoupled"
             or self.get_nodeattr("mem_mode") == "external"
         ):
+            simd = self.get_nodeattr("SIMD")
             pe = self.get_nodeattr("PE")
             wp = self.get_weight_datatype().bitwidth()
-            w_width = pe * wp
+            w_width = simd * pe * wp
             return w_width
         else:
             return 0
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 797dad32a2cfeb3e00e224f264d91b5ee0e9247b..3602b1bdd5d013ee8ce2f6cf156490478f0cc74e 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -220,7 +220,7 @@ def unpack_innermost_dim_from_hex_string(
         if conv_dtype == DataType["BIPOLAR"]:
             ar_list = [2 * x - 1 for x in ar_list]
         # interpret values as signed values
-        elif conv_dtype.name.startswith("INT"):
+        elif dtype.signed():
             mask = 2 ** (conv_dtype.bitwidth() - 1)
             ar_list = [-(x & mask) + (x & ~mask) for x in ar_list]
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 96cd69c3453793c1634f132cb159f0cc8a94a28c..445afdf458c0e99d4a4dcc4eeb2f6fb305dbac4a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -132,10 +132,6 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     odt = act
     n_steps = act.get_num_possible_values() - 1
     T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
-    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
-    # threshold of first channel is zero, while using BIPOLAR output)
-    if act == DataType["BIPOLAR"]:
-        T[0][0] = 0
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index abf8ba0b9efde67c77711abc8451475887430cae..be1ada59a1ef50cf8d1c9ea26b31ce4956d9f3db 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -35,12 +35,17 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
@@ -77,6 +82,7 @@ def _calculate_dot_prod_range(dt_a, dt_b, len):
 def _make_single_vvau_modelwrapper(
     W,
     pe,
+    simd,
     k_h,
     k_w,
     channels,
@@ -103,7 +109,10 @@ def _make_single_vvau_modelwrapper(
     if T is not None:
         no_act = 0
         node_inp_list = ["inp", "weights", "thresh"]
-        actval = odt.min()
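+        # bipolar output is computed as binary inside the HLS kernel, so no signed offset is applied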
+        if odt == DataType["BIPOLAR"]:
+            actval = 0
+        else:
+            actval = odt.min()
     else:
         no_act = 1
         node_inp_list = ["inp", "weights"]
@@ -116,6 +125,7 @@ def _make_single_vvau_modelwrapper(
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         PE=pe,
+        SIMD=simd,
         Dim=[dim_h, dim_w],
         Channels=channels,
         Kernel=[k_h, k_w],
@@ -146,6 +156,11 @@ def _make_single_vvau_modelwrapper(
         model.set_tensor_datatype("thresh", tdt)
         model.set_initializer("thresh", T)
 
+    # Minimize accumulator width to obtain realistic HLS reports
+    model = model.transform(MinimizeAccumulatorWidth())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
     return model
 
 
@@ -154,13 +169,15 @@ def prepare_inputs(input_tensor):
 
 
 # input datatype
-@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType["UINT4"], None])
+@pytest.mark.parametrize("act", [DataType["BIPOLAR"], DataType["UINT4"], None])
 # PE
-@pytest.mark.parametrize("pe", [1, "channels"])
+@pytest.mark.parametrize("pe", [1, 3, 6])
+# SIMD
+@pytest.mark.parametrize("simd", [1, 9])
 # Input image shape
 @pytest.mark.parametrize("dim_h", [10])
 @pytest.mark.parametrize("dim_w", [10, 1])
@@ -168,7 +185,7 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("k_h", [3])
 @pytest.mark.parametrize("k_w", [3, 1])
 # Number of input and output channels
-@pytest.mark.parametrize("channels", [3, 4])
+@pytest.mark.parametrize("channels", [3, 6])
 # memory mode
 @pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # execution mode
@@ -177,17 +194,17 @@ def prepare_inputs(input_tensor):
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_vvau(
-    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
+    idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
 ):
-    if pe == "channels":
-        pe = channels
-
     if dim_w == 1 and k_w != 1:
         pytest.skip("1D image requires 1D kernel, skipping.")
 
     if channels % pe != 0:
         pytest.skip("Requirement Channels divisable by PE is violated.")
 
+    if (k_h * k_w) % simd != 0:
+        pytest.skip("Requirement kernel (k_h * k_w) divisable by SIMD is violated.")
+
     # Generate weights in expected shape for ONNX and HLS node
     W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]
     W_onnx = _infer_sparse_weight_tensor(
@@ -203,17 +220,26 @@ def test_fpgadataflow_vvau(
     if act is None:
         T = None
         tdt = None
-        odt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
     else:
         odt = act
-        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
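+        # VVAU is depthwise: each output accumulates over the k_h * k_w window only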
+        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w)
         n_steps = act.get_num_possible_values() - 1
         T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
         T = np.sort(T, axis=1)
-        tdt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + (k_h * k_w)) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]
 
     model = _make_single_vvau_modelwrapper(
-        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
+        W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
     )
 
     if exec_mode == "cppsim":
@@ -232,20 +258,31 @@ def test_fpgadataflow_vvau(
     input_dict = prepare_inputs(x_vvau)
 
     # Calculate output
-    y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        # Simulate XNOR-popcount matrix multiplication, see
+        # qonnx.custom_op.general.xnorpopcount (not usable due to sparse W)
+        y_expected = np.matmul(x, W_onnx)
+        y_expected = (y_expected + (k_h * k_w)) / 2
+    else:
+        y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
+
     if T is not None:
         # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format
         y_expected = np.transpose(y_expected, (0, 3, 1, 2))
         y_expected = multithreshold(y_expected, T)
         y_expected = np.transpose(y_expected, (0, 2, 3, 1))
-        # signed offset
-        y_expected += act.min()
+        if act == DataType["BIPOLAR"]:
+            # binary to bipolar
+            y_expected = 2 * y_expected - 1
+        else:
+            # signed offset
+            y_expected += act.min()
 
     y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[
         "outp"
     ]
 
-    assert (y_produced == y_expected).all(), "cppsim failed"
+    assert (y_produced == y_expected).all(), "incorrect result"
 
     if exec_mode == "rtlsim":
         node = model.get_nodes_by_op_type("VectorVectorActivation")[0]