diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index 813b673b396f3f63e6eb275c7ac9d7f2eb1eef56..6d4b5fb9e64ebdd3b491e4cddd26b35eca50cbee 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -225,9 +225,9 @@ class VectorVectorActivation(HLSCustomOp):
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
-        #if simd > 1:
-            #pe = self.get_nodeattr("Channels")
-        #else:
+        # if simd > 1:
+        # pe = self.get_nodeattr("Channels")
+        # else:
         pe = self.get_nodeattr("PE")
         in_width = i_bits * simd * pe
         return in_width
@@ -242,9 +242,9 @@ class VectorVectorActivation(HLSCustomOp):
         dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
         simd = self.get_nodeattr("SIMD")
-        #if simd > 1:
-            #pe = self.get_nodeattr("Channels")
-        #else:
+        # if simd > 1:
+        # pe = self.get_nodeattr("Channels")
+        # else:
         pe = self.get_nodeattr("PE")
         sf = k_h * k_w // simd
         nf = ch // pe
@@ -351,6 +351,9 @@ class VectorVectorActivation(HLSCustomOp):
         ), """Weights matrix doesn't
         have expected shape (channels, 1, kernel_size, kernel_size)"""
         ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
         ret = ret.reshape(ch, k_h * k_w)
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
@@ -649,6 +652,12 @@ class VectorVectorActivation(HLSCustomOp):
                 not float32 as expected."""
                 expected_inp_shape = self.get_folded_input_shape()
                 reshaped_input = context[inputs].reshape(expected_inp_shape)
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType["BINARY"]
+                else:
+                    export_idt = self.get_input_datatype()
                 # make copy before saving the array
                 reshaped_input = reshaped_input.copy()
                 np.save(
@@ -664,14 +673,20 @@ class VectorVectorActivation(HLSCustomOp):
             super().exec_precompiled_singlenode_model()
             # load output npy file
             super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
             assert (
                 context[node.output[0]].shape == self.get_normal_output_shape()
             ), "cppsim did not produce expected output shape"
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
             nbits = self.get_instream_width()
-            idt = self.get_input_datatype()
-            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits)
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
 
@@ -756,6 +771,9 @@ class VectorVectorActivation(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -826,6 +844,11 @@ class VectorVectorActivation(HLSCustomOp):
                 )
             ]
         elif mem_mode == "decoupled" or mem_mode == "external":
+            simd = self.get_nodeattr("SIMD")
+            if simd > 1:
+                raise Exception(
+                    "SIMD parallelism not supported for decoupled or external mode"
+                )
             wdt = self.get_weight_datatype()
             if wdt == DataType["BIPOLAR"]:
                 export_wdt = DataType["BINARY"]
@@ -853,6 +876,9 @@ class VectorVectorActivation(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 65478d2540b53443d3f74b44a22fde3defd8ca93..f7ea2ff9430b1371760e8eb44b38c2c982c1c30a 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -220,7 +220,7 @@ def unpack_innermost_dim_from_hex_string(
         if conv_dtype == DataType["BIPOLAR"]:
             ar_list = [2 * x - 1 for x in ar_list]
         # interpret values as signed values
-        elif conv_dtype.name.startswith("INT"):
+        elif dtype.signed():
             mask = 2 ** (conv_dtype.bitwidth() - 1)
             ar_list = [-(x & mask) + (x & ~mask) for x in ar_list]
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index ea4be473344bdecd4b200472bfe546aeff864e21..a418de5728d73e22b67f1107ff842421ba680941 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -27,30 +27,29 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+
 import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
-
-# from qonnx.custom_op.registry import getCustomOp
+from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
 
 import finn.core.onnx_exec as oxe
-
-# from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.minimize_accumulator_width import (
-    MinimizeAccumulatorWidth,
-)
 
 
 def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels):
@@ -110,7 +109,10 @@ def _make_single_vvau_modelwrapper(
     if T is not None:
         no_act = 0
         node_inp_list = ["inp", "weights", "thresh"]
-        actval = odt.min()
+        if odt == DataType["BIPOLAR"]:
+            actval = 0
+        else:
+            actval = odt.min()
     else:
         no_act = 1
         node_inp_list = ["inp", "weights"]
@@ -167,15 +169,15 @@ def prepare_inputs(input_tensor):
 
 
 # input datatype
-@pytest.mark.parametrize("idt", [DataType["UINT4"]])
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType["UINT4"]])
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType["UINT4"], None])
+@pytest.mark.parametrize("act", [DataType["BIPOLAR"], DataType["UINT4"], None])
 # PE
-@pytest.mark.parametrize("pe", [1,2,3,6])
+@pytest.mark.parametrize("pe", [1, 3, 6])
 # SIMD
-@pytest.mark.parametrize("simd", [1,9])
+@pytest.mark.parametrize("simd", [1, 9])
 # Input image shape
 @pytest.mark.parametrize("dim_h", [10])
 @pytest.mark.parametrize("dim_w", [10])
@@ -187,7 +189,7 @@ def prepare_inputs(input_tensor):
 # memory mode
 @pytest.mark.parametrize("mem_mode", ["const"])
 # execution mode
-@pytest.mark.parametrize("exec_mode", ["cppsim","rtlsim"])
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
@@ -203,9 +205,6 @@ def test_fpgadataflow_vvau(
     if channels % pe != 0:
         pytest.skip("Requirement Channels divisable by PE is violated.")
 
-    #if pe < channels and simd > 1:
-    #    pytest.skip("Do not apply SIMD parallelism before max PE parallelism")
-
     # Generate weights in expected shape for ONNX and HLS node
     W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]
     W_onnx = _infer_sparse_weight_tensor(
@@ -221,14 +220,23 @@ def test_fpgadataflow_vvau(
     if act is None:
         T = None
         tdt = None
-        odt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
     else:
         odt = act
-        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
+        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w)
         n_steps = act.get_num_possible_values() - 1
         T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
         T = np.sort(T, axis=1)
-        tdt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + (k_h * k_w)) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]
 
     model = _make_single_vvau_modelwrapper(
         W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
@@ -250,14 +258,25 @@ def test_fpgadataflow_vvau(
     input_dict = prepare_inputs(x_vvau)
 
     # Calculate output
-    y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        # Simulate XNOR-popcount matrix multiplication, see
+        # qonnx.custom_op.general.xnorpopcount (not usable due to sparse W)
+        y_expected = np.matmul(x, W_onnx)
+        y_expected = (y_expected + (k_h * k_w)) / 2
+    else:
+        y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
+
     if T is not None:
         # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format
         y_expected = np.transpose(y_expected, (0, 3, 1, 2))
         y_expected = multithreshold(y_expected, T)
         y_expected = np.transpose(y_expected, (0, 2, 3, 1))
-        # signed offset
-        y_expected += act.min()
+        if act == DataType["BIPOLAR"]:
+            # binary to bipolar
+            y_expected = 2 * y_expected - 1
+        else:
+            # signed offset
+            y_expected += act.min()
 
     y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[
         "outp"
@@ -265,11 +284,11 @@ def test_fpgadataflow_vvau(
 
     assert (y_produced == y_expected).all(), "incorrect result"
 
-    # if exec_mode == "rtlsim":
-    # node = model.get_nodes_by_op_type("VectorVectorActivation")[0]
-    # inst = getCustomOp(node)
-    # cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
-    # exp_cycles_dict = model.analysis(exp_cycles_per_layer)
-    # exp_cycles = exp_cycles_dict[node.name]
-    # assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
-    # assert exp_cycles != 0
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("VectorVectorActivation")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0