diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 7ceb411a2bd524274ea45664ca1fc965329b6592..5a095f041a1696c7d4b37f9b733d5d152d9ecd46 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -469,6 +469,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # reshape weight tensor to desired shape
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
+            if simd > pe:
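+                # reverse the weight tensor along axis -3; the matching output flip for SIMD > PE is applied in test_fpgadataflow_fclayer.py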
+                weight_tensor = np.flip(weight_tensor, axis=-3)
+
             weight_tensor = weight_tensor.reshape(1, -1, pe*simd)
             np.save(
                     os.path.join(code_gen_dir, "weights.npy"),
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 282e7a72e48ea42d3a42f683a8c47595a4eb6710..5b321b4bae71ce0eba00f6a86caccb490cce27d6 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -134,13 +134,13 @@ def prepare_inputs(input_tensor, idt, wdt):
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType.INT2])
+@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT2])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.INT2])
+@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # neuron folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1]) #, 1])
 # synapse folding, -1 is maximum possible
-@pytest.mark.parametrize("sf", [1])
+@pytest.mark.parametrize("sf", [1]) #, 1])
 # HLS matrix width (input features)
 @pytest.mark.parametrize("mw", [4])
 # HLS matrix height (output features)
@@ -155,13 +155,13 @@ def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     assert mh % pe == 0
     assert mw % sf == 0
     # generate weights
-    #W = gen_finn_dt_tensor(wdt, (mw, mh))
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
     #W = np.eye(mw, mh)
-    W = np.asarray([-2., -2.,  1., -2., 0., -1., -2.,  0., -1., -1.,  0.,  0., -2., -1.,  1., -1.], dtype=np.float32).reshape(mw, mh)
+    #W = np.asarray([-2., -2.,  1., -2., 0., -1., -2.,  0., -1., -1.,  0.,  0., -2., -1.,  1., -1.], dtype=np.float32).reshape(mw, mh)
     #import pdb; pdb.set_trace()
     # generate input data
-    #x = gen_finn_dt_tensor(idt, (1, mw))
-    x = np.asarray([[-2, -1, 0, 1]], dtype=np.float32)
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    #x = np.asarray([[-2, -1, 0, 1]], dtype=np.float32)
     if act is None:
         # no activation, produce accumulators
         T = None
@@ -212,7 +212,11 @@ def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     y_expected = y.reshape(oshape)
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-    if (y_produced.reshape(y_expected.shape) == y_expected).all():
+    y_produced = y_produced.reshape(y_expected.shape)
+    if simd > pe:
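+        # flip the produced output back (mirrors the SIMD > PE weight flip in streamingfclayer_batch.py)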
+        y_produced = np.flip(y_produced, axis=1)
+    if (y_produced == y_expected).all():
         test = "passed"
     else:
         test = "failed"
@@ -222,7 +226,7 @@ def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
                 act = "None"
             writer = csv.writer(file)
             writer.writerow([act, wdt, idt, nf, sf, mw, mh, test, y_expected, y_produced])
-    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "npysim failed"
+    assert (y_produced == y_expected).all(), "npysim failed"
 
 
 # activation: None or DataType