diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index a3aa9d570d0efcbe82090d19a151d4f5b12078b6..a80d2bbefac96e8ec2a48e04179d3d285e78cef7 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -78,24 +78,33 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
 
     def check_divisible_iowidths(self):
         impl_style = self.get_nodeattr("impl_style")
-        if impl_style == "hls":
-            # when using impl_style = hls must have the following
-            # if inWidth > outWidth: inWidth % outWidth = 0
-            # if inWidth < outWidth: outWidth % inWidth = 0
-            iwidth = self.get_nodeattr("inWidth")
-            owidth = self.get_nodeattr("outWidth")
-            if iwidth > owidth:
-                assert (
-                    iwidth % owidth == 0
-                ), """DWC InWidth is bigger than OutWidth and is not divisible by it.
-                Please adjust PE and SIMD values so that InWidth % OutWidth = 0
-                or alternatively use impl_style = vivado"""
-            else:
-                assert (
-                    owidth % iwidth == 0
-                ), """DWC OutWidth is bigger than InWidth and is not divisible by it.
-                Please adjust PE and SIMD values so that OutWidth % InWidth = 0
-                or alternatively use impl_style = vivado"""
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        if impl_style == "vivado":
+            # the AXIS IP we use in vivado mode only supports
+            # stream widths that are divisible by 8
+            iwidth_d8 = iwidth % 8 == 0
+            owidth_d8 = owidth % 8 == 0
+            assert (
+                iwidth_d8 and owidth_d8
+            ), """DWC impl_style=vivado requires
+            stream widths that are divisible by 8: (%d, %d)""" % (
+                iwidth,
+                owidth,
+            )
+
+    def get_iowidth_lcm(self):
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        return int(np.lcm(iwidth, owidth))
+
+    def needs_lcm(self):
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        maxwidth = max(iwidth, owidth)
+        minwidth = min(iwidth, owidth)
+        impl_style = self.get_nodeattr("impl_style")
+        return (impl_style == "hls") and (maxwidth % minwidth != 0)
 
     def get_folded_input_shape(self, ind=0):
         self.check_divisible_iowidths()
@@ -202,6 +211,16 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             "#define NumInWords %d " % numInWords,
             "#define numReps %d" % numReps,
         ]
+        if self.needs_lcm():
+            lcmWidth = self.get_iowidth_lcm()
+            assert (
+                numInWords % (lcmWidth / inWidth) == 0
+            ), "Error in DWC LCM calculation"
+            numLCMToOut = numInWords // (lcmWidth / inWidth)
+            self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth)
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define NumLCMToOut %d" % (numLCMToOut)
+            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -226,6 +245,12 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
         )
+        if self.needs_lcm():
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> intermediate ("intermediate");'.format(
+                    self.get_iowidth_lcm()
+                )
+            )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
@@ -233,9 +258,19 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
     def docompute(self):
         # TODO continue with fxns below, they are copy-pasted
        op = "StreamingDataWidthConverter_Batch"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            "%s<InWidth, OutWidth, NumInWords>(in0, out, numReps);" % (op)
-        ]
+        if self.needs_lcm():
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                'hls::stream<ap_uint<{}>> intermediate ("intermediate");'.format(
+                    self.get_iowidth_lcm()
+                ),
+                "%s<InWidth, LCMWidth, NumInWords>(in0, intermediate, numReps);" % (op),
+                "%s<LCMWidth, OutWidth, NumLCMToOut>(intermediate, out, numReps);"
+                % (op),
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "%s<InWidth, OutWidth, NumInWords>(in0, out, numReps);" % (op)
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -287,6 +322,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+        if self.needs_lcm():
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS DATAFLOW disable_start_propagation"
+            )
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
@@ -466,3 +505,28 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             cset_luts += outw
 
         return int(cnt_luts + cset_luts)
+
+    def prepare_rtlsim(self):
+        assert self.get_nodeattr("impl_style") != "vivado", (
+            "StreamingDataWidthConverter impl_style "
+            "cannot be vivado for rtlsim. Only impl_style=hls supported."
+        )
+        super().prepare_rtlsim()
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        # no codegen required for impl_style=vivado since
+        # that uses premade, configurable AXIS IP
+        if self.get_nodeattr("impl_style") == "hls":
+            super().code_generation_ipgen(model, fpgapart, clk)
+
+    def ipgen_singlenode_code(self):
+        # no IP generation required for impl_style=vivado since
+        # that uses premade, configurable AXIS IP
+        if self.get_nodeattr("impl_style") == "hls":
+            super().ipgen_singlenode_code()
+        else:
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # set ipgen_path and ip_path so that HLSSynthIP
+            # and CreateStitchedIP transformations do not complain
+            self.set_nodeattr("ipgen_path", code_gen_dir)
+            self.set_nodeattr("ip_path", code_gen_dir)
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index efc179923545eb06e4d173c683b0941887f8bb79..632d1f813b4d2509407930bc9294f7531d4c90af 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -83,10 +83,13 @@ class InsertDWC(Transformation):
                             dwc_out_width = n1.get_instream_width()
                             larger_width = max(dwc_in_width, dwc_out_width)
                             smaller_width = min(dwc_in_width, dwc_out_width)
-                            if larger_width % smaller_width == 0:
-                                impl_style = "hls"
-                            else:
+                            both_8bit_aligned = (larger_width % 8 == 0) and (
+                                smaller_width % 8 == 0
+                            )
+                            if both_8bit_aligned:
                                 impl_style = "vivado"
+                            else:
+                                impl_style = "hls"
 
                             # determine shape for dwc
                             dwc_shape = n0.get_normal_output_shape()
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
index 642cabcf6dd320f226539e03fd6800156c9fe852..ed3e1a843eca47d2e20e9ca1c9df0d2d6f5a8a13 100644
--- a/src/finn/util/create.py
+++ b/src/finn/util/create.py
@@ -30,7 +30,11 @@ import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor, qonnx_make_model
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 
 def hls_random_mlp_maker(layer_spec):
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
index caa22e077f58035ac2acf0a3455ae08a1163cabc..8ab22bcfdcb0312bd49677f0e00d8e97cdcad3c1 100644
--- a/tests/fpgadataflow/test_depthwise_convolution.py
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -37,7 +37,11 @@ from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor, qonnx_make_model
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index 104bfa011fae03aeab0880709763d1b098bbbaa5..2bde148a1499e4c7065ab1e151e3c4198e1e96da 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -35,16 +35,16 @@ from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
 
-def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype):
+def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style):
 
-    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape)
-    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape)
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
 
     DWC_node = helper.make_node(
         "StreamingDataWidthConverter_Batch",
@@ -52,10 +52,11 @@ def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype):
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        shape=Shape,
-        inWidth=INWidth,
-        outWidth=OUTWidth,
+        shape=shape,
+        inWidth=inWidth,
+        outWidth=outWidth,
         dataType=str(finn_dtype.name),
+        impl_style=impl_style,
     )
 
     graph = helper.make_graph(
@@ -75,34 +76,42 @@ def prepare_inputs(input_tensor, dt):
     return {"inp": input_tensor}
 
 
-# shape
-@pytest.mark.parametrize("Shape", [[1, 4], [1, 2, 8]])
-# inWidth
-@pytest.mark.parametrize("INWidth", [2, 4])
-# outWidth
-@pytest.mark.parametrize("OUTWidth", [2, 4])
-# finn_dtype
-@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]])
+@pytest.mark.parametrize(
+    "config",
+    [
+        ([1, 24], 6, 4, DataType["INT2"], "hls"),
+        ([1, 24], 4, 6, DataType["INT2"], "hls"),
+        ([1, 4], 2, 4, DataType["BIPOLAR"], "hls"),
+        ([1, 2, 8], 2, 4, DataType["BIPOLAR"], "hls"),
+        ([1, 4], 4, 2, DataType["INT2"], "hls"),
+        ([1, 2, 8], 4, 4, DataType["INT2"], "hls"),
+        ([1, 2, 8], 8, 16, DataType["INT2"], "vivado"),
+    ],
+)
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype):
-
+def test_fpgadataflow_dwc_rtlsim(config):
+    shape, inWidth, outWidth, finn_dtype, impl_style = config
+    test_fpga_part = "xc7z020clg400-1"
+    target_clk_ns = 10.0
     # generate input data
-    x = gen_finn_dt_tensor(finn_dtype, Shape)
+    x = gen_finn_dt_tensor(finn_dtype, shape)
     input_dict = prepare_inputs(x, finn_dtype)
 
-    model = make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype)
-
-    model = model.transform(SetExecMode("rtlsim"))
+    model = make_single_dwc_modelwrapper(
+        shape, inWidth, outWidth, finn_dtype, impl_style
+    )
+    model = model.transform(InsertFIFO(create_shallow_fifos=True))
     model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(PrepareIP(test_fpga_part, 5))
     model = model.transform(HLSSynthIP())
-    model = model.transform(PrepareRTLSim())
-
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model.set_metadata_prop("exec_mode", "rtlsim")
     y = oxe.execute_onnx(model, input_dict)["outp"]
 
     assert (
         y == x
     ).all(), """The output values are not the same as the input values anymore."""
-    assert y.shape == tuple(Shape), """The output shape is incorrect."""
+    assert y.shape == tuple(shape), """The output shape is incorrect."""
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index f3efd6a686b630cc4031b99efd199490e481aeab..b80ef76a19e487a93b23ae7db17350e85fb66822 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -36,7 +36,11 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor, qonnx_make_model
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
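
Note (editorial, not part of the patch): the core of this change is the LCM fallback. When impl_style="hls" and neither stream width divides the other, defines() and docompute() above now chain two HLS converters through an intermediate stream of lcm(inWidth, outWidth) bits. The following is a minimal Python sketch of that two-stage scheme; dwc_via_lcm is an invented name, and LSB-first packing is an assumption made here purely for illustration.

# Sketch of the LCM-based two-stage width conversion enabled by needs_lcm().
# Not part of the patch: dwc_via_lcm is a made-up helper; LSB-first packing
# is assumed for illustration.
import numpy as np


def dwc_via_lcm(words, in_width, out_width):
    # stage 1: pack lcm_width // in_width input words into one wide word
    lcm_width = int(np.lcm(in_width, out_width))
    ratio_in = lcm_width // in_width
    assert len(words) % ratio_in == 0, "Error in DWC LCM calculation"
    intermediate = []
    for i in range(0, len(words), ratio_in):
        packed = 0
        for j, w in enumerate(words[i : i + ratio_in]):
            packed |= w << (j * in_width)  # LSB-first packing (assumed)
        intermediate.append(packed)
    # stage 2: split each wide word into lcm_width // out_width output words
    ratio_out = lcm_width // out_width
    mask = (1 << out_width) - 1
    return [
        (packed >> (j * out_width)) & mask
        for packed in intermediate
        for j in range(ratio_out)
    ]


# 6-bit -> 4-bit, as in the ([1, 24], 6, 4, ...) test case: lcm(6, 4) = 12,
# so 2 input words pack into one 12-bit word, which re-splits into 3 outputs
print(dwc_via_lcm([0b101010, 0b110011], 6, 4))  # -> [10, 14, 12]

For that ([1, 24], 6, 4, DataType["INT2"], "hls") test case, 24 INT2 elements form 48 stream bits, so numInWords = 8 at inWidth = 6; every 2 input words pack into one 12-bit word and each re-splits into 3 output words, which is exactly the numInWords % (lcmWidth / inWidth) == 0 condition asserted in defines().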