diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index e2f96395ad74255ad67549255608cd52737e97d9..cd14765f388d76b3e42ba88e959c4eecb87ccab0 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -30,6 +30,7 @@ import numpy as np
 from shutil import copy
 import subprocess
 import math
+import warnings
 
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
@@ -178,14 +179,11 @@ class StreamingFIFO(HLSCustomOp):
         depth = self.get_nodeattr("depth")
         # depth has to be between 2 and 256 with the current
         # StreamingFIFO implementation
-        assert (
-            depth >= 2
-        ), """Depth is too low. Please set node attribute "depth" to a value
-        between 2 and 256"""
-        assert (
-            depth <= 256
-        ), """Depth is too high. Please set node attribute "depth" to a value
-        between 2 and 256"""
+        assert depth >= 2, """Depth is too low"""
+        if depth > 256 and self.get_nodeattr("impl_style") == "rtl":
+            warnings.warn(
+                "Depth is high, set between 2 and 256 for efficient SRL implementation"
+            )
         # derive normal shape from folded shape
         # StreamingFIFOs are inserted in between fpgadataflow nodes
         # the folded shape could be for example (1, nf, pe)
@@ -424,7 +422,6 @@ class StreamingFIFO(HLSCustomOp):
         else:
             return (math.ceil(depth / 4096)) * (math.ceil(W / 72))
 
-
     def bram_efficiency_estimation(self):
         depth = self.get_nodeattr("depth")
         W = self.get_instream_width()
@@ -450,4 +447,3 @@ class StreamingFIFO(HLSCustomOp):
             ram_luts = 0
 
         return int(address_luts + ram_luts)
-
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 6f7fde0c4faba09e584eb578819f44c18639bc9d..38d438927677b853e1f256adcc1ca3048cdf1f28 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -4,6 +4,7 @@ from onnx import helper as oh
 from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
 from finn.util.fpgadataflow import is_fpgadataflow_node
+import warnings
 import numpy as np
 
 
@@ -56,66 +57,81 @@ class InsertFIFO(Transformation):
         for n in graph.node:
             node_ind += 1
             if _suitable_node(n):
-                n_output = n.output[0]
-                consumer = model.find_consumer(n_output)
-                if _suitable_node(consumer) is True:
-                    n0 = getCustomOp(n)
-                    # determine fifo node attributes
-                    fld_shape = n0.get_folded_output_shape()
-                    dtype = n0.get_output_datatype()
-
-                    # check if folded_shape of output of first node and
-                    # input of the second node is equal
-                    n1 = getCustomOp(consumer)
-                    fld_shape_2 = n1.get_folded_input_shape()
-                    assert _suitable_folded_shapes(
-                        fld_shape, fld_shape_2
-                    ), """The
-                    folded output shape of the first node is not the same as the
-                    folded output shape of the second node. A streaming fifo can't
-                    be implemented in between these nodes."""
-
-                    # check if outFIFOdepth attribute of first node
-                    # and inFIFOdepth attribute of consumer node is equal
-                    n0_depth = n0.get_nodeattr("outFIFODepth")
-                    n1_depth = n1.get_nodeattr("inFIFODepth")
-                    if n0_depth == n1_depth:
-                        fifo_depth = n0_depth
-                    elif n0_depth != n1_depth:
-                        fifo_depth = max(n0_depth, n1_depth)
-
-                    if fifo_depth > 2:
-                        # assumption: HLS streaming components already have
-                        # depth-2 FIFOs on inputs and outputs, so no point
-                        # creating additional small FIFOs in between --
-                        # we only create the larger FIFOs specified
-                        # create fifo node
-                        fifo_output_tensor = oh.make_tensor_value_info(
-                            model.make_new_valueinfo_name(),
-                            TensorProto.FLOAT,
-                            n0.get_normal_output_shape(),
+                for n_output in n.output:
+                    consumers = model.find_consumers(n_output)
+                    if consumers is None:
+                        continue
+                    if len(consumers) > 1:
+                        warnings.warn(
+                            n.name
+                            + ": HLS node with fan-out higher than 1 cannot be stitched"
                         )
-                        graph.value_info.append(fifo_output_tensor)
-                        model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                        fifo_node = oh.make_node(
-                            "StreamingFIFO",
-                            [n_output],
-                            [fifo_output_tensor.name],
-                            domain="finn",
-                            backend="fpgadataflow",
-                            depth=fifo_depth,
-                            folded_shape=fld_shape,
-                            dataType=str(dtype.name),
-                        )
-                        # insert fifo
-                        graph.node.insert(node_ind + 1, fifo_node)
-                        # set fifo output tensor as new input tensor of second node
-                        consumer.input[0] = fifo_output_tensor.name
-                        # ensure created FIFO depth is reflected on both sides
-                        n0.set_nodeattr("outFIFODepth", fifo_depth)
-                        n1.set_nodeattr("inFIFODepth", fifo_depth)
-                        graph_modified = True
+                    consumer = consumers[0]
+                    if _suitable_node(consumer) is True:
+                        n0 = getCustomOp(n)
+                        # determine fifo node attributes
+                        fld_shape = n0.get_folded_output_shape()
+                        dtype = n0.get_output_datatype()
+
+                        # check if folded_shape of output of first node and
+                        # input of the second node is equal
+                        n1 = getCustomOp(consumer)
+                        for idx, inp in enumerate(consumer.input):
+                            if inp == n_output:
+                                if idx == 0:
+                                    fld_shape_2 = n1.get_folded_input_shape()
+                                else:
+                                    fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                        assert _suitable_folded_shapes(
+                            fld_shape, fld_shape_2
+                        ), """The
+                        folded output shape of the first node is not the same as the
+                        folded output shape of the second node. A streaming fifo can't
+                        be implemented in between these nodes."""
+
+                        # check if outFIFOdepth attribute of first node
+                        # and inFIFOdepth attribute of consumer node is equal
+                        n0_depth = n0.get_nodeattr("outFIFODepth")
+                        n1_depth = n1.get_nodeattr("inFIFODepth")
+                        if n0_depth == n1_depth:
+                            fifo_depth = n0_depth
+                        elif n0_depth != n1_depth:
+                            fifo_depth = max(n0_depth, n1_depth)
+
+                        if fifo_depth > 2:
+                            # assumption: HLS streaming components already have
+                            # depth-2 FIFOs on inputs and outputs, so no point
+                            # creating additional small FIFOs in between --
+                            # we only create the larger FIFOs specified
+                            # create fifo node
+                            fifo_output_tensor = oh.make_tensor_value_info(
+                                model.make_new_valueinfo_name(),
+                                TensorProto.FLOAT,
+                                n0.get_normal_output_shape(),
+                            )
+                            graph.value_info.append(fifo_output_tensor)
+                            model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                            fifo_node = oh.make_node(
+                                "StreamingFIFO",
+                                [n_output],
+                                [fifo_output_tensor.name],
+                                domain="finn",
+                                backend="fpgadataflow",
+                                depth=fifo_depth,
+                                folded_shape=fld_shape,
+                                dataType=str(dtype.name),
+                            )
+                            # insert fifo
+                            graph.node.insert(node_ind + 1, fifo_node)
+                            # set fifo output tensor as new input tensor of second node
+                            for idx, inp in enumerate(consumer.input):
+                                if inp == n_output:
+                                    consumer.input[idx] = fifo_output_tensor.name
+                            # ensure created FIFO depth is reflected on both sides
+                            n0.set_nodeattr("outFIFODepth", fifo_depth)
+                            n1.set_nodeattr("inFIFODepth", fifo_depth)
+                            graph_modified = True
 
         if graph_modified is False:
             # insert FIFO as first node, except when first node is DMA
@@ -131,30 +147,31 @@ class InsertFIFO(Transformation):
                 dtype = n0.get_input_datatype()
                 fifo_depth = n0.get_nodeattr("inFIFODepth")
 
-                # create fifo node
-                fifo_output_tensor = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    n0.get_normal_input_shape(),
-                )
-                graph.value_info.append(fifo_output_tensor)
-                model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                fifo_node = oh.make_node(
-                    "StreamingFIFO",
-                    [n_input],
-                    [fifo_output_tensor.name],
-                    domain="finn",
-                    backend="fpgadataflow",
-                    depth=fifo_depth,
-                    folded_shape=fld_shape,
-                    dataType=str(dtype.name),
-                )
-                # insert fifo
-                graph.node.insert(0, fifo_node)
-
-                # set fifo output tensor as new input tensor of second node
-                n.input[0] = fifo_output_tensor.name
+                if fifo_depth > 2:
+                    # create fifo node
+                    fifo_output_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_input_shape(),
+                    )
+                    graph.value_info.append(fifo_output_tensor)
+                    model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [n_input],
+                        [fifo_output_tensor.name],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        depth=fifo_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.insert(0, fifo_node)
+
+                    # set fifo output tensor as new input tensor of second node
+                    n.input[0] = fifo_output_tensor.name
 
             # insert FIFO as last node, except when last node is DMA
             if (
@@ -173,29 +190,30 @@ class InsertFIFO(Transformation):
                 dtype = n0.get_output_datatype()
                 fifo_depth = n0.get_nodeattr("outFIFODepth")
 
-                # create fifo node
-                fifo_input_tensor = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    n0.get_normal_output_shape(),
-                )
-                graph.value_info.append(fifo_input_tensor)
-                model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                fifo_node = oh.make_node(
-                    "StreamingFIFO",
-                    [fifo_input_tensor.name],
-                    [graph_out_name],
-                    domain="finn",
-                    backend="fpgadataflow",
-                    depth=fifo_depth,
-                    folded_shape=fld_shape,
-                    dataType=str(dtype.name),
-                )
-                # insert fifo
-                graph.node.append(fifo_node)
-
-                # set fifo output tensor as new input tensor of second node
-                n.output[0] = fifo_input_tensor.name
+                if fifo_depth > 2:
+                    # create fifo node
+                    fifo_input_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_output_shape(),
+                    )
+                    graph.value_info.append(fifo_input_tensor)
+                    model.set_tensor_datatype(fifo_input_tensor.name, dtype)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [fifo_input_tensor.name],
+                        [graph_out_name],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        depth=fifo_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.append(fifo_node)
+
+                    # set fifo input tensor as new output tensor of the preceding node
+                    n.output[0] = fifo_input_tensor.name
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 798bbd335f0028d1103d992fd2b8b9cd30bbb6e1..71712d8ca8e3fb7f4050dd0f489d74f177f2cab8 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -26,7 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import copy
 import math
 import numpy as np
 import warnings
@@ -39,7 +38,7 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
 from finn.core.rtlsim_exec import (
     _reset_rtlsim,
     _toggle_clk,
@@ -49,6 +48,12 @@ from finn.util.fpgadataflow import (
 )
 
 
+def reset_implementation(node):
+    """Set code_gen_dir_ipgen, ipgen_path and ip_path on node to empty strings."""
+    for attr_name in ("code_gen_dir_ipgen", "ipgen_path", "ip_path"):
+        node.set_nodeattr(attr_name, "")
+
+
 def set_signal(sim, keyw, value):
     for i in range(len(sim.inputs)):
         input_name = sim.inputs[i][0]
@@ -56,6 +61,13 @@ def set_signal(sim, keyw, value):
             sim.io[input_name] = value
 
 
+def get_signal(sim, keyw):
+    """Return sim.io[name] of the first output whose name contains keyw, else None."""
+    for entry in sim.outputs:
+        if keyw in entry[0]:
+            return sim.io[entry[0]]
+
+
 def optimize_depth(depth):
     if depth <= 2:
         return 2
@@ -63,7 +75,7 @@ def optimize_depth(depth):
         return 32
     if depth <= 1024:
         return int(2 ** math.ceil(math.log2(depth)))
-    return int(math.ceil(depth / 1024))
+    return int(math.ceil(depth / 1024) * 1024)
 
 
 class SetFIFODepths(Transformation):
@@ -73,28 +85,28 @@ class SetFIFODepths(Transformation):
     images on input (random/constant data) and keep track of maximum
     occupancy counts in each FIFO."""
 
-    def __init__(self, fpgapart, clk_ns=10.0):
+    def __init__(self, fpgapart, clk_ns=10.0, max_qsrl_depth=256, max_depth=2 ** 14):
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
+        self.max_qsrl_depth = max_qsrl_depth  # deeper FIFOs get impl_style "vivado"
+        self.max_depth = max_depth  # depth given to every FIFO before simulation
 
     def apply(self, model):
 
-        orig_model = model
-
-        # work on a copy of the model
-        model = copy.deepcopy(model)
-
-        # change external to decoupled and warn user;
+        # change external to decoupled and warn user
         # this way we are sure we have exactly one input/output
+        modified_fc_nodes = []
         for node in model.graph.node:
             node = getCustomOp(node)
-            node.set_nodeattr("inFIFODepth", 2 ** 14)
-            node.set_nodeattr("outFIFODepth", 2 ** 14)
+            node.set_nodeattr("inFIFODepth", self.max_depth)
+            node.set_nodeattr("outFIFODepth", self.max_depth)
             if node.onnx_node.op_type == "StreamingFCLayer_Batch":
                 mmode = node.get_nodeattr("mem_mode")
                 if mmode == "external":
+                    modified_fc_nodes.append(node.onnx_node.name)
                     node.set_nodeattr("mem_mode", "decoupled")
+                    reset_implementation(node)
                     warnings.warn(
                         "Changed mem_mode from external to decoupled for "
                         + node.onnx_node.name
@@ -104,27 +116,17 @@ class SetFIFODepths(Transformation):
         model = model.transform(InsertDWC())
         model = model.transform(InsertFIFO())
         model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveReadableTensorNames())
 
         # gather FIFO names, check they are of expected depth
         fifos = {}
         for node in model.graph.node:
             if node.op_type == "StreamingFIFO":
-                consumer = model.find_consumers(node.output[0])
-                if consumer is not None:
-                    consumer = consumer[0].name
-                producer = model.find_producer(node.input[0])
-                if producer is not None:
-                    producer = producer.name
-                fifos[node.name] = {
-                    "depth": 0,
-                    "consumer": consumer,
-                    "producer": producer,
-                }
+                fifos[node.name] = 0
                 node = getCustomOp(node)
-                # check depths
-                # if model came in with FIFOs, the depths will not have been updated
-                if node.get_nodeattr("depth") != 2 ** 14:
-                    node.set_nodeattr("depth", 2 ** 14)
+                # check depths and fix as necessary
+                if node.get_nodeattr("depth") != self.max_depth:
+                    node.set_nodeattr("depth", self.max_depth)
 
         # insert FIFOs and do all transformations for RTLsim
         model = model.transform(AnnotateCycles())
@@ -138,12 +140,17 @@ class SetFIFODepths(Transformation):
 
         # calculate input frequency (number of cycles for each input word)
         first_node = getCustomOp(model.graph.node[0])
-        ncycles_per_input = math.ceil(
-            perf["max_cycles"]
-            / (
-                np.prod(first_node.get_folded_input_shape())
-                / first_node.get_folded_input_shape()[-1]
-            )
+        ncycles_per_input = max(
+            1,
+            int(
+                math.ceil(
+                    perf["max_cycles"]
+                    / (
+                        np.prod(first_node.get_folded_input_shape())
+                        / first_node.get_folded_input_shape()[-1]
+                    )
+                )
+            ),
         )
 
         # set sufficiently large threshold for 1 image to  fully execute and exit
@@ -161,6 +168,7 @@ class SetFIFODepths(Transformation):
         set_signal(sim, "tready", 1)
         set_signal(sim, "tdata", 0)
 
+        output_detected = False
         while ncycles > 0:
             _toggle_clk(sim)
             # set/unset valids
@@ -181,37 +189,83 @@ class SetFIFODepths(Transformation):
                     current_count = current_addr + 2
                 else:
                     current_count = current_state
-                if current_count > fifos[key]["depth"]:
-                    fifos[key]["depth"] = current_count
-            ncycles = ncycles - 1
-
-        # for each node in the original graph, determine in/outFIFODepth
-        ret = {}
-        for key in fifos:
-            predecessor_node = fifos[key]["producer"]
-            if predecessor_node is not None:
-                if predecessor_node not in ret:
-                    ret[predecessor_node] = {"inFIFODepth": 0, "outFIFODepth": 0}
-                out_depth = ret[predecessor_node]["outFIFODepth"]
-                ret[predecessor_node]["outFIFODepth"] = max(
-                    out_depth, fifos[key]["depth"]
-                )
+                if current_count > fifos[key]:
+                    fifos[key] = current_count
 
-            succcessor_node = fifos[key]["consumer"]
-            if succcessor_node is not None:
-                if succcessor_node not in ret:
-                    ret[succcessor_node] = {"inFIFODepth": 0, "outFIFODepth": 0}
-                in_depth = ret[succcessor_node]["inFIFODepth"]
-                ret[succcessor_node]["inFIFODepth"] = max(in_depth, fifos[key]["depth"])
-
-        # tweak and apply depths to original model
-        for node in orig_model.graph.node:
-            if node.name in ret:
-                depths = ret[node.name]
-                node = getCustomOp(node)
-                node.set_nodeattr("inFIFODepth", optimize_depth(depths["inFIFODepth"]))
-                node.set_nodeattr(
-                    "outFIFODepth", optimize_depth(depths["outFIFODepth"])
-                )
+            # since latency estimation is very pessimistic, detect first output
+            # and fast-forward the sim
+            if get_signal(sim, "tvalid") != 0 and not output_detected:
+                ncycles = max_cycles
+                output_detected = True
+            else:
+                ncycles = ncycles - 1
+
+        if not output_detected:
+            warnings.warn(
+                "No output detected, calculated FIFO depths may not be correct"
+            )
+
+        # Apply depths back into the model;
+        # also set in/outFIFODepth to zero for non-FIFO
+        # nodes, preventing further FIFO insertion
+        for node in model.graph.node:
+            # set FIFO depth, reset FIFO implementation,
+            # and set implementation/ram styles
+            if node.op_type == "StreamingFIFO":
+                assert node.name in fifos, "FIFO node not found in size dictionary"
+                # set depth of FIFO
+                depth = optimize_depth(fifos[node.name])
+                node_inst = getCustomOp(node)
+                node_inst.set_nodeattr("depth", depth)
+                # Set FIFO implementation/ram styles
+                if depth > self.max_qsrl_depth:
+                    node_inst.set_nodeattr("impl_style", "vivado")
+                    node_inst.set_nodeattr("ram_style", "auto")
+                else:
+                    node_inst.set_nodeattr("impl_style", "rtl")
+                # reset implementation
+                reset_implementation(node_inst)
+                del fifos[node.name]
+            else:
+                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
+                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
+                # for every FC node we changed from external to decoupled,
+                # change back and reset implementation
+                if node.op_type == "StreamingFCLayer_Batch":
+                    if node.name in modified_fc_nodes:
+                        node_inst = getCustomOp(node)
+                        node_inst.set_nodeattr("mem_mode", "external")
+                        reset_implementation(node_inst)
+                        modified_fc_nodes.remove(node.name)
+
+        assert (
+            len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
+        ), "FIFO/FC nodes left untouched after model reconfiguration"
+
+        # Remove FIFOs which have depth <= 2
+        shallow_fifos = []
+        # First, bypass them
+        for node in model.graph.node:
+            if (
+                node.op_type == "StreamingFIFO"
+                and getCustomOp(node).get_nodeattr("depth") <= 2
+            ):
+                shallow_fifos.append(node)
+                consumers = model.find_consumers(node.output[0])
+                if consumers is None:
+                    producer = model.find_producer(node.input[0])
+                    for idx, inp in enumerate(producer.output):
+                        if inp == node.input[0]:
+                            producer.output[idx] = node.output[0]
+                else:
+                    assert len(consumers) == 1, "Fanout detected from FIFO output"
+                    consumer = consumers[0]
+                    # set fifo input tensor as new input tensor of second node
+                    for idx, inp in enumerate(consumer.input):
+                        if inp == node.output[0]:
+                            consumer.input[idx] = node.input[0]
+        # now filter out
+        for node_to_remove in shallow_fifos:
+            model.graph.node.remove(node_to_remove)
 
-        return (orig_model, False)
+        return (model, False)
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index 3fe747a84985b2702ffb1e5855d9071362efebda..f849ee5267de1ddab96a948f8c3408c62957fd8a 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -86,21 +86,40 @@ def pyverilate_stitched_ip(model):
     def file_to_basename(x):
         return os.path.basename(os.path.realpath(x))
 
-    all_verilog_dirs = list(map(file_to_dir, all_verilog_srcs))
-    all_verilog_files = list(
-        set(
-            filter(
-                lambda x: x.endswith(".v"),
-                list(map(file_to_basename, all_verilog_srcs)),
-            )
-        )
-    )
-    top_module_name = model.get_metadata_prop("wrapper_filename")
-    top_module_name = file_to_basename(top_module_name).strip(".v")
+    top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename"))
+    top_module_name = top_module_file_name.strip(".v")
     build_dir = make_build_dir("pyverilator_ipstitched_")
+
+    # dump all Verilog code to a single file
+    # this is because large models with many files require
+    # a verilator command line too long for bash on most systems
+    # NOTE: there are duplicates in this list, and some files
+    # are identical but in multiple directories (regslice_core.v)
+
+    # remove duplicates from list by doing list -> set -> list
+    all_verilog_files = list(set(filter(lambda x: x.endswith(".v"), all_verilog_srcs)))
+
+    # remove all but one instance of regslice_core.v
+    filtered_verilog_files = []
+    remove_entry = False
+    for vfile in all_verilog_files:
+        if "regslice_core" in vfile:
+            if not remove_entry:
+                filtered_verilog_files.append(vfile)
+            remove_entry = True
+        else:
+            filtered_verilog_files.append(vfile)
+
+    # concatenate all verilog code into a single file
+    with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf:
+        for vfile in filtered_verilog_files:
+            with open(vfile) as rf:
+                wf.write("//Added from " + vfile + "\n\n")
+                wf.write(rf.read())
+
     sim = PyVerilator.build(
-        all_verilog_files,
-        verilog_path=all_verilog_dirs,
+        top_module_file_name,
+        verilog_path=[vivado_stitch_proj_dir],
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,