diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 736f95048a2aed024194d626d68038f9f49d7d5a..bc795aa922595a6c3fecb00844201a210cf1c89c 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -70,14 +70,87 @@ def optimize_depth(depth):
     if depth <= 2:
         return 2
     if depth <= 32:
+        # Q_srl FIFOs do not benefit from size < 32
+        # add some slack
         return 32
+    # round to nearest power of two for Vivado IP FIFO implementation
     return int(2 ** math.ceil(math.log2(depth)))
 
 
+class CapConvolutionFIFODepths(Transformation):
+    """Make the size of FIFOs for convolution layers smaller where possible.
+    Will be automatically called from InsertAndSetFIFODepths if the appropriate
+    constructor flag is set.
+
+    Constructor arguments:
+    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
+                       Verilog FIFOs (Q_srl.v)
+
+    Assumed input graph properties:
+    - all nodes are fpgadataflow nodes
+    - FIFOs inserted with InsertAndSetFIFODepths
+
+    Output:
+    - graph with smaller-depth FIFOs for convolutions
+
+    Background:
+    The simulation-based rtlsim_exec tends to overestimate the required depth
+    of FIFOs between the ConvolutionInputGenerator (here called SWG) and the
+    StreamingFCLayer (here called MVAU). As the SWG has an internal buffer of 1
+    image row, we use this as a rule of thumb to set FIFO depth to be no larger
+    than 1 row.
+    """
+
+    def __init__(self, max_qsrl_depth=256):
+        super().__init__()
+        self.max_qsrl_depth = max_qsrl_depth
+
+    def apply(self, model):
+        # TODO move this to own transformation
+        for node in model.graph.node:
+            # look for following pattern:
+            # ConvolutionInputGenerator -> StreamingFIFO -> StreamingFCLayer
+            if node.op_type == "StreamingFIFO":
+                fifo_prod = model.find_producer(node.input[0])
+                fifo_cons = model.find_consumer(node.output[0])
+                if fifo_prod is None:
+                    continue
+                if fifo_prod.op_type != "ConvolutionInputGenerator":
+                    continue
+                if fifo_cons is None:
+                    continue
+                if fifo_cons.op_type != "StreamingFCLayer_Batch":
+                    continue
+                op_inst = getCustomOp(node)
+                depth = op_inst.get_nodeattr("depth")
+                # SWG has an internal buffer of 1 row, so we use this as a
+                # rule of thumb to set FIFO depth to be no larger than 1 row
+                (bs, h, w, ifold, simd) = op_inst.get_folded_input_shape()
+                new_depth = optimize_depth(w * ifold)
+                new_depth = min(new_depth, depth)
+                op_inst.set_nodeattr("depth", new_depth)
+                # Set FIFO implementation/ram styles
+                if new_depth > self.max_qsrl_depth:
+                    op_inst.set_nodeattr("impl_style", "vivado")
+                    op_inst.set_nodeattr("ram_style", "auto")
+                else:
+                    op_inst.set_nodeattr("impl_style", "rtl")
+
+        return (model, False)
+
+
 class InsertAndSetFIFODepths(Transformation):
     """Insert appropriate-depth StreamingFIFOs through RTLSim that preserve
     throughput in the created accelerator.
 
+    Constructor arguments:
+    - clk_ns : clock period (used for IP preparation)
+    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
+                       Verilog FIFOs (Q_srl.v)
+    - max_depth : how deep the "max"-sized FIFOs initially inserted will be
+    - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
+                        smaller where appropriate
+
     Assumed input graph properties:
     - all nodes are fpgadataflow nodes
     - no FIFOs inserted,
@@ -86,7 +159,11 @@ class InsertAndSetFIFODepths(Transformation):
     Output:
     - graph with appropriate-depth FIFOs inserted
 
-    How it works:
+    Background:
+    Even with all FINN HLS fpgadatflow layers appropriately parallelized, it is
+    necessary to insert FIFOs between them to prevent stalls due to bursty
+    behavior. The sizes of those FIFOs are hard to predict analytically, so
+    we do the following:
     - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes
     - create stitched design
     - run through rtlsim with stream of multiple random input images (to fill pipeline)
@@ -95,12 +172,20 @@ class InsertAndSetFIFODepths(Transformation):
       and set inFIFODepth/outFIFODepth attrs to 0 on relevant nodes
     """
 
-    def __init__(self, fpgapart, clk_ns=10.0, max_qsrl_depth=256, max_depth=2 ** 14):
+    def __init__(
+        self,
+        fpgapart,
+        clk_ns=10.0,
+        max_qsrl_depth=256,
+        max_depth=2 ** 14,
+        swg_exception=True,
+    ):
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
         self.max_qsrl_depth = max_qsrl_depth
         self.max_depth = max_depth
+        self.swg_exception = swg_exception
 
     def apply(self, model):
         # change external to decoupled and warn user
@@ -256,7 +341,14 @@ class InsertAndSetFIFODepths(Transformation):
             len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
         ), "FIFO/FC nodes left untouched after model reconfiguration"
 
+        # handle custom sizing for SWG FIFOs if desired
+        if self.swg_exception:
+            model = model.transform(
+                CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth)
+            )
+
         # Remove FIFOs which have depth <= 2
+        # TODO move this to own transformation
         shallow_fifos = []
         # First, bypass them
         for node in model.graph.node: