diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index 9c18c03d7bdb8406d43aa8fc4efdb8a206b1217e..b3c669ec1097745bd30f650ca0b9dacda647c61d 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -65,7 +65,7 @@ RUN locale-gen "en_US.UTF-8"
 RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev
 RUN git clone https://github.com/verilator/verilator
 RUN cd verilator && \
-    git checkout v4.012 && \
+    git checkout v4.224 && \
     autoconf && \
     ./configure && \
     make -j4 && \
diff --git a/fetch-repos.sh b/fetch-repos.sh
index 10b6b332550be5d914d80e242f01e77daeaf08a0..36c9ae55780fe0f945f065d7a0214c683bf513a8 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,10 +27,10 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="92184fea2dd417bc7a53c82811fef271e4833c4c"
+QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6"
 FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366"
 BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
-PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2"
+PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
 HLSLIB_COMMIT="e7f2de91d1a2ddadaaea06b8f4c20e97a575470e"
 OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc"
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index b4e89628a44bb1f55c3445ee8e6866beada23585..2f3d813504b06c9f12875c5bad00ce99c0707d82 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -69,7 +69,7 @@
 `define Q_srl
 
 
-module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
+module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
 
    parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
    parameter width = 16;   // - width of data (i_d, o_d)
@@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    wire               o_b;	// - output stream back-pressure
 
    output [addrwidth:0] count;  // - output number of elems in queue
+   output [addrwidth:0] maxcount;  // - maximum observed count since reset
 
+   reg [addrwidth:0] maxcount_reg;  // - maximum count seen until now
    reg    [addrwidth-1:0] addr, addr_, a_;		// - SRL16 address
 							//     for data output
    reg 			  shift_en_;			// - SRL16 shift enable
@@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    assign o_d = srlo;				// - output data from queue
    assign o_v = o_v_reg;			// - output valid if non-empty
    assign i_b = i_b_reg;			// - input bp if full
+   assign maxcount = maxcount_reg;
 
    assign i_r = !i_b;
    assign o_b = !o_r;
@@ -140,6 +143,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
          addr_full <= 0;
 	 o_v_reg   <= 0;
 	 i_b_reg   <= 1;
+	 maxcount_reg <= 0;
       end
       else begin
 	 state     <= state_;
@@ -147,6 +151,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
          addr_full <= addr_full_;
 	 o_v_reg   <= o_v_reg_;
 	 i_b_reg   <= i_b_reg_;
+	 maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg);
       end
    end // always @ (posedge clock)
 
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index e16711f63b954707bc7ad9050dd7627ca1ce99c1..d842d89e234fd59f953a246293d271154d50954a 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -320,6 +320,10 @@ class DataflowBuildConfig:
     #: Override the number of inputs for rtlsim performance measurement.
     rtlsim_batch_size: Optional[int] = 1
 
+    #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during
+    #: rtlsim, otherwise they will be replaced by RTL (FIFO) / HLS (DWC) versions.
+    rtlsim_use_vivado_comps: Optional[bool] = True
+
     def _resolve_hls_clk_period(self):
         if self.hls_clk_period_ns is None:
             # use same clk for synth and hls if not explicitly specified
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 70ae70f4e1912aa0a31b0c00c7e41f35780e207f..8290621056f9e4531693a3266bfb633735a4db33 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -204,40 +204,44 @@ def verify_step(
 
 
 def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
-    need_restitch = False
-    # rtlsim only supports certain impl_style for some nodes
-    # StreamingFIFO must have impl_style=rtl
-    for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
-        inst = getCustomOp(fifo_layer)
-        if inst.get_nodeattr("impl_style") != "rtl":
-            inst.set_nodeattr("impl_style", "rtl")
-            inst.set_nodeattr("code_gen_dir_ipgen", "")
-            inst.set_nodeattr("ipgen_path", "")
-            need_restitch = True
-    # StreamingDataWidthConverter must have impl_style=hls
-    for dwc_layer in verify_model.get_nodes_by_op_type(
-        "StreamingDataWidthConverter_Batch"
-    ):
-        inst = getCustomOp(dwc_layer)
-        if inst.get_nodeattr("impl_style") != "hls":
-            inst.set_nodeattr("impl_style", "hls")
-            inst.set_nodeattr("code_gen_dir_ipgen", "")
-            inst.set_nodeattr("ipgen_path", "")
-            need_restitch = True
-    # if we've made alterations to the model, need to do some re-prep
-    if need_restitch:
-        print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
-        verify_model = verify_model.transform(
-            PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
-        )
-        verify_model = verify_model.transform(HLSSynthIP())
-        verify_model = verify_model.transform(
-            CreateStitchedIP(
-                cfg._resolve_fpga_part(),
-                cfg.synth_clk_period_ns,
-                vitis=False,
+    if not cfg.rtlsim_use_vivado_comps:
+        need_restitch = False
+        # switch impl_style=vivado components to rtl/hls
+        # StreamingFIFO must have impl_style=rtl
+        for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
+            inst = getCustomOp(fifo_layer)
+            if inst.get_nodeattr("impl_style") != "rtl":
+                inst.set_nodeattr("impl_style", "rtl")
+                inst.set_nodeattr("code_gen_dir_ipgen", "")
+                inst.set_nodeattr("ipgen_path", "")
+                need_restitch = True
+        # StreamingDataWidthConverter must have impl_style=hls
+        for dwc_layer in verify_model.get_nodes_by_op_type(
+            "StreamingDataWidthConverter_Batch"
+        ):
+            inst = getCustomOp(dwc_layer)
+            if inst.get_nodeattr("impl_style") != "hls":
+                inst.set_nodeattr("impl_style", "hls")
+                inst.set_nodeattr("code_gen_dir_ipgen", "")
+                inst.set_nodeattr("ipgen_path", "")
+                need_restitch = True
+        # if we've made alterations to the model, need to do some re-prep
+        if need_restitch:
+            print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
+            verify_model = verify_model.transform(
+                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
             )
-        )
+            verify_model = verify_model.transform(HLSSynthIP())
+            verify_model = verify_model.transform(
+                CreateStitchedIP(
+                    cfg._resolve_fpga_part(),
+                    cfg.synth_clk_period_ns,
+                    vitis=False,
+                )
+            )
+    else:
+        print("rtlsim_use_vivado_comps is enabled, may yield incorrect results")
+
     # set top-level prop for stitched-ip rtlsim and launch
     verify_model.set_metadata_prop("exec_mode", "rtlsim")
     # TODO make configurable
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index a7c3cd0be59db4ba8665f8fba5be72282339b8c8..a0346f50bf6b7e88a79ba5ef4700039eb39c32ef 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -68,6 +68,8 @@ class StreamingFIFO(HLSCustomOp):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
+            # whether depth monitoring is enabled (impl_style=rtl only)
+            "depth_monitor": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
 
@@ -97,6 +99,14 @@ class StreamingFIFO(HLSCustomOp):
     def verify_node(self):
         pass
 
+    def get_verilog_top_module_intf_names(self):  # adds maxcount when monitoring
+        ret = super().get_verilog_top_module_intf_names()  # parent's intf names
+        is_rtl = self.get_nodeattr("impl_style") == "rtl"
+        is_depth_monitor = self.get_nodeattr("depth_monitor") == 1
+        if is_rtl and is_depth_monitor:  # both conditions are required
+            ret["ap_none"] = ["maxcount"]  # replaces any existing "ap_none" list
+        return ret
+
     def get_verilog_top_module_name(self):
         "Return the Verilog top module name for this node."
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -319,6 +319,7 @@ module $TOPNAME$(
 ap_clk,
 ap_rst_n,
 count,
+maxcount,
 in0_$HLS_SNAME$_TDATA,
 in0_$HLS_SNAME$_TVALID,
 in0_$HLS_SNAME$_TREADY,
@@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY
 input   ap_clk;
 input   ap_rst_n;
 output $COUNT_RANGE$ count;
+output $COUNT_RANGE$ maxcount;
 input  $IN_RANGE$ in0_$HLS_SNAME$_TDATA;
 input   in0_$HLS_SNAME$_TVALID;
 output   in0_$HLS_SNAME$_TREADY;
@@ -346,6 +348,7 @@ $LAYER_NAME$
  .clock(ap_clk),
  .reset(!ap_rst_n),
  .count(count),
+ .maxcount(maxcount),
  .i_d(in0_$HLS_SNAME$_TDATA),
  .i_v(in0_$HLS_SNAME$_TVALID),
  .i_r(in0_$HLS_SNAME$_TREADY),
diff --git a/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh
new file mode 100644
index 0000000000000000000000000000000000000000..1c8b6403e8628e3647810ca5fca65ca1122eaf9d
--- /dev/null
+++ b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh
@@ -0,0 +1,346 @@
+//  (c) Copyright 2011-2013 Xilinx, Inc. All rights reserved.
+//
+//  This file contains confidential and proprietary information
+//  of Xilinx, Inc. and is protected under U.S. and
+//  international copyright and other intellectual property
+//  laws.
+//
+//  DISCLAIMER
+//  This disclaimer is not a license and does not grant any
+//  rights to the materials distributed herewith. Except as
+//  otherwise provided in a valid license issued to you by
+//  Xilinx, and to the maximum extent permitted by applicable
+//  law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+//  WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+//  AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+//  BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+//  INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+//  (2) Xilinx shall not be liable (whether in contract or tort,
+//  including negligence, or under any other theory of
+//  liability) for any loss or damage of any kind or nature
+//  related to, arising under or in connection with these
+//  materials, including for any direct, or any indirect,
+//  special, incidental, or consequential loss or damage
+//  (including loss of data, profits, goodwill, or any type of
+//  loss or damage suffered as a result of any action brought
+//  by a third party) even if such damage or loss was
+//  reasonably foreseeable or Xilinx had been advised of the
+//  possibility of the same.
+//
+//  CRITICAL APPLICATIONS
+//  Xilinx products are not designed or intended to be fail-
+//  safe, or for use in any application requiring fail-safe
+//  performance, such as life-support or safety devices or
+//  systems, Class III medical devices, nuclear facilities,
+//  applications related to the deployment of airbags, or any
+//  other applications that could lead to death, personal
+//  injury, or severe property or environmental damage
+//  (individually and collectively, "Critical
+//  Applications"). Customer assumes the sole risk and
+//  liability of any use of Xilinx products in Critical
+//  Applications, subject only to applicable laws and
+//  regulations governing limitations on product liability.
+//
+//  THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
+//  PART OF THIS FILE AT ALL TIMES.
+//-----------------------------------------------------------------------------
+//
+// Generic Functions used by AXIS-Interconnect and Infrastructure Modules
+//
+// Verilog-standard:  Verilog 2001
+//--------------------------------------------------------------------------
+// Global Parameters:
+//
+// Functions:
+//   f_clogb2
+//   f_gcd
+//   f_lcm
+//   f_get_tdata_indx
+//   f_get_tstrb_indx
+//   f_get_tkeep_indx
+//   f_get_tlast_indx
+//   f_get_tid_indx
+//   f_get_tdest_indx
+//   f_get_tuser_indx
+//   f_payload_width
+// Tasks:
+//   t_display_tdata_error
+//--------------------------------------------------------------------------
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Global Parameters
+///////////////////////////////////////////////////////////////////////////////
+// Define Signal Set indices
+localparam G_INDX_SS_TREADY = 0;
+localparam G_INDX_SS_TDATA  = 1;
+localparam G_INDX_SS_TSTRB  = 2;
+localparam G_INDX_SS_TKEEP  = 3;
+localparam G_INDX_SS_TLAST  = 4;
+localparam G_INDX_SS_TID    = 5;
+localparam G_INDX_SS_TDEST  = 6;
+localparam G_INDX_SS_TUSER  = 7;
+localparam G_MASK_SS_TREADY = 32'h1 << G_INDX_SS_TREADY;
+localparam G_MASK_SS_TDATA  = 32'h1 << G_INDX_SS_TDATA;
+localparam G_MASK_SS_TSTRB  = 32'h1 << G_INDX_SS_TSTRB;
+localparam G_MASK_SS_TKEEP  = 32'h1 << G_INDX_SS_TKEEP;
+localparam G_MASK_SS_TLAST  = 32'h1 << G_INDX_SS_TLAST;
+localparam G_MASK_SS_TID    = 32'h1 << G_INDX_SS_TID  ;
+localparam G_MASK_SS_TDEST  = 32'h1 << G_INDX_SS_TDEST;
+localparam G_MASK_SS_TUSER  = 32'h1 << G_INDX_SS_TUSER;
+
+// Task DRC error levels
+localparam G_TASK_SEVERITY_ERR   = 2;
+localparam G_TASK_SEVERITY_WARNING = 1;
+localparam G_TASK_SEVERITY_INFO    = 0;
+
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Functions
+///////////////////////////////////////////////////////////////////////////////
+// ceiling logb2: smallest n >= 1 with 2**n >= size (returns 1 for size <= 2)
+  function integer f_clogb2 (input integer size);
+    integer s;                    // working copy of size
+    begin
+      s = size;
+      s = s - 1;                  // use size-1 so exact powers of two do not round up
+      for (f_clogb2=1; s>1; f_clogb2=f_clogb2+1)  // result starts at 1, +1 per shift
+            s = s >> 1;
+    end
+  endfunction // clogb2
+
+  // Calculates the Greatest Common Divisor of two integers using the iterative,
+  // subtraction-based Euclidean algorithm (loop only, no recursive call).
+  function automatic integer f_gcd (
+    input integer a,
+    input integer b
+    );
+    begin : main
+      integer A, B, done, swap;  // A, B: working copies; done: loop exit flag
+      A = a;
+      B = b;
+      done = 0;
+      while(!done)
+      begin
+        if (A < B ) begin        // keep the larger value in A
+          swap = A;
+          A = B;
+          B = swap;
+        end else if ( B != 0 ) begin
+          A = A - B;             // A >= B here: reduce A by B
+        end else begin
+          done = 1;              // B == 0: A holds the result
+        end                      // NOTE(review): seems to assume a, b >= 0; a negative B never reaches 0
+      end
+
+      f_gcd = A;
+    end
+  endfunction
+
+
+  // Calculates the Least Common Multiple of two integers
+  function integer f_lcm (
+    input integer a,
+    input integer b
+    );
+    begin : main
+      f_lcm = ( a / f_gcd(a, b)) * b;
+    end
+  endfunction
+
+  // Returns back the index to the TDATA portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tdata_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      f_get_tdata_indx = 0;
+    end
+  endfunction
+
+  // Returns back the index to the tstrb portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tstrb_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tdata_indx(DAW, IDW, DEW, USW, SST);
+      // If TDATA exists, then add its width to its base to get the tstrb index
+      f_get_tstrb_indx = SST[G_INDX_SS_TDATA] ? cur_indx + DAW : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tkeep portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tkeep_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tstrb_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tkeep_indx = SST[G_INDX_SS_TSTRB] ? cur_indx + DAW/8 : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tlast portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tlast_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tkeep_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tlast_indx = SST[G_INDX_SS_TKEEP] ? cur_indx + DAW/8 : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tid portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tid_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tlast_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tid_indx = SST[G_INDX_SS_TLAST] ? cur_indx + 1 : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tdest portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tdest_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tid_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tdest_indx = SST[G_INDX_SS_TID] ? cur_indx + IDW : cur_indx;
+    end
+  endfunction
+
+  // Returns back the index to the tuser portion of TPAYLOAD, returns 0 if the
+  // signal is not enabled.
+  function integer f_get_tuser_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tdest_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tuser_indx = SST[G_INDX_SS_TDEST] ? cur_indx + DEW : cur_indx;
+    end
+  endfunction
+
+  // Payload is the sum of all the AXIS signals present except for
+  // TREADY/TVALID
+  function integer f_payload_width (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tuser_indx(DAW, IDW, DEW, USW, SST);
+      f_payload_width = SST[G_INDX_SS_TUSER] ? cur_indx + USW : cur_indx;
+      // Ensure that the return value is never less than 1
+      f_payload_width = (f_payload_width < 1) ? 1 : f_payload_width;
+    end
+  endfunction
+
+  task t_check_tdata_width(
+    input  integer    data_width,
+    input  [8*80-1:0] var_name,
+    input  [8*80-1:0] inst_name,
+    input  integer    severity_lvl,
+    output integer    ret_val
+  );
+    // Severity levels:
+    // 0 = INFO
+    // 1 = WARNING
+    // 2 = ERROR
+    begin : t_check_tdata_width
+      if (data_width%8 != 0) begin
+        //       000       1          2         3         4         5         6         7         8
+        //       012       0          0         0         0         0         0         0         0
+        if (severity_lvl >= 2) begin
+        $display("ERROR: %m::%s", inst_name);
+        end else if (severity_lvl == 1) begin
+        $display("WARNING: %m::%s", inst_name);
+        end else begin
+        $display("INFO: %m::%s", inst_name);
+        end
+        $display("       Parameter %s (%2d) must be a multiple of 8.", var_name, data_width);
+        $display("       AXI4-Stream data width is only defined for byte multiples. See the ");
+        $display("       AMBA4 AXI4-Stream Protocol Specification v1.0 Section 2.1 for more");
+        $display("       information.");
+        ret_val = 1;
+      end else begin
+        ret_val = 0;
+      end
+    end
+  endtask
+
+  task t_check_tuser_width(
+    input  integer    tuser_width,
+    input  [8*80-1:0] tuser_name,
+    input  integer    tdata_width,
+    input  [8*80-1:0] tdata_name,
+    input  [8*80-1:0] inst_name,
+    input  integer    severity_lvl,
+    output integer    ret_val
+  );
+    // Severity levels:
+    // 0 = INFO
+    // 1 = WARNING
+    // 2 = ERROR
+    begin : t_check_tuser_width
+      integer tdata_bytes;
+      tdata_bytes = tdata_width/8;
+      if ((tuser_width%tdata_bytes) != 0) begin
+        //       000       1          2         3         4         5         6         7         8
+        //       012       0          0         0         0         0         0         0         0
+        if (severity_lvl >= 2) begin
+        $display("ERROR: %m::%s", inst_name);
+        end else if (severity_lvl == 1) begin
+        $display("WARNING: %m::%s", inst_name);
+        end else begin
+        $display("INFO: %m::%s", inst_name);
+        end
+        $display("       Parameter %s == %2d is not the recommended value of 'an integer ", tuser_name, tuser_width);
+        $display("       multiple of the width of the interface (%s == %2d) in bytes.'  AXI4-Stream", tdata_name, tdata_width);
+        $display("       TUSER width in this module is only defined when the TUSER is the");
+        $display("       recommended value.  See the AMBA4 AXI4-Stream Protocol Specification v1.0");
+        $display("       Section 2.1, 2.3.3 and 2.8 for more information.  ");
+        ret_val = 1;
+      end else begin
+        ret_val = 0;
+      end
+    end
+  endtask
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..efc179923545eb06e4d173c683b0941887f8bb79 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -81,6 +81,12 @@ class InsertDWC(Transformation):
                             dwc_in_width = n0.get_outstream_width()
                             # determine dwc outwidth
                             dwc_out_width = n1.get_instream_width()
+                            larger_width = max(dwc_in_width, dwc_out_width)
+                            smaller_width = min(dwc_in_width, dwc_out_width)
+                            if larger_width % smaller_width == 0:
+                                impl_style = "hls"
+                            else:
+                                impl_style = "vivado"
 
                             # determine shape for dwc
                             dwc_shape = n0.get_normal_output_shape()
@@ -105,6 +111,7 @@ class InsertDWC(Transformation):
                                 inWidth=dwc_in_width,
                                 outWidth=dwc_out_width,
                                 dataType=str(dtype.name),
+                                impl_style=impl_style,
                             )
                             # insert dwc
                             graph.node.insert(node_ind + 1, dwc_node)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..90ea853b6072b145df64a8a73ee93c65989fe447 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -192,10 +192,11 @@ class InsertAndSetFIFODepths(Transformation):
     - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
                        Verilog FIFOs (Q_srl.v)
     - max_depth : how deep the "max"-sized FIFOs initially inserted will be
+                   if set to None, use the tensor size as the depth
     - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
                         smaller where appropriate
     - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
-                          large FIFOs implemented by Vivado
+                          large FIFOs implemented by Vivado afterwards
 
     Assumed input graph properties:
     - all nodes are fpgadataflow nodes
@@ -210,7 +211,7 @@ class InsertAndSetFIFODepths(Transformation):
     necessary to insert FIFOs between them to prevent stalls due to bursty
     behavior. The sizes of those FIFOs are hard to predict analytically, so
     we do the following:
-    - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes
+    - insert deep (=tensor size) FIFOs between all fpgadataflow nodes
     - create stitched design
     - run through rtlsim with stream of multiple random input images (to fill pipeline)
     - keep track of observed maximum occupancy for each FIFO during rtlsim
@@ -223,7 +224,7 @@ class InsertAndSetFIFODepths(Transformation):
         fpgapart,
         clk_ns=10.0,
         max_qsrl_depth=256,
-        max_depth=2**14,
+        max_depth=None,
         swg_exception=True,
         vivado_ram_style="auto",
     ):
@@ -236,6 +237,9 @@ class InsertAndSetFIFODepths(Transformation):
         self.vivado_ram_style = vivado_ram_style
 
     def apply(self, model):
+        # these optypes may potentially use external weights
+        # we'll temporarily change them to use decoupled mode for FIFO sizing
+        extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"]
         # change external to decoupled and warn user
         # this way we are sure we have exactly one input/output
         modified_fc_nodes = []
@@ -246,9 +250,15 @@ class InsertAndSetFIFODepths(Transformation):
             )
             assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
             node = getCustomOp(node)
-            node.set_nodeattr("inFIFODepth", self.max_depth)
-            node.set_nodeattr("outFIFODepth", self.max_depth)
-            if node.onnx_node.op_type == "MatrixVectorActivation":
+            if self.max_depth is not None:
+                node.set_nodeattr("inFIFODepth", self.max_depth)
+                node.set_nodeattr("outFIFODepth", self.max_depth)
+            else:
+                i_depth = np.prod(node.get_folded_input_shape()[:-1])
+                o_depth = np.prod(node.get_folded_output_shape()[:-1])
+                node.set_nodeattr("inFIFODepth", i_depth)
+                node.set_nodeattr("outFIFODepth", o_depth)
+            if node.onnx_node.op_type in extw_optypes:
                 mmode = node.get_nodeattr("mem_mode")
                 if mmode == "external":
                     modified_fc_nodes.append(node.onnx_node.name)
@@ -267,13 +277,17 @@ class InsertAndSetFIFODepths(Transformation):
 
         # gather FIFO names, check they are of expected depth
         fifos = {}
-        for node in model.graph.node:
-            if node.op_type == "StreamingFIFO":
-                fifos[node.name] = 0
-                node = getCustomOp(node)
-                # check depths and fix as necessary
-                if node.get_nodeattr("depth") != self.max_depth:
-                    node.set_nodeattr("depth", self.max_depth)
+        fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+        for node in fifo_nodes:
+            fifos[node.name] = 0
+            node = getCustomOp(node)
+            node.set_nodeattr("depth_monitor", 1)
+            node.set_nodeattr("impl_style", "rtl")
+            # check depths and fix as necessary
+            if (self.max_depth is not None) and (
+                node.get_nodeattr("depth") != self.max_depth
+            ):
+                node.set_nodeattr("depth", self.max_depth)
 
         # insert FIFOs and do all transformations for RTLsim
         model = model.transform(AnnotateCycles())
@@ -324,21 +338,6 @@ class InsertAndSetFIFODepths(Transformation):
             else:
                 set_signal(sim, "tvalid", 0)
 
-            # check/update all fifo counts
-            for key in fifos:
-                current_state = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["state"]
-                current_addr = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["addr"]
-                if current_state == 2:
-                    current_count = current_addr + 2
-                else:
-                    current_count = current_state
-                if current_count > fifos[key]:
-                    fifos[key] = current_count
-
             # since latency estimation is very pessimistic, detect first output
             # and fast-forward the sim
             if get_signal(sim, "tvalid") != 0 and not output_detected:
@@ -352,6 +351,12 @@ class InsertAndSetFIFODepths(Transformation):
                 "No output detected, calculated FIFO depths may not be correct"
             )
 
+        for ind, node in enumerate(fifo_nodes):
+            maxcount_name = "maxcount_%d" % ind
+            if ind == 0:
+                maxcount_name = "maxcount"
+            fifos[node.name] = sim[maxcount_name]
+
         # Apply depths back into the model;
         # also set in/outFIFODepth to zero for non-FIFO
         # nodes, preventing further FIFO insertion
@@ -364,6 +369,7 @@ class InsertAndSetFIFODepths(Transformation):
                 depth = optimize_depth(fifos[node.name])
                 node_inst = getCustomOp(node)
                 node_inst.set_nodeattr("depth", depth)
+                node_inst.set_nodeattr("depth_monitor", 0)
                 # Set FIFO implementation/ram styles
                 if depth > self.max_qsrl_depth:
                     node_inst.set_nodeattr("impl_style", "vivado")
@@ -376,9 +382,9 @@ class InsertAndSetFIFODepths(Transformation):
             else:
                 getCustomOp(node).set_nodeattr("inFIFODepth", 0)
                 getCustomOp(node).set_nodeattr("outFIFODepth", 0)
-                # for every FC node we changed from external to decoupled,
+                # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
-                if node.op_type == "MatrixVectorActivation":
+                if node.op_type in extw_optypes:
                     if node.name in modified_fc_nodes:
                         node_inst = getCustomOp(node)
                         node_inst.set_nodeattr("mem_mode", "external")
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index f6a51da8e44ea60ae5693cdd033b39bdf51376ac..d7ed3e261fe024b7f054382f12184628d3f3e94c 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -26,7 +26,10 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pkg_resources as pk
+
 import os
+import shutil
 from pyverilator import PyVerilator
 
 from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
@@ -74,14 +77,35 @@ def pyverilate_stitched_ip(
     # are identical but in multiple directories (regslice_core.v)
 
     # remove duplicates from list by doing list -> set -> list
-    all_verilog_files = list(
-        set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs))
+    src_exts = [".v", ".sv"]
+
+    all_verilog_src_files = list(
+        set(
+            filter(
+                lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs
+            )
+        )
+    )
+
+    verilog_header_dir = make_build_dir("pyverilator_vh_")
+    # use custom version of axis infrastructure vh
+    # to enable Verilator to simulate AMD/Xilinx components (e.g. DWC)
+    custom_vh = pk.resource_filename(
+        "finn.qnn-data", "verilog/custom_axis_infrastructure.vh"
     )
+    shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh")
+    for fn in all_verilog_srcs:
+        if fn.endswith(".vh"):
+            if "axis_infrastructure_v1_1_0.vh" in fn:
+                # skip, we use a custom version for this file without recursive gcd
+                continue
+            else:
+                shutil.copy(fn, verilog_header_dir)
 
     # remove all but one instances of regslice_core.v
     filtered_verilog_files = []
     remove_entry = False
-    for vfile in all_verilog_files:
+    for vfile in all_verilog_src_files:
         if "regslice_core" in vfile:
             if not remove_entry:
                 filtered_verilog_files.append(vfile)
@@ -94,7 +118,12 @@ def pyverilate_stitched_ip(
         for vfile in filtered_verilog_files:
             with open(vfile) as rf:
                 wf.write("//Added from " + vfile + "\n\n")
-                wf.write(rf.read())
+                lines = rf.read()
+                for line in lines.split("\n"):
+                    # break down too-long lines, Verilator complains otherwise
+                    if len(line) > 20000:
+                        line = line.replace("&", "\n&")
+                    wf.write("\n" + line)
 
     verilator_args = []
     # disable common verilator warnings that should be harmless but commonly occur
@@ -108,10 +137,20 @@ def pyverilate_stitched_ip(
     # force inlining of all submodules to ensure we can read internal signals properly
     if read_internal_signals:
         verilator_args += ["--inline-mult", "0"]
+    # add defines to make certain XPM src files work with Verilator
+    verilator_args.append("-DDISABLE_XPM_ASSERTIONS")
+    verilator_args.append("-DOBSOLETE")
+    verilator_args.append("-DONESPIN")
+    verilator_args.append("--bbox-unsup")
+    vivado_path = os.environ["VIVADO_PATH"]
+    # additional SystemVerilog modules to make XPMs work with Verilator
+    xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv"
+    xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv"
+    xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv"
 
     sim = PyVerilator.build(
-        top_module_file_name,
-        verilog_path=[vivado_stitch_proj_dir],
+        [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc],
+        verilog_path=[vivado_stitch_proj_dir, verilog_header_dir],
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,