diff --git a/custom_hls/lookup.hpp b/custom_hls/lookup.hpp
index 3001f6613ec6ed9a9e5f47d9be356d4b032f7192..037b038a09a10ff2bd066740d20f0b47489e24e4 100644
--- a/custom_hls/lookup.hpp
+++ b/custom_hls/lookup.hpp
@@ -26,14 +26,15 @@
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- *******************************************************************************/
+*******************************************************************************/
+#ifndef LOOKUP_HPP
+#define LOOKUP_HPP
 
 #include <ap_int.h>
 #include <hls_stream.h>
 
-#ifndef LOOKUP_HPP
-#define LOOKUP_HPP
+#include "utils.hpp"
+
 
 template <
     unsigned NumEmbeddings,
@@ -57,4 +58,50 @@ void StreamingLookup(
     }
 }
 
+/**
+ * Lookup implementation over a table stored in AXI-accessible memory: if an index is pending on in0, it is consumed and the EmbeddingSize words of its entry are copied from mem to out.
+ */
+template <
+	unsigned  EmbeddingSize,                            // Number of memory words per embedding
+	unsigned  EmbeddingAlign = clog2(EmbeddingSize),    // Alignment of entries = number of word index bits
+	typename  T_SRC,                                    // Type of the lookup indices read from in0
+	typename  T_DST                                     // Type of the memory words streamed to out
+>
+void StreamingLookup_ext(
+	hls::stream<T_SRC> &in0,        // stream of lookup indices; only read when non-empty
+	hls::stream<T_DST> &out,        // receives EmbeddingSize words per consumed index
+	T_DST const *const  mem,        // base of the lookup table, indexed in units of T_DST words
+	unsigned  const     size,       // bound on valid indices: inputs comparing >= T_SRC(size) count as out-of-bounds
+	unsigned           &oob_count,  // in/out: compared against the last latched value on entry, overwritten with the internal counter on exit
+	bool               &oob_irq     // out: true while the internal out-of-bounds counter is nonzero
+) {
+#pragma HLS pipeline II=EmbeddingSize+9 style=flp
+
+	static unsigned  oob_count_li;   // value of oob_count latched by the check below (state kept across calls)
+	static unsigned  oob_count_int;  // internal out-of-bounds counter (state kept across calls)
+#pragma HLS reset variable=oob_count_li
+#pragma HLS reset variable=oob_count_int
+
+	if(oob_count != oob_count_li) {  // oob_count differs from the value latched last time
+		oob_count_int -= oob_count_li;  // NOTE(review): subtracts the previously latched value, not the newly observed one - confirm intended
+		oob_count_li   = oob_count;     // latch the newly observed value
+	}
+	if(!in0.empty()) {  // non-blocking: skip the lookup when no index is pending
+		T_SRC const  x = in0.read();
+
+		// Map out-of-bounds inputs to an offset of zero and increment counter
+		bool  const  oob = x >= T_SRC(size);  // NOTE(review): size is converted to T_SRC before comparing - confirm it always fits T_SRC
+		ap_uint<T_SRC::width+EmbeddingAlign> const  ofs =
+			((oob? T_SRC(0) : x), ap_uint<EmbeddingAlign>(0));  // index followed by EmbeddingAlign zero bits (presumably ap_int concatenation - confirm)
+		oob_count_int += oob;
+
+		// Stream lookup data (burst inferred)
+		for(unsigned  i = 0; i < EmbeddingSize; i++) {
+#pragma HLS pipeline II=1 style=flp
+			out.write(mem[ofs+i]);
+		}
+	}
+	oob_count =  oob_count_int;         // publish the internal counter on every call
+	oob_irq   = (oob_count_int != 0);   // flag stays raised while the counter is nonzero
+}
 #endif
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index b4e89628a44bb1f55c3445ee8e6866beada23585..3c884770e026ec90b16dfd562e1861d132e714bd 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -69,7 +69,7 @@
 `define Q_srl
 
 
-module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
+module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
 
    parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
    parameter width = 16;   // - width of data (i_d, o_d)
@@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    wire               o_b;	// - output stream back-pressure
 
    output [addrwidth:0] count;  // - output number of elems in queue
+   output [addrwidth:0] maxcount;  // - maximum observed count since reset
 
+   reg [addrwidth:0] maxcount_reg;  // - maximum count seen until now
    reg    [addrwidth-1:0] addr, addr_, a_;		// - SRL16 address
 							//     for data output
    reg 			  shift_en_;			// - SRL16 shift enable
@@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    assign o_d = srlo;				// - output data from queue
    assign o_v = o_v_reg;			// - output valid if non-empty
    assign i_b = i_b_reg;			// - input bp if full
+   assign maxcount = maxcount_reg;
 
    assign i_r = !i_b;
    assign o_b = !o_r;
@@ -140,6 +143,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
          addr_full <= 0;
 	 o_v_reg   <= 0;
 	 i_b_reg   <= 1;
+	 maxcount_reg <= '0;
       end
       else begin
 	 state     <= state_;
@@ -147,6 +151,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
          addr_full <= addr_full_;
 	 o_v_reg   <= o_v_reg_;
 	 i_b_reg   <= i_b_reg_;
+	 maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg);
       end
    end // always @ (posedge clock)
 
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 2ee436aae9e6800609bd873925e612b8a5f55954..79ae6957564ce07c18b76552089f64107fe51356 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -150,6 +150,7 @@ class HLSCustomOp(CustomOp):
         intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())]
         intf_names["aximm"] = []
         intf_names["axilite"] = []
+        intf_names["ap_none"] = []
         return intf_names
 
     def get_verilog_top_filename(self):
diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py
index d90fa0f05ab2a92391f610ae1c4516a95a881ce4..613a91b6284e0789dff2446e1615690a03336d99 100644
--- a/src/finn/custom_op/fpgadataflow/lookup.py
+++ b/src/finn/custom_op/fpgadataflow/lookup.py
@@ -159,8 +159,8 @@ class Lookup(HLSCustomOp):
     def global_includes(self):
         mem_mode = self.get_nodeattr("mem_mode")
         global_incls = []
+        global_incls.append('#include "lookup.hpp"')
         if mem_mode == "const":
-            global_incls.append('#include "lookup.hpp"')
             global_incls.append('#include "embeddings.hpp"')
         self.code_gen_dict["$GLOBALS$"] = global_incls
 
@@ -258,17 +258,10 @@ class Lookup(HLSCustomOp):
                 InputType, EmbeddingType >(in0, out, embeddings);"""
             ]
         elif mem_mode == "external":
-            hls_impl = """
-    if(!in0.empty()) {
-        ap_uint<T_SRC::width+EmbeddingAlign> const  base =
-            (in0.read(), ap_uint<EmbeddingAlign>(0));
-        for(unsigned  j = 0; j < EmbeddingSize; j++) {
-#pragma HLS PIPELINE II=1
-            out.write(mem[base+j]);
-        }
-    }
-            """
-            self.code_gen_dict["$DOCOMPUTE$"] = [hls_impl]
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """StreamingLookup_ext<EmbeddingSize>(in0, out, mem, size, oob_count,
+                oob_irq);"""
+            ]
 
     def blackboxfunction(self):
         mem_mode = self.get_nodeattr("mem_mode")
@@ -286,7 +279,8 @@ class Lookup(HLSCustomOp):
                 "void "
                 + self.onnx_node.name
                 + "(hls::stream<T_SRC> &in0, hls::stream<T_DST> &out, "
-                + "T_DST const *const  mem)"
+                + "T_DST const *const  mem, unsigned const size, "
+                + "unsigned &oob_count, bool &oob_irq)"
             ]
 
     def pragmas(self):
@@ -305,6 +299,13 @@ class Lookup(HLSCustomOp):
         elif mem_mode == "external":
             my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem")
             my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control")
+            my_pragmas.append(
+                "#pragma HLS INTERFACE s_axilite port=size bundle=control"
+            )
+            my_pragmas.append(
+                "#pragma HLS INTERFACE s_axilite port=oob_count bundle=control"
+            )
+            my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq")
         else:
             raise Exception("Unrecognized mem_mode: " + mem_mode)
         self.code_gen_dict["$PRAGMAS$"] = my_pragmas
@@ -475,4 +476,5 @@ class Lookup(HLSCustomOp):
         if mem_mode == "external":
             intf_names["axilite"] = ["s_axi_control"]
             intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))]
+            intf_names["ap_none"] = ["oob_irq"]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 06b557982daea58ae753c843843ab31a953d07e2..f24cdcb932b7df6095ffdee9275170cfec85b008 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -68,6 +68,8 @@ class StreamingFIFO(HLSCustomOp):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
+            # whether depth monitoring is enabled (impl_style=rtl only)
+            "depth_monitor": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
 
@@ -113,6 +115,14 @@ class StreamingFIFO(HLSCustomOp):
     def verify_node(self):
         pass
 
+    def get_verilog_top_module_intf_names(self):
+        ret = super().get_verilog_top_module_intf_names()
+        is_rtl = self.get_nodeattr("impl_style") == "rtl"
+        is_depth_monitor = self.get_nodeattr("depth_monitor") == 1
+        if is_rtl and is_depth_monitor:
+            ret["ap_none"] = ["maxcount"]
+        return ret
+
     def get_verilog_top_module_name(self):
         "Return the Verilog top module name for this node."
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -319,6 +319,7 @@ module $TOPNAME$(
 ap_clk,
 ap_rst_n,
 count,
+maxcount,
 in0_$HLS_SNAME$_TDATA,
 in0_$HLS_SNAME$_TVALID,
 in0_$HLS_SNAME$_TREADY,
@@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY
 input   ap_clk;
 input   ap_rst_n;
 output $COUNT_RANGE$ count;
+output $COUNT_RANGE$ maxcount;
 input  $IN_RANGE$ in0_$HLS_SNAME$_TDATA;
 input   in0_$HLS_SNAME$_TVALID;
 output   in0_$HLS_SNAME$_TREADY;
@@ -346,6 +348,7 @@ $LAYER_NAME$
  .clock(ap_clk),
  .reset(!ap_rst_n),
  .count(count),
+ .maxcount(maxcount),
  .i_d(in0_$HLS_SNAME$_TDATA),
  .i_v(in0_$HLS_SNAME$_TVALID),
  .i_r(in0_$HLS_SNAME$_TREADY),
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 1287f677eebfd360cfe0bbac0710f82a7906f308..cfb44c0a9161bc6220a426a307a2d3393dbb7cb2 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -228,6 +228,22 @@ class CreateStitchedIP(Transformation):
             )
             self.s_axis_idx += 1
 
+    def connect_ap_none_external(self, node):
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"]
+        # make external
+        for i in range(len(input_intf_names)):
+            input_intf_name = input_intf_names[i]
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, input_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name %s [get_bd_ports %s_0]"
+                % (input_intf_name, input_intf_name)
+            )
+
     def insert_signature(self, checksum_count):
         signature_vlnv = "AMD:user:axi_info_top:1.0"
         signature_name = "axi_info_top0"
@@ -305,6 +321,7 @@ class CreateStitchedIP(Transformation):
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
             self.connect_clk_rst(node)
+            self.connect_ap_none_external(node)
             self.connect_axi(node)
             for i in range(len(node.input)):
                 if not is_external_input(model, node, i):
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..efc179923545eb06e4d173c683b0941887f8bb79 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -81,6 +81,12 @@ class InsertDWC(Transformation):
                             dwc_in_width = n0.get_outstream_width()
                             # determine dwc outwidth
                             dwc_out_width = n1.get_instream_width()
+                            larger_width = max(dwc_in_width, dwc_out_width)
+                            smaller_width = min(dwc_in_width, dwc_out_width)
+                            if larger_width % smaller_width == 0:
+                                impl_style = "hls"
+                            else:
+                                impl_style = "vivado"
 
                             # determine shape for dwc
                             dwc_shape = n0.get_normal_output_shape()
@@ -105,6 +111,7 @@ class InsertDWC(Transformation):
                                 inWidth=dwc_in_width,
                                 outWidth=dwc_out_width,
                                 dataType=str(dtype.name),
+                                impl_style=impl_style,
                             )
                             # insert dwc
                             graph.node.insert(node_ind + 1, dwc_node)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..90ea853b6072b145df64a8a73ee93c65989fe447 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -192,10 +192,11 @@ class InsertAndSetFIFODepths(Transformation):
     - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
                        Verilog FIFOs (Q_srl.v)
     - max_depth : how deep the "max"-sized FIFOs initially inserted will be
+                   (if None: product of the folded tensor shape, excluding its last dim)
     - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
                         smaller where appropriate
     - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
-                          large FIFOs implemented by Vivado
+                          large FIFOs implemented by Vivado afterwards
 
     Assumed input graph properties:
     - all nodes are fpgadataflow nodes
@@ -210,7 +211,7 @@ class InsertAndSetFIFODepths(Transformation):
     necessary to insert FIFOs between them to prevent stalls due to bursty
     behavior. The sizes of those FIFOs are hard to predict analytically, so
     we do the following:
-    - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes
+    - insert deep (=tensor size) FIFOs between all fpgadataflow nodes
     - create stitched design
     - run through rtlsim with stream of multiple random input images (to fill pipeline)
     - keep track of observed maximum occupancy for each FIFO during rtlsim
@@ -223,7 +224,7 @@ class InsertAndSetFIFODepths(Transformation):
         fpgapart,
         clk_ns=10.0,
         max_qsrl_depth=256,
-        max_depth=2**14,
+        max_depth=None,
         swg_exception=True,
         vivado_ram_style="auto",
     ):
@@ -236,6 +237,9 @@ class InsertAndSetFIFODepths(Transformation):
         self.vivado_ram_style = vivado_ram_style
 
     def apply(self, model):
+        # these optypes may potentially use external weights
+        # we'll temporarily change them to use decoupled mode for FIFO sizing
+        extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"]
         # change external to decoupled and warn user
         # this way we are sure we have exactly one input/output
         modified_fc_nodes = []
@@ -246,9 +250,15 @@ class InsertAndSetFIFODepths(Transformation):
             )
             assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
             node = getCustomOp(node)
-            node.set_nodeattr("inFIFODepth", self.max_depth)
-            node.set_nodeattr("outFIFODepth", self.max_depth)
-            if node.onnx_node.op_type == "MatrixVectorActivation":
+            if self.max_depth is not None:
+                node.set_nodeattr("inFIFODepth", self.max_depth)
+                node.set_nodeattr("outFIFODepth", self.max_depth)
+            else:
+                i_depth = np.prod(node.get_folded_input_shape()[:-1])
+                o_depth = np.prod(node.get_folded_output_shape()[:-1])
+                node.set_nodeattr("inFIFODepth", i_depth)
+                node.set_nodeattr("outFIFODepth", o_depth)
+            if node.onnx_node.op_type in extw_optypes:
                 mmode = node.get_nodeattr("mem_mode")
                 if mmode == "external":
                     modified_fc_nodes.append(node.onnx_node.name)
@@ -267,13 +277,17 @@ class InsertAndSetFIFODepths(Transformation):
 
         # gather FIFO names, check they are of expected depth
         fifos = {}
-        for node in model.graph.node:
-            if node.op_type == "StreamingFIFO":
-                fifos[node.name] = 0
-                node = getCustomOp(node)
-                # check depths and fix as necessary
-                if node.get_nodeattr("depth") != self.max_depth:
-                    node.set_nodeattr("depth", self.max_depth)
+        fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+        for node in fifo_nodes:
+            fifos[node.name] = 0
+            node = getCustomOp(node)
+            node.set_nodeattr("depth_monitor", 1)
+            node.set_nodeattr("impl_style", "rtl")
+            # check depths and fix as necessary
+            if (self.max_depth is not None) and (
+                node.get_nodeattr("depth") != self.max_depth
+            ):
+                node.set_nodeattr("depth", self.max_depth)
 
         # insert FIFOs and do all transformations for RTLsim
         model = model.transform(AnnotateCycles())
@@ -324,21 +338,6 @@ class InsertAndSetFIFODepths(Transformation):
             else:
                 set_signal(sim, "tvalid", 0)
 
-            # check/update all fifo counts
-            for key in fifos:
-                current_state = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["state"]
-                current_addr = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["addr"]
-                if current_state == 2:
-                    current_count = current_addr + 2
-                else:
-                    current_count = current_state
-                if current_count > fifos[key]:
-                    fifos[key] = current_count
-
             # since latency estimation is very pessimistic, detect first output
             # and fast-forward the sim
             if get_signal(sim, "tvalid") != 0 and not output_detected:
@@ -352,6 +351,12 @@ class InsertAndSetFIFODepths(Transformation):
                 "No output detected, calculated FIFO depths may not be correct"
             )
 
+        for ind, node in enumerate(fifo_nodes):
+            maxcount_name = "maxcount_%d" % ind
+            if ind == 0:
+                maxcount_name = "maxcount"
+            fifos[node.name] = sim[maxcount_name]
+
         # Apply depths back into the model;
         # also set in/outFIFODepth to zero for non-FIFO
         # nodes, preventing further FIFO insertion
@@ -364,6 +369,7 @@ class InsertAndSetFIFODepths(Transformation):
                 depth = optimize_depth(fifos[node.name])
                 node_inst = getCustomOp(node)
                 node_inst.set_nodeattr("depth", depth)
+                node_inst.set_nodeattr("depth_monitor", 0)
                 # Set FIFO implementation/ram styles
                 if depth > self.max_qsrl_depth:
                     node_inst.set_nodeattr("impl_style", "vivado")
@@ -376,9 +382,9 @@ class InsertAndSetFIFODepths(Transformation):
             else:
                 getCustomOp(node).set_nodeattr("inFIFODepth", 0)
                 getCustomOp(node).set_nodeattr("outFIFODepth", 0)
-                # for every FC node we changed from external to decoupled,
+                # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
-                if node.op_type == "MatrixVectorActivation":
+                if node.op_type in extw_optypes:
                     if node.name in modified_fc_nodes:
                         node_inst = getCustomOp(node)
                         node_inst.set_nodeattr("mem_mode", "external")