diff --git a/custom_hls/lookup.hpp b/custom_hls/lookup.hpp index 3001f6613ec6ed9a9e5f47d9be356d4b032f7192..037b038a09a10ff2bd066740d20f0b47489e24e4 100644 --- a/custom_hls/lookup.hpp +++ b/custom_hls/lookup.hpp @@ -26,14 +26,15 @@ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - *******************************************************************************/ +*******************************************************************************/ +#ifndef LOOKUP_HPP +#define LOOKUP_HPP #include <ap_int.h> #include <hls_stream.h> -#ifndef LOOKUP_HPP -#define LOOKUP_HPP +#include "utils.hpp" + template < unsigned NumEmbeddings, @@ -57,4 +58,50 @@ void StreamingLookup( } } +/** + * Lookup implementation over a table stored in AXI-accessible memory. + */ +template < + unsigned EmbeddingSize, // Number of memory words per embedding + unsigned EmbeddingAlign = clog2(EmbeddingSize), // Alignment of entries = number of word index bits + typename T_SRC, + typename T_DST +> +void StreamingLookup_ext( + hls::stream<T_SRC> &in0, + hls::stream<T_DST> &out, + T_DST const *const mem, + unsigned const size, + unsigned &oob_count, + bool &oob_irq +) { +#pragma HLS pipeline II=EmbeddingSize+9 style=flp + + static unsigned oob_count_li; + static unsigned oob_count_int; +#pragma HLS reset variable=oob_count_li +#pragma HLS reset variable=oob_count_int + + if(oob_count != oob_count_li) { + oob_count_int -= oob_count_li; + oob_count_li = oob_count; + } + if(!in0.empty()) { + T_SRC const x = in0.read(); + + // Map out-of-bounds inputs to an offset of zero and increment counter + bool const oob = x >= T_SRC(size); + ap_uint<T_SRC::width+EmbeddingAlign> const ofs = + ((oob? T_SRC(0) : x), ap_uint<EmbeddingAlign>(0)); + oob_count_int += oob; + + // Stream lookup data (burst inferred) + for(unsigned i = 0; i < EmbeddingSize; i++) { +#pragma HLS pipeline II=1 style=flp + out.write(mem[ofs+i]); + } + } + oob_count = oob_count_int; + oob_irq = (oob_count_int != 0); +} #endif diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v index b4e89628a44bb1f55c3445ee8e6866beada23585..3c884770e026ec90b16dfd562e1861d132e714bd 100644 --- a/finn-rtllib/memstream/hdl/Q_srl.v +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -69,7 +69,7 @@ `define Q_srl -module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) parameter width = 16; // - width of data (i_d, o_d) @@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); wire o_b; // - output stream back-pressure output [addrwidth:0] count; // - output number of elems in queue + output [addrwidth:0] maxcount; // - maximum observed count since reset + reg [addrwidth:0] maxcount_reg; // - maximum count seen until now reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address // for data output reg shift_en_; // - SRL16 shift enable @@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); assign o_d = srlo; // - output data from queue assign o_v = o_v_reg; // - output valid if non-empty assign i_b = i_b_reg; // - input bp if full + assign maxcount = maxcount_reg; assign i_r = !i_b; assign o_b = !o_r; @@ -140,6 +143,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr_full <= 0; o_v_reg <= 0; i_b_reg <= 1; + maxcount_reg <= '0; end else begin state <= state_; @@ -147,6 +151,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr_full <= addr_full_; o_v_reg <= o_v_reg_; i_b_reg <= i_b_reg_; + maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg); end end // always @ (posedge clock) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 2ee436aae9e6800609bd873925e612b8a5f55954..79ae6957564ce07c18b76552089f64107fe51356 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -150,6 +150,7 @@ class HLSCustomOp(CustomOp): intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())] intf_names["aximm"] = [] intf_names["axilite"] = [] + intf_names["ap_none"] = [] return intf_names def get_verilog_top_filename(self): diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index d90fa0f05ab2a92391f610ae1c4516a95a881ce4..613a91b6284e0789dff2446e1615690a03336d99 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -159,8 +159,8 @@ class Lookup(HLSCustomOp): def global_includes(self): mem_mode = self.get_nodeattr("mem_mode") global_incls = [] + global_incls.append('#include "lookup.hpp"') if mem_mode == "const": - global_incls.append('#include "lookup.hpp"') global_incls.append('#include "embeddings.hpp"') self.code_gen_dict["$GLOBALS$"] = global_incls @@ -258,17 +258,10 @@ class Lookup(HLSCustomOp): InputType, EmbeddingType >(in0, out, embeddings);""" ] elif mem_mode == "external": - hls_impl = """ - if(!in0.empty()) { - ap_uint<T_SRC::width+EmbeddingAlign> const base = - (in0.read(), ap_uint<EmbeddingAlign>(0)); - for(unsigned j = 0; j < EmbeddingSize; j++) { -#pragma HLS PIPELINE II=1 - out.write(mem[base+j]); - } - } - """ - self.code_gen_dict["$DOCOMPUTE$"] = [hls_impl] + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup_ext<EmbeddingSize>(in0, out, mem, size, oob_count, + oob_irq);""" + ] def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") @@ -286,7 +279,8 @@ class Lookup(HLSCustomOp): "void " + self.onnx_node.name + "(hls::stream<T_SRC> &in0, hls::stream<T_DST> &out, " - + "T_DST const *const mem)" + + "T_DST const *const mem, unsigned const size, " + + "unsigned &oob_count, bool &oob_irq)" ] def pragmas(self): @@ -305,6 +299,13 @@ class Lookup(HLSCustomOp): elif mem_mode == "external": my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") + my_pragmas.append( + "#pragma HLS INTERFACE s_axilite port=size bundle=control" + ) + my_pragmas.append( + "#pragma HLS INTERFACE s_axilite port=oob_count bundle=control" + ) + my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") else: raise Exception("Unrecognized mem_mode: " + mem_mode) self.code_gen_dict["$PRAGMAS$"] = my_pragmas @@ -475,4 +476,5 @@ class Lookup(HLSCustomOp): if mem_mode == "external": intf_names["axilite"] = ["s_axi_control"] intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))] + intf_names["ap_none"] = ["oob_irq"] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 06b557982daea58ae753c843843ab31a953d07e2..f24cdcb932b7df6095ffdee9275170cfec85b008 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -68,6 +68,8 @@ class StreamingFIFO(HLSCustomOp): "auto", {"auto", "block", "distributed", "ultra"}, ), + # whether depth monitoring is enabled (impl_style=rtl only) + "depth_monitor": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) @@ -113,6 +115,14 @@ class StreamingFIFO(HLSCustomOp): def verify_node(self): pass + def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + def get_verilog_top_module_name(self): "Return the Verilog top module name for this node." diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -319,6 +319,7 @@ module $TOPNAME$( ap_clk, ap_rst_n, count, +maxcount, in0_$HLS_SNAME$_TDATA, in0_$HLS_SNAME$_TVALID, in0_$HLS_SNAME$_TREADY, @@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY input ap_clk; input ap_rst_n; output $COUNT_RANGE$ count; +output $COUNT_RANGE$ maxcount; input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; input in0_$HLS_SNAME$_TVALID; output in0_$HLS_SNAME$_TREADY; @@ -346,6 +348,7 @@ $LAYER_NAME$ .clock(ap_clk), .reset(!ap_rst_n), .count(count), + .maxcount(maxcount), .i_d(in0_$HLS_SNAME$_TDATA), .i_v(in0_$HLS_SNAME$_TVALID), .i_r(in0_$HLS_SNAME$_TREADY), diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 1287f677eebfd360cfe0bbac0710f82a7906f308..cfb44c0a9161bc6220a426a307a2d3393dbb7cb2 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -228,6 +228,22 @@ class CreateStitchedIP(Transformation): ) self.s_axis_idx += 1 + def connect_ap_none_external(self, node): + inst_name = node.name + node_inst = getCustomOp(node) + input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"] + # make external + for i in range(len(input_intf_names)): + input_intf_name = input_intf_names[i] + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" + % (inst_name, input_intf_name) + ) + self.connect_cmds.append( + "set_property name %s [get_bd_ports %s_0]" + % (input_intf_name, input_intf_name) + ) + def insert_signature(self, checksum_count): signature_vlnv = "AMD:user:axi_info_top:1.0" signature_name = "axi_info_top0" @@ -305,6 +321,7 @@ class CreateStitchedIP(Transformation): ip_dirs += [ip_dir_value] self.create_cmds += node_inst.code_generation_ipi() self.connect_clk_rst(node) + self.connect_ap_none_external(node) self.connect_axi(node) for i in range(len(node.input)): if not is_external_input(model, node, i): diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..efc179923545eb06e4d173c683b0941887f8bb79 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -81,6 +81,12 @@ class InsertDWC(Transformation): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() + larger_width = max(dwc_in_width, dwc_out_width) + smaller_width = min(dwc_in_width, dwc_out_width) + if larger_width % smaller_width == 0: + impl_style = "hls" + else: + impl_style = "vivado" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -105,6 +111,7 @@ class InsertDWC(Transformation): inWidth=dwc_in_width, outWidth=dwc_out_width, dataType=str(dtype.name), + impl_style=impl_style, ) # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..90ea853b6072b145df64a8a73ee93c65989fe447 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -192,10 +192,11 @@ class InsertAndSetFIFODepths(Transformation): - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of Verilog FIFOs (Q_srl.v) - max_depth : how deep the "max"-sized FIFOs initially inserted will be + if set to None, use the tensor size as the depth - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs smaller where appropriate - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado + large FIFOs implemented by Vivado afterwards Assumed input graph properties: - all nodes are fpgadataflow nodes @@ -210,7 +211,7 @@ class InsertAndSetFIFODepths(Transformation): necessary to insert FIFOs between them to prevent stalls due to bursty behavior. The sizes of those FIFOs are hard to predict analytically, so we do the following: - - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes + - insert deep (=tensor size) FIFOs between all fpgadataflow nodes - create stitched design - run through rtlsim with stream of multiple random input images (to fill pipeline) - keep track of observed maximum occupancy for each FIFO during rtlsim @@ -223,7 +224,7 @@ class InsertAndSetFIFODepths(Transformation): fpgapart, clk_ns=10.0, max_qsrl_depth=256, - max_depth=2**14, + max_depth=None, swg_exception=True, vivado_ram_style="auto", ): @@ -236,6 +237,9 @@ class InsertAndSetFIFODepths(Transformation): self.vivado_ram_style = vivado_ram_style def apply(self, model): + # these optypes may potentially use external weights + # we'll temporarily change them to use decoupled mode for FIFO sizing + extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] @@ -246,9 +250,15 @@ class InsertAndSetFIFODepths(Transformation): ) assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node" node = getCustomOp(node) - node.set_nodeattr("inFIFODepth", self.max_depth) - node.set_nodeattr("outFIFODepth", self.max_depth) - if node.onnx_node.op_type == "MatrixVectorActivation": + if self.max_depth is not None: + node.set_nodeattr("inFIFODepth", self.max_depth) + node.set_nodeattr("outFIFODepth", self.max_depth) + else: + i_depth = np.prod(node.get_folded_input_shape()[:-1]) + o_depth = np.prod(node.get_folded_output_shape()[:-1]) + node.set_nodeattr("inFIFODepth", i_depth) + node.set_nodeattr("outFIFODepth", o_depth) + if node.onnx_node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -267,13 +277,17 @@ class InsertAndSetFIFODepths(Transformation): # gather FIFO names, check they are of expected depth fifos = {} - for node in model.graph.node: - if node.op_type == "StreamingFIFO": - fifos[node.name] = 0 - node = getCustomOp(node) - # check depths and fix as necessary - if node.get_nodeattr("depth") != self.max_depth: - node.set_nodeattr("depth", self.max_depth) + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + for node in fifo_nodes: + fifos[node.name] = 0 + node = getCustomOp(node) + node.set_nodeattr("depth_monitor", 1) + node.set_nodeattr("impl_style", "rtl") + # check depths and fix as necessary + if (self.max_depth is not None) and ( + node.get_nodeattr("depth") != self.max_depth + ): + node.set_nodeattr("depth", self.max_depth) # insert FIFOs and do all transformations for RTLsim model = model.transform(AnnotateCycles()) @@ -324,21 +338,6 @@ class InsertAndSetFIFODepths(Transformation): else: set_signal(sim, "tvalid", 0) - # check/update all fifo counts - for key in fifos: - current_state = sim.internals["finn_design_i"][key]["inst"][ - key + "_" + key - ]["state"] - current_addr = sim.internals["finn_design_i"][key]["inst"][ - key + "_" + key - ]["addr"] - if current_state == 2: - current_count = current_addr + 2 - else: - current_count = current_state - if current_count > fifos[key]: - fifos[key] = current_count - # since latency estimation is very pessimistic, detect first output # and fast-forward the sim if get_signal(sim, "tvalid") != 0 and not output_detected: @@ -352,6 +351,12 @@ class InsertAndSetFIFODepths(Transformation): "No output detected, calculated FIFO depths may not be correct" ) + for ind, node in enumerate(fifo_nodes): + maxcount_name = "maxcount_%d" % ind + if ind == 0: + maxcount_name = "maxcount" + fifos[node.name] = sim[maxcount_name] + # Apply depths back into the model; # also set in/outFIFODepth to zero for non-FIFO # nodes, preventing further FIFO insertion @@ -364,6 +369,7 @@ class InsertAndSetFIFODepths(Transformation): depth = optimize_depth(fifos[node.name]) node_inst = getCustomOp(node) node_inst.set_nodeattr("depth", depth) + node_inst.set_nodeattr("depth_monitor", 0) # Set FIFO implementation/ram styles if depth > self.max_qsrl_depth: node_inst.set_nodeattr("impl_style", "vivado") @@ -376,9 +382,9 @@ class InsertAndSetFIFODepths(Transformation): else: getCustomOp(node).set_nodeattr("inFIFODepth", 0) getCustomOp(node).set_nodeattr("outFIFODepth", 0) - # for every FC node we changed from external to decoupled, + # for every extw node we changed from external to decoupled, # change back and reset implementation - if node.op_type == "MatrixVectorActivation": + if node.op_type in extw_optypes: if node.name in modified_fc_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("mem_mode", "external")