diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 9c18c03d7bdb8406d43aa8fc4efdb8a206b1217e..b3c669ec1097745bd30f650ca0b9dacda647c61d 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -65,7 +65,7 @@ RUN locale-gen "en_US.UTF-8" RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev RUN git clone https://github.com/verilator/verilator RUN cd verilator && \ - git checkout v4.012 && \ + git checkout v4.224 && \ autoconf && \ ./configure && \ make -j4 && \ diff --git a/fetch-repos.sh b/fetch-repos.sh index 10b6b332550be5d914d80e242f01e77daeaf08a0..36c9ae55780fe0f945f065d7a0214c683bf513a8 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,10 +27,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="92184fea2dd417bc7a53c82811fef271e4833c4c" +QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6" FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" -PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2" +PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="e7f2de91d1a2ddadaaea06b8f4c20e97a575470e" OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v index b4e89628a44bb1f55c3445ee8e6866beada23585..2f3d813504b06c9f12875c5bad00ce99c0707d82 100644 --- a/finn-rtllib/memstream/hdl/Q_srl.v +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -69,7 +69,7 @@ `define Q_srl -module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) parameter width = 
16; // - width of data (i_d, o_d) @@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); wire o_b; // - output stream back-pressure output [addrwidth:0] count; // - output number of elems in queue + output [addrwidth:0] maxcount; // - maximum observed count since reset + reg [addrwidth:0] maxcount_reg; // - maximum count seen until now reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address // for data output reg shift_en_; // - SRL16 shift enable @@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); assign o_d = srlo; // - output data from queue assign o_v = o_v_reg; // - output valid if non-empty assign i_b = i_b_reg; // - input bp if full + assign maxcount = maxcount_reg; assign i_r = !i_b; assign o_b = !o_r; @@ -140,6 +143,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr_full <= 0; o_v_reg <= 0; i_b_reg <= 1; + maxcount_reg <= 0; end else begin state <= state_; @@ -147,6 +151,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr_full <= addr_full_; o_v_reg <= o_v_reg_; i_b_reg <= i_b_reg_; + maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg); end end // always @ (posedge clock) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index e16711f63b954707bc7ad9050dd7627ca1ce99c1..d842d89e234fd59f953a246293d271154d50954a 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -320,6 +320,10 @@ class DataflowBuildConfig: #: Override the number of inputs for rtlsim performance measurement. rtlsim_batch_size: Optional[int] = 1 + #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during + #: rtlsim, otherwise they will be replaced by HLS implementations. 
+ rtlsim_use_vivado_comps: Optional[bool] = True + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 70ae70f4e1912aa0a31b0c00c7e41f35780e207f..8290621056f9e4531693a3266bfb633735a4db33 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -204,40 +204,44 @@ def verify_step( def prepare_for_stitched_ip_rtlsim(verify_model, cfg): - need_restitch = False - # rtlsim only supports certain impl_style for some nodes - # StreamingFIFO must have impl_style=rtl - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): - inst = getCustomOp(fifo_layer) - if inst.get_nodeattr("impl_style") != "rtl": - inst.set_nodeattr("impl_style", "rtl") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # StreamingDataWidthConverter must have impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type( - "StreamingDataWidthConverter_Batch" - ): - inst = getCustomOp(dwc_layer) - if inst.get_nodeattr("impl_style") != "hls": - inst.set_nodeattr("impl_style", "hls") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # if we've made alterations to the model, need to do some re-prep - if need_restitch: - print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") - verify_model = verify_model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) - verify_model = verify_model.transform(HLSSynthIP()) - verify_model = verify_model.transform( - CreateStitchedIP( - cfg._resolve_fpga_part(), - cfg.synth_clk_period_ns, - vitis=False, + if not cfg.rtlsim_use_vivado_comps: + need_restitch = False + # switch impl_style=vivado components to rtl/hls + # StreamingFIFO must have impl_style=rtl + for fifo_layer in 
verify_model.get_nodes_by_op_type("StreamingFIFO"): + inst = getCustomOp(fifo_layer) + if inst.get_nodeattr("impl_style") != "rtl": + inst.set_nodeattr("impl_style", "rtl") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # StreamingDataWidthConverter must have impl_style=hls + for dwc_layer in verify_model.get_nodes_by_op_type( + "StreamingDataWidthConverter_Batch" + ): + inst = getCustomOp(dwc_layer) + if inst.get_nodeattr("impl_style") != "hls": + inst.set_nodeattr("impl_style", "hls") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # if we've made alterations to the model, need to do some re-prep + if need_restitch: + print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") + verify_model = verify_model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) ) - ) + verify_model = verify_model.transform(HLSSynthIP()) + verify_model = verify_model.transform( + CreateStitchedIP( + cfg._resolve_fpga_part(), + cfg.synth_clk_period_ns, + vitis=False, + ) + ) + else: + print("rtlsim_use_vivado_comps is enabled, may yield incorrect results") + # set top-level prop for stitched-ip rtlsim and launch verify_model.set_metadata_prop("exec_mode", "rtlsim") # TODO make configurable diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index a7c3cd0be59db4ba8665f8fba5be72282339b8c8..a0346f50bf6b7e88a79ba5ef4700039eb39c32ef 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -68,6 +68,8 @@ class StreamingFIFO(HLSCustomOp): "auto", {"auto", "block", "distributed", "ultra"}, ), + # whether depth monitoring is enabled (impl_style=rtl only) + "depth_monitor": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) @@ -97,6 +99,14 @@ class StreamingFIFO(HLSCustomOp): def verify_node(self): pass + 
def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + def get_verilog_top_module_name(self): "Return the Verilog top module name for this node." diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -319,6 +319,7 @@ module $TOPNAME$( ap_clk, ap_rst_n, count, +maxcount, in0_$HLS_SNAME$_TDATA, in0_$HLS_SNAME$_TVALID, in0_$HLS_SNAME$_TREADY, @@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY input ap_clk; input ap_rst_n; output $COUNT_RANGE$ count; +output $COUNT_RANGE$ maxcount; input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; input in0_$HLS_SNAME$_TVALID; output in0_$HLS_SNAME$_TREADY; @@ -346,6 +348,7 @@ $LAYER_NAME$ .clock(ap_clk), .reset(!ap_rst_n), .count(count), + .maxcount(maxcount), .i_d(in0_$HLS_SNAME$_TDATA), .i_v(in0_$HLS_SNAME$_TVALID), .i_r(in0_$HLS_SNAME$_TREADY), diff --git a/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh new file mode 100644 index 0000000000000000000000000000000000000000..1c8b6403e8628e3647810ca5fca65ca1122eaf9d --- /dev/null +++ b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh @@ -0,0 +1,346 @@ +// (c) Copyright 2011-2013 Xilinx, Inc. All rights reserved. +// +// This file contains confidential and proprietary information +// of Xilinx, Inc. and is protected under U.S. and +// international copyright and other intellectual property +// laws. +// +// DISCLAIMER +// This disclaimer is not a license and does not grant any +// rights to the materials distributed herewith. 
Except as +// otherwise provided in a valid license issued to you by +// Xilinx, and to the maximum extent permitted by applicable +// law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +// WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +// AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +// BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +// INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +// (2) Xilinx shall not be liable (whether in contract or tort, +// including negligence, or under any other theory of +// liability) for any loss or damage of any kind or nature +// related to, arising under or in connection with these +// materials, including for any direct, or any indirect, +// special, incidental, or consequential loss or damage +// (including loss of data, profits, goodwill, or any type of +// loss or damage suffered as a result of any action brought +// by a third party) even if such damage or loss was +// reasonably foreseeable or Xilinx had been advised of the +// possibility of the same. +// +// CRITICAL APPLICATIONS +// Xilinx products are not designed or intended to be fail- +// safe, or for use in any application requiring fail-safe +// performance, such as life-support or safety devices or +// systems, Class III medical devices, nuclear facilities, +// applications related to the deployment of airbags, or any +// other applications that could lead to death, personal +// injury, or severe property or environmental damage +// (individually and collectively, "Critical +// Applications"). Customer assumes the sole risk and +// liability of any use of Xilinx products in Critical +// Applications, subject only to applicable laws and +// regulations governing limitations on product liability. +// +// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +// PART OF THIS FILE AT ALL TIMES. 
+//----------------------------------------------------------------------------- +// +// Generic Functions used by AXIS-Interconnect and Infrastructure Modules +// +// Verilog-standard: Verilog 2001 +//-------------------------------------------------------------------------- +// Global Parameters: +// +// Functions: +// f_clogb2 +// f_gcd +// f_lcm +// f_get_tdata_indx +// f_get_tstrb_indx +// f_get_tkeep_indx +// f_get_tlast_indx +// f_get_tid_indx +// f_get_tdest_indx +// f_get_tuser_indx +// f_payload_width +// Tasks: +// t_display_tdata_error +//-------------------------------------------------------------------------- +/////////////////////////////////////////////////////////////////////////////// +// BEGIN Global Parameters +/////////////////////////////////////////////////////////////////////////////// +// Define Signal Set indices +localparam G_INDX_SS_TREADY = 0; +localparam G_INDX_SS_TDATA = 1; +localparam G_INDX_SS_TSTRB = 2; +localparam G_INDX_SS_TKEEP = 3; +localparam G_INDX_SS_TLAST = 4; +localparam G_INDX_SS_TID = 5; +localparam G_INDX_SS_TDEST = 6; +localparam G_INDX_SS_TUSER = 7; +localparam G_MASK_SS_TREADY = 32'h1 << G_INDX_SS_TREADY; +localparam G_MASK_SS_TDATA = 32'h1 << G_INDX_SS_TDATA; +localparam G_MASK_SS_TSTRB = 32'h1 << G_INDX_SS_TSTRB; +localparam G_MASK_SS_TKEEP = 32'h1 << G_INDX_SS_TKEEP; +localparam G_MASK_SS_TLAST = 32'h1 << G_INDX_SS_TLAST; +localparam G_MASK_SS_TID = 32'h1 << G_INDX_SS_TID ; +localparam G_MASK_SS_TDEST = 32'h1 << G_INDX_SS_TDEST; +localparam G_MASK_SS_TUSER = 32'h1 << G_INDX_SS_TUSER; + +// Task DRC error levels +localparam G_TASK_SEVERITY_ERR = 2; +localparam G_TASK_SEVERITY_WARNING = 1; +localparam G_TASK_SEVERITY_INFO = 0; + +/////////////////////////////////////////////////////////////////////////////// +// BEGIN Functions +/////////////////////////////////////////////////////////////////////////////// +// ceiling logb2 + function integer f_clogb2 (input integer size); + integer s; + begin + s = size; + s = s 
- 1; + for (f_clogb2=1; s>1; f_clogb2=f_clogb2+1) + s = s >> 1; + end + endfunction // clogb2 + + // Calculates the Greatest Common Divisor between two integers using the + // euclidean algorithm. + function automatic integer f_gcd ( + input integer a, + input integer b + ); + begin : main + integer A, B, done, swap; + A = a; + B = b; + done = 0; + while(!done) + begin + if (A < B ) begin + swap = A; + A = B; + B = swap; + end else if ( B != 0 ) begin + A = A - B; + end else begin + done = 1; + end + end + + f_gcd = A; + end + endfunction + + + // Calculates the Least Common Multiple between two integers + function integer f_lcm ( + input integer a, + input integer b + ); + begin : main + f_lcm = ( a / f_gcd(a, b)) * b; + end + endfunction + + // Returns back the index to the TDATA portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tdata_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + f_get_tdata_indx = 0; + end + endfunction + + // Returns back the index to the tstrb portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tstrb_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdata_indx(DAW, IDW, DEW, USW, SST); + // If TDATA exists, then add its width to its base to get the tstrb index + f_get_tstrb_indx = SST[G_INDX_SS_TDATA] ? cur_indx + DAW : cur_indx; + end + endfunction + + // Returns back the index to the tkeep portion of TPAYLOAD, returns 0 if the + // signal is not enabled. 
+ function integer f_get_tkeep_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tstrb_indx(DAW, IDW, DEW, USW, SST); + f_get_tkeep_indx = SST[G_INDX_SS_TSTRB] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tlast portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tlast_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tkeep_indx(DAW, IDW, DEW, USW, SST); + f_get_tlast_indx = SST[G_INDX_SS_TKEEP] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tid portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tid_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tlast_indx(DAW, IDW, DEW, USW, SST); + f_get_tid_indx = SST[G_INDX_SS_TLAST] ? cur_indx + 1 : cur_indx; + end + endfunction + + // Returns back the index to the tdest portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tdest_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tid_indx(DAW, IDW, DEW, USW, SST); + f_get_tdest_indx = SST[G_INDX_SS_TID] ? 
cur_indx + IDW : cur_indx; + end + endfunction + + // Returns back the index to the tuser portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tuser_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdest_indx(DAW, IDW, DEW, USW, SST); + f_get_tuser_indx = SST[G_INDX_SS_TDEST] ? cur_indx + DEW : cur_indx; + end + endfunction + + // Payload is the sum of all the AXIS signals present except for + // TREADY/TVALID + function integer f_payload_width ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tuser_indx(DAW, IDW, DEW, USW, SST); + f_payload_width = SST[G_INDX_SS_TUSER] ? cur_indx + USW : cur_indx; + // Ensure that the return value is never less than 1 + f_payload_width = (f_payload_width < 1) ? 1 : f_payload_width; + end + endfunction + + task t_check_tdata_width( + input integer data_width, + input [8*80-1:0] var_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tdata_width + if (data_width%8 != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s (%2d) must be a multiple of 8.", var_name, data_width); + $display(" AXI4-Stream data width is only defined for byte multiples. 
See the "); + $display(" AMBA4 AXI4-Stream Protocol Specification v1.0 Section 2.1 for more"); + $display(" information."); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask + + task t_check_tuser_width( + input integer tuser_width, + input [8*80-1:0] tuser_name, + input integer tdata_width, + input [8*80-1:0] tdata_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tuser_width + integer tdata_bytes; + tdata_bytes = tdata_width/8; + if ((tuser_width%tdata_bytes) != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s == %2d is not the recommended value of 'an integer ", tuser_name, tuser_width); + $display(" multiple of the width of the interface (%s == %2d) in bytes.' AXI4-Stream", tdata_name, tdata_width); + $display(" TUSER width in this module is only defined when the TUSER is the"); + $display(" recommended value. See the AMBA4 AXI4-Stream Protocol Specification v1.0"); + $display(" Section 2.1, 2.3.3 and 2.8 for more information. 
"); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..efc179923545eb06e4d173c683b0941887f8bb79 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -81,6 +81,12 @@ class InsertDWC(Transformation): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() + larger_width = max(dwc_in_width, dwc_out_width) + smaller_width = min(dwc_in_width, dwc_out_width) + if larger_width % smaller_width == 0: + impl_style = "hls" + else: + impl_style = "vivado" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -105,6 +111,7 @@ class InsertDWC(Transformation): inWidth=dwc_in_width, outWidth=dwc_out_width, dataType=str(dtype.name), + impl_style=impl_style, ) # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..90ea853b6072b145df64a8a73ee93c65989fe447 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -192,10 +192,11 @@ class InsertAndSetFIFODepths(Transformation): - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of Verilog FIFOs (Q_srl.v) - max_depth : how deep the "max"-sized FIFOs initially inserted will be + if set to None, use the tensor size as the depth - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs smaller where appropriate - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado + large FIFOs implemented by Vivado afterwards Assumed input graph properties: - all nodes are fpgadataflow nodes @@ -210,7 +211,7 
@@ class InsertAndSetFIFODepths(Transformation): necessary to insert FIFOs between them to prevent stalls due to bursty behavior. The sizes of those FIFOs are hard to predict analytically, so we do the following: - - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes + - insert deep (=tensor size) FIFOs between all fpgadataflow nodes - create stitched design - run through rtlsim with stream of multiple random input images (to fill pipeline) - keep track of observed maximum occupancy for each FIFO during rtlsim @@ -223,7 +224,7 @@ class InsertAndSetFIFODepths(Transformation): fpgapart, clk_ns=10.0, max_qsrl_depth=256, - max_depth=2**14, + max_depth=None, swg_exception=True, vivado_ram_style="auto", ): @@ -236,6 +237,9 @@ class InsertAndSetFIFODepths(Transformation): self.vivado_ram_style = vivado_ram_style def apply(self, model): + # these optypes may potentially use external weights + # we'll temporarily change them to use decoupled mode for FIFO sizing + extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] @@ -246,9 +250,15 @@ class InsertAndSetFIFODepths(Transformation): ) assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node" node = getCustomOp(node) - node.set_nodeattr("inFIFODepth", self.max_depth) - node.set_nodeattr("outFIFODepth", self.max_depth) - if node.onnx_node.op_type == "MatrixVectorActivation": + if self.max_depth is not None: + node.set_nodeattr("inFIFODepth", self.max_depth) + node.set_nodeattr("outFIFODepth", self.max_depth) + else: + i_depth = np.prod(node.get_folded_input_shape()[:-1]) + o_depth = np.prod(node.get_folded_output_shape()[:-1]) + node.set_nodeattr("inFIFODepth", i_depth) + node.set_nodeattr("outFIFODepth", o_depth) + if node.onnx_node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": 
modified_fc_nodes.append(node.onnx_node.name) @@ -267,13 +277,17 @@ class InsertAndSetFIFODepths(Transformation): # gather FIFO names, check they are of expected depth fifos = {} - for node in model.graph.node: - if node.op_type == "StreamingFIFO": - fifos[node.name] = 0 - node = getCustomOp(node) - # check depths and fix as necessary - if node.get_nodeattr("depth") != self.max_depth: - node.set_nodeattr("depth", self.max_depth) + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + for node in fifo_nodes: + fifos[node.name] = 0 + node = getCustomOp(node) + node.set_nodeattr("depth_monitor", 1) + node.set_nodeattr("impl_style", "rtl") + # check depths and fix as necessary + if (self.max_depth is not None) and ( + node.get_nodeattr("depth") != self.max_depth + ): + node.set_nodeattr("depth", self.max_depth) # insert FIFOs and do all transformations for RTLsim model = model.transform(AnnotateCycles()) @@ -324,21 +338,6 @@ class InsertAndSetFIFODepths(Transformation): else: set_signal(sim, "tvalid", 0) - # check/update all fifo counts - for key in fifos: - current_state = sim.internals["finn_design_i"][key]["inst"][ - key + "_" + key - ]["state"] - current_addr = sim.internals["finn_design_i"][key]["inst"][ - key + "_" + key - ]["addr"] - if current_state == 2: - current_count = current_addr + 2 - else: - current_count = current_state - if current_count > fifos[key]: - fifos[key] = current_count - # since latency estimation is very pessimistic, detect first output # and fast-forward the sim if get_signal(sim, "tvalid") != 0 and not output_detected: @@ -352,6 +351,12 @@ class InsertAndSetFIFODepths(Transformation): "No output detected, calculated FIFO depths may not be correct" ) + for ind, node in enumerate(fifo_nodes): + maxcount_name = "maxcount_%d" % ind + if ind == 0: + maxcount_name = "maxcount" + fifos[node.name] = sim[maxcount_name] + # Apply depths back into the model; # also set in/outFIFODepth to zero for non-FIFO # nodes, preventing further FIFO 
insertion @@ -364,6 +369,7 @@ class InsertAndSetFIFODepths(Transformation): depth = optimize_depth(fifos[node.name]) node_inst = getCustomOp(node) node_inst.set_nodeattr("depth", depth) + node_inst.set_nodeattr("depth_monitor", 0) # Set FIFO implementation/ram styles if depth > self.max_qsrl_depth: node_inst.set_nodeattr("impl_style", "vivado") @@ -376,9 +382,9 @@ class InsertAndSetFIFODepths(Transformation): else: getCustomOp(node).set_nodeattr("inFIFODepth", 0) getCustomOp(node).set_nodeattr("outFIFODepth", 0) - # for every FC node we changed from external to decoupled, + # for every extw node we changed from external to decoupled, # change back and reset implementation - if node.op_type == "MatrixVectorActivation": + if node.op_type in extw_optypes: if node.name in modified_fc_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("mem_mode", "external") diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index f6a51da8e44ea60ae5693cdd033b39bdf51376ac..d7ed3e261fe024b7f054382f12184628d3f3e94c 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -26,7 +26,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pkg_resources as pk + import os +import shutil from pyverilator import PyVerilator from finn.util.basic import get_rtlsim_trace_depth, make_build_dir @@ -74,14 +77,35 @@ def pyverilate_stitched_ip( # are identical but in multiple directories (regslice_core.v) # remove duplicates from list by doing list -> set -> list - all_verilog_files = list( - set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs)) + src_exts = [".v", ".sv"] + + all_verilog_src_files = list( + set( + filter( + lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs + ) + ) + ) + + verilog_header_dir = make_build_dir("pyverilator_vh_") + # use custom version of axis infrastructure vh + # to enable Verilator to simulate AMD/Xilinx components (e.g DWC) + custom_vh = pk.resource_filename( + "finn.qnn-data", "verilog/custom_axis_infrastructure.vh" ) + shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh") + for fn in all_verilog_srcs: + if fn.endswith(".vh"): + if "axis_infrastructure_v1_1_0.vh" in fn: + # skip, we use a custom version for this file without recursive gcd + continue + else: + shutil.copy(fn, verilog_header_dir) # remove all but one instances of regslice_core.v filtered_verilog_files = [] remove_entry = False - for vfile in all_verilog_files: + for vfile in all_verilog_src_files: if "regslice_core" in vfile: if not remove_entry: filtered_verilog_files.append(vfile) @@ -94,7 +118,12 @@ def pyverilate_stitched_ip( for vfile in filtered_verilog_files: with open(vfile) as rf: wf.write("//Added from " + vfile + "\n\n") - wf.write(rf.read()) + lines = rf.read() + for line in lines.split("\n"): + # break down too-long lines, Verilator complains otherwise + if len(line) > 20000: + line = line.replace("&", "\n&") + wf.write("\n" + line) verilator_args = [] # disable common verilator warnings that should be harmless but commonly occur @@ -108,10 +137,20 @@ def pyverilate_stitched_ip( # force inlining of all submodules to 
ensure we can read internal signals properly if read_internal_signals: verilator_args += ["--inline-mult", "0"] + # add defines to make certain XPM src files work with Verilator + verilator_args.append("-DDISABLE_XPM_ASSERTIONS") + verilator_args.append("-DOBSOLETE") + verilator_args.append("-DONESPIN") + verilator_args.append("--bbox-unsup") + vivado_path = os.environ["VIVADO_PATH"] + # additional SystemVerilog modules to make XPMs work with Verilator + xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv" + xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv" + xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv" sim = PyVerilator.build( - top_module_file_name, - verilog_path=[vivado_stitch_proj_dir], + [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc], + verilog_path=[vivado_stitch_proj_dir, verilog_header_dir], build_dir=build_dir, trace_depth=get_rtlsim_trace_depth(), top_module_name=top_module_name,