diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v new file mode 100644 index 0000000000000000000000000000000000000000..80b015f6d4eb69df36831b25262cda3539ac8ae9 --- /dev/null +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -0,0 +1,325 @@ +// original source: +// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v + + +// Copyright (c) 1999 The Regents of the University of California +// Copyright (c) 2010 The Regents of the University of Pennsylvania +// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London +// Copyright (c) 2020 Xilinx +// +// Permission to use, copy, modify, and distribute this software and +// its documentation for any purpose, without fee, and without a +// written agreement is hereby granted, provided that the above copyright +// notice and this paragraph and the following two paragraphs appear in +// all copies. +// +// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON +// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +// + +// Q_srl_oreg3_prefull_SIMPLE.v +// +// - In-page queue with parameterizable depth, bit width +// - Stream I/O is triple (data, valid, back-pressure), +// with EOS concatenated into the data +// - Flow control for input & output is combinationally decoupled +// - 2 <= depth <= 256 +// * (depth >= 2) is required to decouple I/O flow control, +// where empty => no produce, full => no consume, +// and depth 1 would ping-pong between the two at half rate +// * (depth <= 256) can be modified +// by changing ''synthesis loop_limit X'' below +// and changing ''addrwidth'' or its log computation +// - 1 <= width +// - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice, +// plus output register (for fast output) +// - Queue addressing is done by ''addr'' up-down counter +// - Queue fullness is checked by comparator (addr==depth) +// - Queue fullness is pre-computed for next cycle +// - Queue input back-pressure is pre-computed for next cycle +// - Queue output valid (state!=state__empty) is pre-computed for next cycle +// (necessary since SRL data output reg requires non-boolean state) +// - FSM has 3 states (empty, one, more) +// - When empty, continue to emit most recently emitted value (for debugging) +// +// - Queue slots used = / (state==state_empty) ? 0 +// | (state==state_one) ? 1 +// \ (state==state_more) ? addr+2 +// - Queue slots used <= depth +// - Queue slots remaining = depth - used +// = / (state==state_empty) ? depth +// | (state==state_one) ? depth-1 +// \ (state==state_more) ? 
depth-2-addr +// +// - Synplify 7.1 / 8.0 +// - Eylon Caspi, 9/11/03, 8/18/04, 3/29/05 + + +`ifdef Q_srl +`else +`define Q_srl + + +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); + + parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) + parameter width = 16; // - width of data (i_d, o_d) + + `define LOG2 ( (((depth)) ==0) ? 0 /* - depth==0 LOG2=0 */ \ + : (((depth-1)>>0)==0) ? 0 /* - depth<=1 LOG2=0 */ \ + : (((depth-1)>>1)==0) ? 1 /* - depth<=2 LOG2=1 */ \ + : (((depth-1)>>2)==0) ? 2 /* - depth<=4 LOG2=2 */ \ + : (((depth-1)>>3)==0) ? 3 /* - depth<=8 LOG2=3 */ \ + : (((depth-1)>>4)==0) ? 4 /* - depth<=16 LOG2=4 */ \ + : (((depth-1)>>5)==0) ? 5 /* - depth<=32 LOG2=5 */ \ + : (((depth-1)>>6)==0) ? 6 /* - depth<=64 LOG2=6 */ \ + : (((depth-1)>>7)==0) ? 7 /* - depth<=128 LOG2=7 */ \ + : 8) /* - depth<=256 LOG2=8 */ + +// parameter addrwidth = LOG2; // - width of queue addr + + parameter addrwidth = + ( (((depth)) ==0) ? 0 // - depth==0 LOG2=0 + : (((depth-1)>>0)==0) ? 0 // - depth<=1 LOG2=0 + : (((depth-1)>>1)==0) ? 1 // - depth<=2 LOG2=1 + : (((depth-1)>>2)==0) ? 2 // - depth<=4 LOG2=2 + : (((depth-1)>>3)==0) ? 3 // - depth<=8 LOG2=3 + : (((depth-1)>>4)==0) ? 4 // - depth<=16 LOG2=4 + : (((depth-1)>>5)==0) ? 5 // - depth<=32 LOG2=5 + : (((depth-1)>>6)==0) ? 6 // - depth<=64 LOG2=6 + : (((depth-1)>>7)==0) ? 7 // - depth<=128 LOG2=7 + : 8) // - depth<=256 LOG2=8 + ; + + input clock; + input reset; + + input [width-1:0] i_d; // - input stream data (concat data + eos) + input i_v; // - input stream valid + output i_r; // - input stream ready + wire i_b; // - input stream back-pressure + + output [width-1:0] o_d; // - output stream data (concat data + eos) + output o_v; // - output stream valid + input o_r; // - output stream ready + wire o_b; // - output stream back-pressure + + output [addrwidth:0] count; // - output number of elems in queue + + reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address + // for data output + reg shift_en_; // - SRL16 shift enable + reg [width-1:0] srl [depth-2:0]; // - SRL16 memory + reg shift_en_o_; // - SRLO shift enable + reg [width-1:0] srlo_, srlo // - SRLO output reg + /* synthesis syn_allow_retiming=0 */ ; + + parameter state_empty = 2'd0; // - state empty : o_v=0 o_d=UNDEFINED + parameter state_one = 2'd1; // - state one : o_v=1 o_d=srlo + parameter state_more = 2'd2; // - state more : o_v=1 o_d=srlo + // #items in srl = addr+2 + + reg [1:0] state, state_; // - state register + + wire addr_full_; // - true iff addr==depth-2 on NEXT cycle + reg addr_full; // - true iff addr==depth-2 + wire addr_zero_; // - true iff addr==0 + wire o_v_reg_; // - true iff state_empty on NEXT cycle + reg o_v_reg // - true iff state_empty + /* synthesis syn_allow_retiming=0 */ ; + wire i_b_reg_; // - true iff !full on NEXT cycle + reg i_b_reg // - true iff !full + /* synthesis syn_allow_retiming=0 */ ; + + assign addr_full_ = (state_==state_more) && (addr_==depth-2); + // - queue full + assign addr_zero_ = (addr==0); // - queue contains 2 (or 1,0) + assign o_v_reg_ = (state_!=state_empty); // - output valid if non-empty + assign i_b_reg_ = addr_full_; // - input bp if full + assign o_d = srlo; // - output data from queue + assign o_v = o_v_reg; // - output valid if non-empty + assign i_b = i_b_reg; // - input bp if full + + assign i_r = !i_b; + assign o_b = !o_r; + + assign count = (state==state_more ? addr+2 : (state==state_one ? 
1 : 0)); + + // - ''always'' block with both FFs and SRL16 does not work, + // since FFs need reset but SRL16 does not + + always @(posedge clock) begin // - seq always: FFs + if (reset) begin + state <= state_empty; + addr <= 0; + addr_full <= 0; + o_v_reg <= 0; + i_b_reg <= 1; + end + else begin + state <= state_; + addr <= addr_; + addr_full <= addr_full_; + o_v_reg <= o_v_reg_; + i_b_reg <= i_b_reg_; + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srlo + // - infer enabled output reg at end of shift chain + // - input first element from i_d, all subsequent elements from SRL16 + if (reset) begin + srlo <= 0; + end + else begin + if (shift_en_o_) begin + srlo <= srlo_; + end + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srl + // - infer enabled SRL16E from shifting srl array + // - no reset capability; srl[] contents undefined on reset + if (shift_en_) begin + // synthesis loop_limit 256 + for (a_=depth-2; a_>0; a_=a_-1) begin + srl[a_] <= srl[a_-1]; + end + srl[0] <= i_d; + end + end // always @ (posedge clock or negedge reset) + + always @* begin // - combi always + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + case (state) + + state_empty: begin // - (empty, will not produce) + if (i_v) begin // - empty & i_v => consume + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else begin // - empty & !i_v => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end + + state_one: begin // - (contains one) + if (i_v && o_b) begin // - one & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && o_b) begin // - one & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end // case: state_one + + state_more: begin // - (contains more than one) + if (addr_full || (depth==2)) begin + // - (full, will not consume) + // - (full here if depth==2) + if (o_b) begin // - full & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else begin // - full & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; +// addr_ <= addr-1; +// state_ <= state_more; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? 
state_one : state_more; + end + end + else begin // - (mid: neither empty nor full) + if (i_v && o_b) begin // - mid & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= addr+1; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? state_one : state_more; + end + end // else: !if(addr_full) + end // case: state_more + + default: begin + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + end // case: default + + endcase // case(state) + end // always @ * + +endmodule // Q_srl + + +`endif // `ifdef Q_srl diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 5a846b12c1c91b89174d337d14a188a15ebd9a44..10f8b7feedf7584afb66a7fad8f1ee20745bf67d 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -68,6 +68,8 @@ class HLSCustomOp(CustomOp): "code_gen_dir_ipgen": ("s", False, ""), "executable_path": ("s", False, ""), "ipgen_path": ("s", False, ""), + "ip_path": ("s", False, ""), + "ip_vlnv": ("s", False, ""), "exec_mode": ("s", False, ""), "sim_cycles": ("i", False, 0), "rtlsim_trace": ("s", False, ""), @@ -146,6 +148,9 @@ class HLSCustomOp(CustomOp): builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name)) builder.build(code_gen_dir) self.set_nodeattr("ipgen_path", builder.ipgen_path) + self.set_nodeattr("ip_path", builder.ipgen_path + "/sol1/impl/ip") + vlnv = "xilinx.com:hls:%s:1.0" % node.name + self.set_nodeattr("ip_vlnv", vlnv) def code_generation_npysim(self, model): """Generates c++ code for simulation (npysim).""" diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 606c02778d8e44e93d621880bd769450da577ec5..00b8287a312fc82425b508ffef66f5187d074617 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -28,6 +28,7 @@ import math import os +import subprocess from shutil import copy import numpy as np @@ -130,9 +131,13 @@ class StreamingFCLayer_Batch(HLSCustomOp): def infer_node_datatype(self, model): node = self.onnx_node - # data type stays the same - dtype = model.get_tensor_datatype(node.input[0]) - model.set_tensor_datatype(node.output[0], dtype) + # check input datatype against property + idt_name = self.get_input_datatype().name + exp_idt_name = self.get_nodeattr("inputDataType") + assert exp_idt_name == idt_name, "Bad input DataType for StreamingFCLayer" + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) def verify_node(self): info_messages = [] @@ -493,22 +498,26 @@ class StreamingFCLayer_Batch(HLSCustomOp): np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped) """Saves weights into .dat file""" - # convert weight value sinto hexstring + # convert weight values into hexstring weight_width = self.get_weightstream_width() weight_tensor_unflipped = 
pack_innermost_dim_as_hex_string( - weight_tensor_unflipped, export_wdt, weight_width + weight_tensor_unflipped, export_wdt, weight_width, prefix="" ) - weight_pad = np.zeros((1024), int).astype(str) - weight_tensor_unflipped = weight_tensor_unflipped.flatten() - # delete "0x" in the beginning of the hexstring - for i in range(len(weight_tensor_unflipped)): - weight_tensor_unflipped[i] = weight_tensor_unflipped[i][2:] - weight_pad[: weight_tensor_unflipped.shape[0]] = weight_tensor_unflipped - weight_pad = weight_pad.copy() - f = open("{}/memblock_0.dat".format(code_gen_dir), "w+") - for val in weight_pad: - f.write(val + "\n") - f.close() + weight_stream_len = np.prod(weight_tensor_unflipped.shape) + assert ( + weight_stream_len <= 1024 + ), """Decoupled mem mode needs + weight stream length <= 1024 for now""" + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_unflipped.flatten() + pad_amt = 1024 - weight_stream_len + weight_stream = np.pad( + weight_stream, (0, pad_amt), mode="constant", constant_values="0" + ) + weight_stream = weight_stream.copy() + with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f: + for val in weight_stream: + f.write(val + "\n") else: raise Exception( @@ -920,7 +929,7 @@ class StreamingFCLayer_Batch(HLSCustomOp): "#pragma HLS INTERFACE axis port=weights" ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=8 variable=8" + "#pragma HLS stream depth=8 variable=weights" ) else: @@ -1023,3 +1032,35 @@ class StreamingFCLayer_Batch(HLSCustomOp): code_gen_dir, self.onnx_node.name ) copy(verilog_wrapper, verilog_folder) + # prepare the IP packaging tcl template + template = templates.ip_package_tcl + self.code_gen_dict["$TOPNAME$"] = [ + "{}_memstream".format(self.onnx_node.name) + ] + self.code_gen_dict["$VERILOG_DIR$"] = [verilog_folder] + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + f = open(os.path.join(verilog_folder, "package_ip.tcl"), "w") + f.write(template) + f.close() + # create a shell script and call Vivado to invoke the IP pkg script + make_project_sh = verilog_folder + "/make_ip.sh" + working_dir = os.environ["PWD"] + with open(make_project_sh, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(verilog_folder)) + f.write("vivado -mode batch -source package_ip.tcl\n") + f.write("cd {}\n".format(working_dir)) + bash_command = ["bash", make_project_sh] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + # re-set ip_path to point to the new packaged IP + self.set_nodeattr("ip_path", verilog_folder) + vlnv = "xilinx.com:hls:%s:1.0" % ( + "{}_memstream".format(self.onnx_node.name) + ) + self.set_nodeattr("ip_vlnv", vlnv) + self.code_gen_dict.clear() diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index c8191d714777b05b9cbb548eaa351b3e20d84a4b..90a54b019b090ea47e77c8efa841c86a1802edb5 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -1,3 +1,4 @@ +# flake8: noqa # Copyright (c) 2020, Xilinx # All rights reserved. 
# @@ -137,6 +138,14 @@ reg m_axis_0_tready; wire m_axis_0_tvalid; wire $WEIGHT_RANGE$ m_axis_0_tdata; +reg m_axis_0_tready_q; +wire m_axis_0_tvalid_q; +wire $WEIGHT_RANGE$ m_axis_0_tdata_q; + +reg m_axis_0_tready_q2; +wire m_axis_0_tvalid_q2; +wire $WEIGHT_RANGE$ m_axis_0_tdata_q2; + reg m_axis_1_afull = 0; reg m_axis_1_tready = 1; wire m_axis_1_tvalid; @@ -244,6 +253,43 @@ mem ); +// two consecutive weight streamer FIFOs to provide the same functionality +// as "programmable full" + +// weight streamer FIFO 1 +Q_srl #( +.depth(16), +.width($WEIGHT_WIDTH$) +) +$LAYER_NAME$_w_fifo_1 +( + .clock(ap_clk), + .reset(!ap_rst_n), + .i_d(m_axis_0_tdata), + .i_v(m_axis_0_tvalid), + .i_r(m_axis_0_tready), + .o_d(m_axis_0_tdata_q), + .o_v(m_axis_0_tvalid_q), + .o_r(m_axis_0_tready_q) +); + +// weight streamer FIFO 2 +Q_srl #( +.depth(16), +.width($WEIGHT_WIDTH$) +) +$LAYER_NAME$_w_fifo_2 +( + .clock(ap_clk), + .reset(!ap_rst_n), + .i_d(m_axis_0_tdata_q), + .i_v(m_axis_0_tvalid_q), + .i_r(m_axis_0_tready_q), + .o_d(m_axis_0_tdata_q2), + .o_v(m_axis_0_tvalid_q2), + .o_r(m_axis_0_tready_q2) +); + //MVA_Stream_Unit $LAYER_NAME$ @@ -254,9 +300,9 @@ MVA_Stream_U .in0_V_V_TDATA(in0_V_V_TDATA), //$IN_RANGE$ input .in0_V_V_TVALID(in0_V_V_TVALID), //input .in0_V_V_TREADY(in0_V_V_TREADY), //output -.weights_V_V_TDATA(m_axis_0_tdata), //$WEIGHT_RANGE$ input -.weights_V_V_TVALID(m_axis_0_tvalid), //input -.weights_V_V_TREADY(m_axis_0_tready), //output +.weights_V_V_TDATA(m_axis_0_tdata_q2), //$WEIGHT_RANGE$ input +.weights_V_V_TVALID(m_axis_0_tvalid_q2), //input +.weights_V_V_TREADY(m_axis_0_tready_q2), //output .out_V_V_TDATA(out_V_V_TDATA), //$OUT_RANGE$ output .out_V_V_TVALID(out_V_V_TVALID), //output .out_V_V_TREADY(out_V_V_TREADY) //input @@ -265,3 +311,103 @@ MVA_Stream_U endmodule """ + +ip_package_tcl = """ +## IP Info +set Vendor "xilinx.com" +set Library "hls" +set IPName "$TOPNAME$" +set Version "1.0" +set DisplayName "$TOPNAME$" +set Description "An IP generated by Xilinx FINN" +set Device "zynq" +set Catalog "/UserIP" +set RootDir "$VERILOG_DIR$" + +## Variables +set Top "$TOPNAME$" +set VerilogFiles [glob -nocomplain $RootDir/*] + + +## Enter IP directory +cd [file dir [info script]] + +## Generate sub cores +set IPs "" +set IPFiles "" + +## Basic info +set core [ipx::create_core $Vendor $Library $IPName $Version] +set_property display_name $DisplayName $core +set_property description $Description $core +set_property taxonomy $Catalog $core +set_property supported_families { \ + artix7 Production \ + artix7l Production \ + kintex7 Production \ + kintex7l Production \ + kintexu Production \ + kintexuplus Production \ + virtex7 Production \ + virtexu Production \ + virtexuplus Production \ + zynq Production \ + zynquplus Production \ + aartix7 Production \ + azynq Production \ + qartix7 Production \ + qkintex7 Production \ + qkintex7l Production \ + qvirtex7 Production \ + qzynq Production \ +} $core; + +## Add verilog files +if {[llength $VerilogFiles] > 0} { + # synthesis + set group [ipx::add_file_group xilinx_verilogsynthesis $core] + foreach f [concat $VerilogFiles $IPFiles] { + set current_file [ipx::add_file $f $group] + if {[file ext $f] == ".dat"} { + set_property type "mif" $current_file + } + } + set_property model_name $Top $group + if {$IPs != ""} { + set_property component_subcores $IPs $group + } + + # simulation + set group [ipx::add_file_group xilinx_verilogbehavioralsimulation $core] + foreach f [concat $VerilogFiles $IPFiles] { + set current_file [ipx::add_file $f $group] + if 
{[file ext $f] == ".dat"} { + set_property type "mif" $current_file + } + } + set_property model_name $Top $group + if {$IPs != ""} { + set_property component_subcores $IPs $group + } +} + +## Import ports +ipx::add_ports_from_hdl \ + -top_level_hdl_file $RootDir/$Top.v \ + -top_module_name $Top \ + $core + +## Infer interfaces +ipx::infer_bus_interface ap_clk xilinx.com:signal:clock_rtl:1.0 [ipx::current_core] +ipx::infer_bus_interface ap_rst_n xilinx.com:signal:reset_rtl:1.0 [ipx::current_core] +ipx::infer_bus_interface {in0_V_V_TDATA in0_V_V_TVALID in0_V_V_TREADY} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core] +ipx::infer_bus_interface {out_V_V_TREADY out_V_V_TDATA out_V_V_TVALID} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core] +ipx::associate_bus_interfaces -busif in0_V_V -clock ap_clk [ipx::current_core] +ipx::associate_bus_interfaces -busif out_V_V -clock ap_clk [ipx::current_core] + +## Finalize +set_property core_revision 2 [ipx::current_core] +ipx::create_xgui_files [ipx::current_core] +ipx::update_checksums [ipx::current_core] +ipx::save_core [ipx::current_core] +""" diff --git a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py index fcb4af37c951de3869b731e755ef48ba4fdb579f..0fbd83199d88ec68cbf11c6ded5af33fdd4d91a3 100644 --- a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py +++ b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py @@ -31,6 +31,7 @@ import subprocess from finn.transformation import Transformation from finn.util.basic import get_by_name, make_build_dir +from finn.custom_op.registry import getCustomOp class CodeGen_ipstitch(Transformation): @@ -65,16 +66,11 @@ class CodeGen_ipstitch(Transformation): backend_value == "fpgadataflow" ), """Backend node attribute is not set to "fpgadataflow".""" - ip_dir_attribute = get_by_name(node.attribute, "ipgen_path") - assert ( - ip_dir_attribute is not None - ), """Node attribute "ipgen_path" is not set. - Please run transformation CodeGen_ipgen first.""" - ip_dir_value = ip_dir_attribute.s.decode("UTF-8") - ip_dir_value += "/sol1/impl/ip" + node_inst = getCustomOp(node) + ip_dir_value = node_inst.get_nodeattr("ip_path") assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist." ip_dirs += [ip_dir_value] - vlnv = "xilinx.com:hls:%s:1.0" % node.name + vlnv = node_inst.get_nodeattr("ip_vlnv") inst_name = node.name create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name) create_cmds += [create_cmd] diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 113ea8ea02dc64aacf92b3fc3f5dda6417e25517..dbd98623c4cdf5baca9fa9c137debf8be0f70981 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -139,6 +139,10 @@ class InferBinaryStreamingFCLayer(Transformation): StreamingFCLayer_Batch layers. 
Any immediately following MultiThreshold layers will also be absorbed into the MVTU.""" + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + def apply(self, model): graph = model.graph node_ind = 0 @@ -219,6 +223,7 @@ class InferBinaryStreamingFCLayer(Transformation): binaryXnorMode=1, noActivation=0, numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -249,6 +254,7 @@ class InferBinaryStreamingFCLayer(Transformation): binaryXnorMode=1, noActivation=1, numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old node @@ -265,6 +271,10 @@ class InferQuantizedStreamingFCLayer(Transformation): StreamingFCLayer_Batch layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.""" + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + def apply(self, model): graph = model.graph node_ind = 0 @@ -347,6 +357,7 @@ class InferQuantizedStreamingFCLayer(Transformation): binaryXnorMode=0, noActivation=0, numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -377,6 +388,7 @@ class InferQuantizedStreamingFCLayer(Transformation): binaryXnorMode=0, noActivation=1, numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old node diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py index 41498edc078506b0d6db87f28dce558fdf5a1aa4..c2c3802635ba8b1be9bf7f0c71e48ad13b79771f 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py @@ -70,13 +70,12 @@ class MakePYNQProject(Transformation): # collect list of all IP dirs ip_dirs = ["list"] for node in model.graph.node: - ip_dir_attribute = get_by_name(node.attribute, "ipgen_path") + ip_dir_attribute = get_by_name(node.attribute, "ip_path") assert ( ip_dir_attribute is not None - ), """Node attribute "ipgen_path" is + ), """Node attribute "ip_path" is empty. Please run transformation HLSSynth_ipgen first.""" ip_dir_value = ip_dir_attribute.s.decode("UTF-8") - ip_dir_value += "/sol1/impl/ip" assert os.path.isdir( ip_dir_value ), """The directory that should diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py index dc0a17893d9d9aa8f25fa7ca67242fca94810e3d..dce62c20b99097feee7208cbf57aa8921ddb3566 100644 --- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py +++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py @@ -63,6 +63,9 @@ class ReplaceVerilogRelPaths(Transformation): old = '$readmemh(".' 
new = '$readmemh("%s' % dname s = s.replace(old, new) + old = '"./' + new = '"%s/' % dname + s = s.replace(old, new) with open(fpath, "w") as f: f.write(s) except KeyError: diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 3788b2b6acb8bcb3c5919d3e1f0185dcc82aa4af..1d919de5d55363bbe71f0dfc44ca6fe3025f5a4a 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -119,7 +119,9 @@ def npbytearray2hexstring(npbytearray, prefix="0x"): return prefix + binascii.hexlify(bytearray(npbytearray)).decode("utf-8") -def pack_innermost_dim_as_hex_string(ndarray, dtype, pad_to_nbits, reverse_inner=False): +def pack_innermost_dim_as_hex_string( + ndarray, dtype, pad_to_nbits, reverse_inner=False, prefix="0x" +): """Pack the innermost dimension of the given numpy ndarray into hex strings using array2hexstring. @@ -143,7 +145,9 @@ def pack_innermost_dim_as_hex_string(ndarray, dtype, pad_to_nbits, reverse_inner ndarray = np.asarray(ndarray, dtype=np.float32) def fun(x): - return array2hexstring(x, dtype, pad_to_nbits, reverse=reverse_inner) + return array2hexstring( + x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix + ) return np.apply_along_axis(fun, ndarray.ndim - 1, ndarray) diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py index 62d45ea8ba6f9398ff070d28168dc48eda37de42..80c9e84ba92c93e8a5d57ffaceb22b5abf188963 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py +++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py @@ -132,19 +132,19 @@ def prepare_inputs(input_tensor, idt, wdt): # mem_mode: const or decoupled @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) # activation: None or DataType -@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2]) +@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4]) # weight datatype -@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT2]) +@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT4]) # input datatype -@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2]) +@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4]) # neuron folding, -1 is maximum possible @pytest.mark.parametrize("nf", [-1, 2, 1]) # synapse folding, -1 is maximum possible @pytest.mark.parametrize("sf", [-1, 2, 1]) # HLS matrix width (input features) -@pytest.mark.parametrize("mw", [4]) +@pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) -@pytest.mark.parametrize("mh", [4]) +@pytest.mark.parametrize("mh", [16]) def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh @@ -217,19 +217,19 @@ def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # mem_mode: const or decoupled @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) # activation: None or DataType -@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2]) +@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4]) # weight datatype -@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT2]) +@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT4]) # input datatype -@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2]) +@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4]) # neuron folding, -1 is maximum possible @pytest.mark.parametrize("nf", [-1, 2, 1]) # synapse folding, -1 is maximum possible @pytest.mark.parametrize("sf", [-1, 2, 1]) # HLS 
matrix width (input features) -@pytest.mark.parametrize("mw", [4]) +@pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) -@pytest.mark.parametrize("mh", [4]) +@pytest.mark.parametrize("mh", [16]) def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py index eeff14c4d7c4aa8213f8673d9dd6a4745ececb1a..4a81977d49d174f66e1a02140a7643bd352db7a2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py @@ -48,12 +48,9 @@ from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject +import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import ( - calculate_signed_dot_prod_range, - gen_finn_dt_tensor, - pynq_part_map, -) +from finn.util.basic import gen_finn_dt_tensor, pynq_part_map from finn.util.fpgadataflow import pyverilate_stitched_ip test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -66,7 +63,7 @@ def create_one_fc_model(): # create a model with a StreamingFCLayer instance with no activation # the wider range of the full accumulator makes debugging a bit easier wdt = DataType.INT2 - idt = DataType.INT2 + idt = DataType.INT32 odt = DataType.INT32 m = 4 no_act = 1 @@ -119,13 +116,11 @@ def create_one_fc_model(): def create_two_fc_model(): # create a model with two StreamingFCLayer instances wdt = DataType.INT2 - idt = DataType.INT2 - odt = DataType.INT2 - act = DataType.INT2 + idt = DataType.INT32 + odt = DataType.INT32 m = 4 - tdt = DataType.INT32 - actval = odt.min() - no_act = 0 + actval = 0 + no_act = 1 binary_xnor_mode = 0 pe = 2 simd = 2 @@ -136,7 +131,7 @@ def create_two_fc_model(): fc0 = helper.make_node( "StreamingFCLayer_Batch", - ["inp", "w0", "t0"], + ["inp", "w0"], ["mid"], domain="finn", backend="fpgadataflow", @@ -151,11 +146,12 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, + mem_mode="decoupled", ) fc1 = helper.make_node( "StreamingFCLayer_Batch", - ["mid", "w1", "t1"], + ["mid", "w1"], ["outp"], domain="finn", backend="fpgadataflow", @@ -170,6 +166,7 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, + mem_mode="decoupled", ) graph = helper.make_graph( @@ -190,31 +187,19 @@ def create_two_fc_model(): model.set_tensor_datatype("w1", wdt) # generate weights - w0 = gen_finn_dt_tensor(wdt, (m, m)) - w1 = gen_finn_dt_tensor(wdt, (m, m)) + w0 = np.eye(m, dtype=np.float32) + w1 = np.eye(m, dtype=np.float32) model.set_initializer("w0", w0) model.set_initializer("w1", w1) - # generate thresholds - (min, max) = calculate_signed_dot_prod_range(idt, wdt, m) - n_steps = act.get_num_possible_values() - 1 - t0 = np.random.randint(min, max - 1, (m, n_steps)).astype(np.float32) - t1 = np.random.randint(min, max - 1, (m, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - t0 = np.sort(t0, axis=1) - t1 = np.sort(t1, axis=1) - - model.set_initializer("t0", t0) - model.set_initializer("t1", t1) - model.set_tensor_datatype("t0", tdt) - model.set_tensor_datatype("t1", tdt) + model = 
model.transform(CreateDataflowPartition()) return model # exec_mode of StreamingDataflowPartition # @pytest.mark.parametrize("exec_mode", ["remote_pynq"]) #, "rtlsim"]) def test_fpgadataflow_ipstitch_gen_model(): # exec_mode): - model = create_one_fc_model() + model = create_two_fc_model() if model.graph.node[0].op_type == "StreamingDataflowPartition": sdp_node = getCustomOp(model.graph.node[0]) assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" @@ -234,6 +219,7 @@ def test_fpgadataflow_ipstitch_do_stitch(): model = ModelWrapper( ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx" ) + model = model.transform(rvp.ReplaceVerilogRelPaths()) model = model.transform(CodeGen_ipstitch(test_fpga_part)) vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj") assert vivado_stitch_proj_dir is not None @@ -247,6 +233,7 @@ def test_fpgadataflow_ipstitch_do_stitch(): def test_fpgadataflow_ipstitch_rtlsim(): model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx") + model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd") sim = pyverilate_stitched_ip(model) exp_io = [ "ap_clk_0", @@ -265,6 +252,8 @@ def test_fpgadataflow_ipstitch_rtlsim(): idt = model.get_tensor_datatype("inp") ishape = model.get_tensor_shape("inp") x = gen_finn_dt_tensor(idt, ishape) + # x = np.zeros(ishape, dtype=np.float32) + # x = np.asarray([[-2, -1, 0, 1]], dtype=np.float32) rtlsim_res = execute_onnx(model, {"inp": x})["outp"] assert (rtlsim_res == x).all()
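
Usage sketch (hedged): the changes above add a "decoupled" weight memory mode for StreamingFCLayer_Batch, per-node ip_path/ip_vlnv attributes consumed by CodeGen_ipstitch and MakePYNQProject, and a ReplaceVerilogRelPaths fix so the memstream .dat files resolve after the IP is copied; the test above runs that pass before stitching. The snippet below shows one way these pieces compose. It assumes a prepared ModelWrapper, and the import paths and argument lists for CodeGen_ipgen and HLSSynth_IPGen (as well as the 5 ns clock value and the placeholder ONNX file name) are assumptions based on the surrounding FINN flow, not taken from this diff.

from finn.core.modelwrapper import ModelWrapper
from finn.transformation.general import GiveUniqueNodeNames
from finn.transformation.fpgadataflow.convert_to_hls_layers import (
    InferQuantizedStreamingFCLayer,
)
from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen  # assumed path
from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen  # assumed path
from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
    ReplaceVerilogRelPaths,
)
from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
from finn.util.basic import pynq_part_map

fpga_part = pynq_part_map["Pynq-Z1"]

model = ModelWrapper("quantized_fc_model.onnx")  # placeholder model file
# lower MatMul(+MultiThreshold) pairs to StreamingFCLayer_Batch nodes that
# stream their weights from the memstream RTL instead of baking them into HLS
model = model.transform(InferQuantizedStreamingFCLayer(mem_mode="decoupled"))
model = model.transform(GiveUniqueNodeNames())
# per-node HLS code generation and synthesis; this is where ip_path and
# ip_vlnv get set, and where decoupled-mode nodes additionally package their
# memstream Verilog wrapper as a standalone IP and re-point ip_path/ip_vlnv
model = model.transform(CodeGen_ipgen(fpga_part, 5))  # assumed (part, clk_ns) arguments
model = model.transform(HLSSynth_IPGen())
# rewrite relative $readmemh()/.dat paths to absolute ones, then stitch all
# node IPs into one block design using their ip_path/ip_vlnv attributes
model = model.transform(ReplaceVerilogRelPaths())
model = model.transform(CodeGen_ipstitch(fpga_part))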