diff --git a/custom_hls/lookup.hpp b/custom_hls/lookup.hpp index 3001f6613ec6ed9a9e5f47d9be356d4b032f7192..037b038a09a10ff2bd066740d20f0b47489e24e4 100644 --- a/custom_hls/lookup.hpp +++ b/custom_hls/lookup.hpp @@ -26,14 +26,15 @@ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - *******************************************************************************/ +*******************************************************************************/ +#ifndef LOOKUP_HPP +#define LOOKUP_HPP #include <ap_int.h> #include <hls_stream.h> -#ifndef LOOKUP_HPP -#define LOOKUP_HPP +#include "utils.hpp" + template < unsigned NumEmbeddings, @@ -57,4 +58,50 @@ void StreamingLookup( } } +/** + * Lookup implementation over a table stored in AXI-accessible memory. + */ +template < + unsigned EmbeddingSize, // Number of memory words per embedding + unsigned EmbeddingAlign = clog2(EmbeddingSize), // Alignment of entries = number of word index bits + typename T_SRC, + typename T_DST +> +void StreamingLookup_ext( + hls::stream<T_SRC> &in0, + hls::stream<T_DST> &out, + T_DST const *const mem, + unsigned const size, + unsigned &oob_count, + bool &oob_irq +) { +#pragma HLS pipeline II=EmbeddingSize+9 style=flp + + static unsigned oob_count_li; + static unsigned oob_count_int; +#pragma HLS reset variable=oob_count_li +#pragma HLS reset variable=oob_count_int + + if(oob_count != oob_count_li) { + oob_count_int -= oob_count_li; + oob_count_li = oob_count; + } + if(!in0.empty()) { + T_SRC const x = in0.read(); + + // Map out-of-bounds inputs to an offset of zero and increment counter + bool const oob = x >= T_SRC(size); + ap_uint<T_SRC::width+EmbeddingAlign> const ofs = + ((oob? 
T_SRC(0) : x), ap_uint<EmbeddingAlign>(0)); + oob_count_int += oob; + + // Stream lookup data (burst inferred) + for(unsigned i = 0; i < EmbeddingSize; i++) { +#pragma HLS pipeline II=1 style=flp + out.write(mem[ofs+i]); + } + } + oob_count = oob_count_int; + oob_irq = (oob_count_int != 0); +} #endif diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 9c18c03d7bdb8406d43aa8fc4efdb8a206b1217e..b3c669ec1097745bd30f650ca0b9dacda647c61d 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -65,7 +65,7 @@ RUN locale-gen "en_US.UTF-8" RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev RUN git clone https://github.com/verilator/verilator RUN cd verilator && \ - git checkout v4.012 && \ + git checkout v4.224 && \ autoconf && \ ./configure && \ make -j4 && \ diff --git a/fetch-repos.sh b/fetch-repos.sh index 74d910478e83ce9a18000350c04e213a3e1f381e..b0f6400ed142b203b1c9f6d7ea4ac6ababcf34d1 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,12 +27,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-QONNX_COMMIT="398a0ecfcb32407c0a3df39246cf6d2bca02886c" +QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6" FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" -PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2" +PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="79d7c61fbe318bfcd56e3c35bbfb774995a7870c" +HLSLIB_COMMIT="d27f6b6c5d8f1bb208db395659389603f63ad4be" OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v index b4e89628a44bb1f55c3445ee8e6866beada23585..11cef604e0a3d106529a65ae229bc4cb419c4d70 100644 --- a/finn-rtllib/memstream/hdl/Q_srl.v +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -69,7 +69,7 @@ `define Q_srl -module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) parameter width = 16; // - width of data (i_d, o_d) @@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); wire o_b; // - output stream back-pressure output [addrwidth:0] count; // - output number of elems in queue + output [addrwidth:0] maxcount; // - maximum observed count since reset + reg [addrwidth:0] maxcount_reg; // - maximum count seen until now reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address // for data output reg shift_en_; // - SRL16 shift enable @@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); assign o_d = srlo; // - output data from queue assign o_v = o_v_reg; // - output valid if non-empty assign i_b = i_b_reg; // - input bp if full + assign maxcount = maxcount_reg; assign i_r = !i_b; assign 
o_b = !o_r; @@ -139,7 +142,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr <= 0; addr_full <= 0; o_v_reg <= 0; - i_b_reg <= 1; + + i_b_reg <= 0; + maxcount_reg <= 0; + end else begin state <= state_; @@ -147,6 +153,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); addr_full <= addr_full_; o_v_reg <= o_v_reg_; i_b_reg <= i_b_reg_; + maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg); end end // always @ (posedge clock) diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv new file mode 100644 index 0000000000000000000000000000000000000000..97517438a0c261e4488b74a677a352f9dc51743b --- /dev/null +++ b/finn-rtllib/swg/swg_template_default.sv @@ -0,0 +1,351 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +module $TOP_MODULE_NAME$_controller #( + int unsigned LOOP_H_ITERATIONS = $LOOP_H_ITERATIONS$, + int unsigned LOOP_W_ITERATIONS = $LOOP_W_ITERATIONS$, + int unsigned LOOP_KH_ITERATIONS = $LOOP_KH_ITERATIONS$, + int unsigned LOOP_KW_ITERATIONS = $LOOP_KW_ITERATIONS$, + int unsigned LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$, + + int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$, + bit [INCR_BITWIDTH-1:0] ADDR_INCREMENT_MAP[6] = $ADDR_INCREMENT_MAP$, + + bit IS_DEPTHWISE = $IS_DEPTHWISE$ +)( + input logic clk, + input logic rst_n, + + input logic advance, + output logic [INCR_BITWIDTH-1:0] addr_incr, + output logic [INCR_BITWIDTH-1:0] tail_incr +); + + // state and counters + typedef enum logic [2:0] { + STATE_START, + STATE_LOOP_SIMD, + STATE_LOOP_KW, + STATE_LOOP_KH, + STATE_LOOP_W, + STATE_LOOP_H + } state_e; + state_e State = $INNERMOST_STATE$; + state_e state_next; + + logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS-1; + logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS-1; + logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS-1; + logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS-1; + logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS-1; + + assign addr_incr = 
ADDR_INCREMENT_MAP[State]; + + // combinational logic for tail_incr generation + uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; + always_comb begin : blkTail + if (tail_incr_inner_condition) + tail_incr = 1; + else if (Counter_loop_w >= 0) + tail_incr = $TAIL_INCR_W$; + else if (Counter_loop_h >= 0) + tail_incr = $TAIL_INCR_H$; + else + tail_incr = $TAIL_INCR_LAST$; + end + + // combinational next state logic + always_comb begin : blkState + state_next = State; + if(State != $INNERMOST_STATE$) state_next = $INNERMOST_STATE$; + else begin + if(Counter_loop_simd < 0) begin + state_next = + (Counter_loop_kw >= 0)? STATE_LOOP_KW : + (Counter_loop_kh >= 0)? STATE_LOOP_KH : + (Counter_loop_w >= 0)? STATE_LOOP_W : + (Counter_loop_h >= 0)? STATE_LOOP_H : + /* else */ STATE_START; + end + end + end : blkState + + // sequential logic + always_ff @ (posedge clk) begin + if(!rst_n) begin + State <= $INNERMOST_STATE$; + Counter_loop_h <= LOOP_H_ITERATIONS-1; + Counter_loop_w <= LOOP_W_ITERATIONS-1; + Counter_loop_kh <= LOOP_KH_ITERATIONS-1; + Counter_loop_kw <= LOOP_KW_ITERATIONS-1; + Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1; + end + else if(advance) begin + State <= state_next; + if (State == $INNERMOST_STATE$) begin + if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1; + else begin + Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1; + if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1; + else begin + Counter_loop_kw <= LOOP_KW_ITERATIONS-1; + if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1; + else begin + Counter_loop_kh <= LOOP_KH_ITERATIONS-1; + if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1; + else begin + Counter_loop_w <= LOOP_W_ITERATIONS-1; + if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1; + else Counter_loop_h <= LOOP_H_ITERATIONS-1; + end + end + end + end + end + end + end + +endmodule : $TOP_MODULE_NAME$_controller + +module $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + 
int unsigned WIDTH, + int unsigned DEPTH +)( + input logic clk, + input logic rst_n, + + input logic write_enable, + input logic [$clog2(DEPTH)-1:0] write_addr, + input logic [WIDTH-1:0] data_in, + + input logic read_enable, + input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) read address of cyclic buffer + output logic [WIDTH-1:0] data_out +); + + $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH]; + logic [WIDTH-1:0] Out = 'x; + always_ff @(posedge clk) begin + if (read_enable) Out <= Ram[read_addr]; + if (write_enable) Ram[write_addr] <= data_in; + end + assign data_out = Out; + +endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable + +module $TOP_MODULE_NAME$_impl #( + int BIT_WIDTH, + int SIMD, + int MMV_IN, + int MMV_OUT, + int LAST_READ_ELEM = $LAST_READ_ELEM$, + int LAST_WRITE_ELEM = $LAST_WRITE_ELEM$, + int BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$, + int ELEM_PER_WINDOW = $ELEM_PER_WINDOW$, + int INCR_BITWIDTH = $INCR_BITWIDTH$ +)( + input logic ap_clk, + input logic ap_rst_n, + + input logic in0_V_V_TVALID, + output logic in0_V_V_TREADY, + input logic [BIT_WIDTH * SIMD * MMV_IN-1:0] in0_V_V_TDATA, + + output logic out_V_V_TVALID, + input logic out_V_V_TREADY, + output logic [BIT_WIDTH * SIMD * MMV_OUT-1:0] out_V_V_TDATA +); + // derived Constants + localparam int unsigned BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; + localparam int unsigned BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; + localparam int unsigned BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + + // main buffer instantiation + uwire [BUF_IN_WIDTH -1:0] window_buffer_in; + uwire [BUF_OUT_WIDTH-1:0] window_buffer_out; + uwire window_buffer_write_enable; + uwire window_buffer_read_enable; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr; + $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + .WIDTH(BUF_IN_WIDTH), + .DEPTH(BUF_ELEM_TOTAL) + ) window_buffer_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + + 
.write_enable(window_buffer_write_enable), + .write_addr(window_buffer_write_addr), + .data_in(window_buffer_in), + + .read_enable(window_buffer_read_enable), + .read_addr(window_buffer_read_addr), + .data_out(window_buffer_out) + ); + + //controller instantiation + uwire advance_controller; + uwire signed [INCR_BITWIDTH-1:0] addr_incr; + uwire [INCR_BITWIDTH-1:0] tail_incr; + $TOP_MODULE_NAME$_controller controller_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + .advance(advance_controller), + .addr_incr(addr_incr), + .tail_incr(tail_incr) + ); + + // Counters/address registers + // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg, + // so we can use automatic sign extension and simplify calculations w/ signed increment. + // Alternatively, we could manually sign-extend and shave off a bit here or there. + logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = 0; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] First_elem_next_window = 0; + logic [$clog2(ELEM_PER_WINDOW) -1:0] Position_in_window = 0; + logic [$clog2(BUF_ELEM_TOTAL)+1 -1:0] Window_buffer_read_addr_reg = 0; + logic [$clog2(BUF_ELEM_TOTAL)-1:0] Window_buffer_write_addr_reg = 0; + + // Control signals/registers + uwire read_cmd = + !reading_done && ( // if there is still an input element left to read + Fetching_done || ( // if fetching is done (e.g. 
for skipped rows at FM end due to stride) + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) && + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem) + ) // (over-)write to buffer if oldest buffered element will no longer be needed + ); + uwire read_ok = read_cmd && in0_V_V_TVALID; + uwire reading_done = Newest_buffered_elem == LAST_READ_ELEM; + + uwire fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done; + logic Fetching_done = 0; + + logic Write_cmd = 0; + logic Writing_done = 0; + uwire write_ok = Write_cmd && out_V_V_TREADY; + uwire write_blocked = Write_cmd && !out_V_V_TREADY;; + + //assign buffer control + assign window_buffer_write_addr = Window_buffer_write_addr_reg; + assign window_buffer_read_addr = Window_buffer_read_addr_reg; + assign window_buffer_write_enable = read_ok; + assign window_buffer_read_enable = fetch_cmd; + assign advance_controller = fetch_cmd; + + //assign I/O ports + assign window_buffer_in = in0_V_V_TDATA; + assign out_V_V_TDATA = window_buffer_out; + assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed) + assign out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink) + + //main process for advancing counters + always_ff @(posedge ap_clk) begin + if(!ap_rst_n) begin + Newest_buffered_elem <= -1; + Current_elem <= 0; + First_elem_next_window <= 0; + Position_in_window <= 0; + Window_buffer_read_addr_reg <= 0; + Window_buffer_write_addr_reg <= 0; + Fetching_done <= 0; + Write_cmd <= 0; + Writing_done <= 0; + end + else begin + if (read_ok) begin + Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)? 
0 : Window_buffer_write_addr_reg + 1; + Newest_buffered_elem <= Newest_buffered_elem+1; + + if (Newest_buffered_elem == LAST_READ_ELEM-1) begin + Window_buffer_write_addr_reg <= 0; + end + //check if this is the last read cycle (reading_done will be true afterwards) + if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin + //start processing of next FM if writing is done already (possible due to unused input elements at the tail end) + //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM) + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Writing_done <= 0; + Fetching_done <= 0; + end + end + + if (fetch_cmd) begin + //count up to track which element index is about to be read from the buffer, and where it is located within the buffer + //use increment value calculated by controller + + // absolute buffer address wrap-around + automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0] ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr); + automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0] ra_correct = + (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL : + (ra < 0)? BUF_ELEM_TOTAL : 0; + Window_buffer_read_addr_reg <= ra + ra_correct; + + //keep track where we are within a window + Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? 
Position_in_window+1 : 0; + + //update first element of next window to allow buffer overwrite up until that point + if (Position_in_window == 0) + First_elem_next_window <= First_elem_next_window + tail_incr; + + //check if this is the last write cycle (Writing_done will be true afterwards) + if (Current_elem == LAST_WRITE_ELEM) + Fetching_done <= 1; + else + Current_elem <= $signed(Current_elem) + addr_incr; + + // determine if prefetched data will be outstanding in the next cycle + // if we fetch in this cycle -> yes + // if we do not fetch nor write -> do not change + // if we do not fetch but write successfully-> clear outstanding data + Write_cmd <= fetch_cmd; + end + + if (write_ok) + Write_cmd <= fetch_cmd; + + if (write_ok && Fetching_done) begin + //check if this is the last write cycle (Writing_done will be true afterwards) + if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin + //start processing of next FM if reading is done already, or completes in the same cycle + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Fetching_done <= 0; + end else + Writing_done <= 1; + end + end + end + +endmodule : $TOP_MODULE_NAME$_impl diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v new file mode 100644 index 0000000000000000000000000000000000000000..0cc3579a255fddaf1a470d440b9e8ac245abe486 --- /dev/null +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -0,0 +1,75 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +`timescale 1 ns / 1 ps + +module $TOP_MODULE_NAME$ ( +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_rst_n, +input [BUF_IN_WIDTH-1:0] in0_V_TDATA, +input in0_V_TVALID, +output in0_V_TREADY, +output [BUF_OUT_WIDTH-1:0] out_V_TDATA, +output out_V_TVALID, +input out_V_TREADY +); + +// top-level parameters (set via code-generation) +parameter BIT_WIDTH = $BIT_WIDTH$; +parameter SIMD = $SIMD$; +parameter MMV_IN = $MMV_IN$; +parameter MMV_OUT = $MMV_OUT$; + +// derived constants +parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; +parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + +$TOP_MODULE_NAME$_impl +#( + .BIT_WIDTH(BIT_WIDTH), + .SIMD(SIMD), + .MMV_IN(MMV_IN), + .MMV_OUT(MMV_OUT) +) +impl +( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .in0_V_V_TDATA(in0_V_TDATA), + .in0_V_V_TVALID(in0_V_TVALID), + .in0_V_V_TREADY(in0_V_TREADY), + .out_V_V_TDATA(out_V_TDATA), + .out_V_V_TVALID(out_V_TVALID), + .out_V_V_TREADY(out_V_TREADY) +); + +endmodule //TOP_MODULE_NAME diff --git a/requirements.txt b/requirements.txt index 970acc342bb7984e69929d1ef5eaa027b765ced0..9038a5e8170301421529e0b570482316e4fff20a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ onnx==1.11.0 onnxoptimizer onnxruntime==1.11.1 pre-commit==2.9.2 -protobuf==3.20.1 +protobuf==3.20.2 pyscaffold==3.2.1 scipy==1.5.2 setupext-janitor>=1.1.2 diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 92263bd82ce291833c6868847876ac7e3b68e6f8..d3c4156d9b4ccf601d3eea348f6cb61c0d9a6e87 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -37,6 +37,13 @@ from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map +class 
AutoFIFOSizingMethod(str, Enum): + "Select the type of automatic FIFO sizing strategy." + + CHARACTERIZE = "characterize" + LARGEFIFO_RTLSIM = "largefifo_rtlsim" + + class ShellFlowType(str, Enum): """For builds that produce a bitfile, select the shell flow that will integrate the FINN-generated accelerator.""" @@ -246,6 +253,12 @@ class DataflowBuildConfig: #: for each FIFO. auto_fifo_depths: Optional[bool] = True + #: When `auto_fifo_depths = True`, select which method will be used for + #: setting the FIFO sizes. + auto_fifo_strategy: Optional[ + AutoFIFOSizingMethod + ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + #: Memory resource type for large FIFOs #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO @@ -258,6 +271,10 @@ class DataflowBuildConfig: #: Which memory mode will be used for compute layers default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED + #: Force inference of RTL ConvolutionInputGenerator over HLS implementation + #: If set to False, falls back to the default behavior of InferConvInpGen() + force_rtl_conv_inp_gen: Optional[bool] = False + #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` #: e.g. "xilinx_u250_xdma_201830_2" @@ -316,6 +333,10 @@ class DataflowBuildConfig: #: Override the number of inputs for rtlsim performance measurement. rtlsim_batch_size: Optional[int] = 1 + #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during + #: rtlsim, otherwise they will be replaced by HLS implementations. 
+ rtlsim_use_vivado_comps: Optional[bool] = True + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 59f77650da5c3c3f9db0ea65e2288544b376bec3..5da608c27def8136f9ad11f62b4707452eac3120 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -29,6 +29,7 @@ import json import numpy as np import os +import shutil from copy import deepcopy from distutils.dir_util import copy_tree from qonnx.core.modelwrapper import ModelWrapper @@ -78,6 +79,10 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.derive_characteristic import ( + DeriveCharacteristic, + DeriveFIFOSizes, +) from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -85,6 +90,7 @@ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) @@ -121,81 +127,126 @@ def verify_step( verify_out_dir = cfg.output_dir + "/verification_output" intermediate_models_dir = cfg.output_dir + "/intermediate_models" os.makedirs(verify_out_dir, exist_ok=True) - (in_npy, exp_out_npy) = cfg._resolve_verification_io_pair() - if need_parent: - assert ( - cfg.save_intermediate_models - ), "Enable 
save_intermediate_models for verification" - parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" - child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name - model.save(child_model_fn) - out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name - out_dict = execute_parent( - parent_model_fn, child_model_fn, in_npy, return_full_ctx=True - ) - out_npy = out_dict[out_tensor_name] - else: - inp_tensor_name = model.graph.input[0].name - out_tensor_name = model.graph.output[0].name - inp_dict = {inp_tensor_name: in_npy} - if rtlsim_pre_hook is not None: - out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + (in_npy_all, exp_out_npy_all) = cfg._resolve_verification_io_pair() + bsize_in = in_npy_all.shape[0] + bsize_out = exp_out_npy_all.shape[0] + assert bsize_in == bsize_out, "Batch sizes don't match for verification IO pair" + all_res = True + for b in range(bsize_in): + in_npy = np.expand_dims(in_npy_all[b], axis=0) + exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0) + if need_parent: + assert ( + cfg.save_intermediate_models + ), "Enable save_intermediate_models for verification" + parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" + child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name + model.save(child_model_fn) + parent_model = ModelWrapper(parent_model_fn) + out_tensor_name = parent_model.graph.output[0].name + exp_ishape = parent_model.get_tensor_shape(parent_model.graph.input[0].name) + if in_npy.shape != exp_ishape: + print( + "Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + out_dict = execute_parent( + parent_model_fn, child_model_fn, in_npy, return_full_ctx=True + ) + out_npy = out_dict[out_tensor_name] else: - out_dict = execute_onnx(model, inp_dict, True) - out_npy = out_dict[out_tensor_name] - res = 
np.isclose(exp_out_npy, out_npy, atol=1e-3).all() - res_to_str = {True: "SUCCESS", False: "FAIL"} - res_str = res_to_str[res] - if cfg.verify_save_full_context: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % ( - step_name, - res_str, - ) - np.savez(verification_output_fn, **out_dict) - else: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % ( - step_name, - res_str, - ) - np.save(verification_output_fn, out_npy) - print("Verification for %s : %s" % (step_name, res_str)) + inp_tensor_name = model.graph.input[0].name + out_tensor_name = model.graph.output[0].name + exp_ishape = model.get_tensor_shape(inp_tensor_name) + if in_npy.shape != exp_ishape: + print( + "Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + inp_dict = {inp_tensor_name: in_npy} + if rtlsim_pre_hook is not None: + out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + else: + out_dict = execute_onnx(model, inp_dict, True) + out_npy = out_dict[out_tensor_name] + exp_oshape = exp_out_npy.shape + if out_npy.shape != exp_oshape: + print( + "Verification output has shape %s while model produces %s" + % (str(exp_oshape), str(out_npy.shape)) + ) + print("Attempting to force model shape on verification output") + out_npy = out_npy.reshape(exp_oshape) + + res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all() + all_res = all_res and res + res_to_str = {True: "SUCCESS", False: "FAIL"} + res_str = res_to_str[res] + if cfg.verify_save_full_context: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npz" % ( + step_name, + b, + res_str, + ) + np.savez(verification_output_fn, **out_dict) + else: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npy" % ( + step_name, + b, + res_str, + ) + np.save(verification_output_fn, out_npy) + if cfg.verify_save_rtlsim_waveforms: + vcd_path = 
model.get_metadata_prop("rtlsim_trace") + if vcd_path is not None and os.path.isfile(vcd_path): + new_vcd_path = vcd_path.replace(".vcd", "_%d.vcd" % b) + shutil.move(vcd_path, new_vcd_path) + print("Verification for %s : %s" % (step_name, res_to_str[all_res])) def prepare_for_stitched_ip_rtlsim(verify_model, cfg): - need_restitch = False - # rtlsim only supports certain impl_style for some nodes - # StreamingFIFO must have impl_style=rtl - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): - inst = getCustomOp(fifo_layer) - if inst.get_nodeattr("impl_style") != "rtl": - inst.set_nodeattr("impl_style", "rtl") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # StreamingDataWidthConverter must have impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type( - "StreamingDataWidthConverter_Batch" - ): - inst = getCustomOp(dwc_layer) - if inst.get_nodeattr("impl_style") != "hls": - inst.set_nodeattr("impl_style", "hls") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # if we've made alterations to the model, need to do some re-prep - if need_restitch: - print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") - verify_model = verify_model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) - verify_model = verify_model.transform(HLSSynthIP()) - verify_model = verify_model.transform( - CreateStitchedIP( - cfg._resolve_fpga_part(), - cfg.synth_clk_period_ns, - vitis=False, + if not cfg.rtlsim_use_vivado_comps: + need_restitch = False + # switch impl_style=vivado components to rtl/hls + # StreamingFIFO must have impl_style=rtl + for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): + inst = getCustomOp(fifo_layer) + if inst.get_nodeattr("impl_style") != "rtl": + inst.set_nodeattr("impl_style", "rtl") + inst.set_nodeattr("code_gen_dir_ipgen", "") + 
inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # StreamingDataWidthConverter must have impl_style=hls + for dwc_layer in verify_model.get_nodes_by_op_type( + "StreamingDataWidthConverter_Batch" + ): + inst = getCustomOp(dwc_layer) + if inst.get_nodeattr("impl_style") != "hls": + inst.set_nodeattr("impl_style", "hls") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # if we've made alterations to the model, need to do some re-prep + if need_restitch: + print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") + verify_model = verify_model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) ) - ) + verify_model = verify_model.transform(HLSSynthIP()) + verify_model = verify_model.transform( + CreateStitchedIP( + cfg._resolve_fpga_part(), + cfg.synth_clk_period_ns, + vitis=False, + ) + ) + else: + print("rtlsim_use_vivado_comps is enabled, may yield incorrect results") + # set top-level prop for stitched-ip rtlsim and launch verify_model.set_metadata_prop("exec_mode", "rtlsim") # TODO make configurable @@ -302,7 +353,10 @@ def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): # needed for convolutions -- TODO always exec? 
need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0 if need_conv: - model = model.transform(to_hls.InferConvInpGen()) + if cfg.force_rtl_conv_inp_gen: + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + else: + model = model.transform(to_hls.InferConvInpGen()) model = model.transform(to_hls.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq @@ -446,9 +500,9 @@ def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): """ Depending on the auto_fifo_depths setting, do one of the following: - * if auto_fifo_depths=True: Run the `InsertAndSetFIFODepths` transformation - to attempt to determine the FIFO sizes that provide full throughput. Involves - running stitched-IP rtlsim and may take a long time. + * if auto_fifo_depths=True: Run the appropriate auto-sizing transformation + to attempt to determine the FIFO sizes that provide full throughput. + May take a long time. * if auto_fifo_depths=False: Assume the folding config file contains FIFO sizes as well. Runs the `InsertFIFO` transformation, then `ApplyConfig(cfg.folding_config_file)`, and finally `RemoveShallowFIFOs`. 
@@ -457,13 +511,35 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): """ if cfg.auto_fifo_depths: - model = model.transform( - InsertAndSetFIFODepths( - cfg._resolve_fpga_part(), - cfg._resolve_hls_clk_period(), - vivado_ram_style=cfg.large_fifo_mem_style, + if cfg.auto_fifo_strategy == "characterize": + model = model.transform(InsertDWC()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) ) - ) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + model = model.transform(AnnotateCycles()) + period = model.analysis(dataflow_performance)["max_cycles"] + 10 + model = model.transform(DeriveCharacteristic(period)) + model = model.transform(DeriveFIFOSizes()) + model = model.transform( + InsertFIFO( + vivado_ram_style=cfg.large_fifo_mem_style, max_qsrl_depth=256 + ) + ) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + elif cfg.auto_fifo_strategy == "largefifo_rtlsim": + model = model.transform( + InsertAndSetFIFODepths( + cfg._resolve_fpga_part(), + cfg._resolve_hls_clk_period(), + vivado_ram_style=cfg.large_fifo_mem_style, + ) + ) + else: + raise ValueError("Unsupported auto_fifo_strategy: " + str(cfg.auto_fifo_strategy)) else: # assume folding cfg json contains FIFO sizes too # insert DWCs, FIFOs and run ApplyConfig once more diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 2c7c86c64ea1279cb18cf8342aa20fb2792bdaf5..e5eb483a00f6890f5eeb16c5cec533a4533c9f15 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -36,8 +36,12 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( ConvolutionInputGenerator1D, ) +from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import ( + 
ConvolutionInputGenerator_rtl, +) from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch +from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch from finn.custom_op.fpgadataflow.iodma import IODMA @@ -67,6 +71,7 @@ custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D +custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["TLastMarker"] = TLastMarker custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO @@ -85,3 +90,4 @@ custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch custom_op["Lookup"] = Lookup custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum +custom_op["StreamingEltwise"] = StreamingEltwise diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py index 13a4c5892c8f82c37e1794057a06217981a6a580..cd0af6b3ab3d8250abbf7d48e004622e55f09f04 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -42,18 +42,21 @@ class AddStreams_Batch(HLSCustomOp): super().__init__(onnx_node) def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with 
batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) return my_attrs def get_normal_input_shape(self, ind=0): @@ -70,10 +73,10 @@ class AddStreams_Batch(HLSCustomOp): ishape = tuple(vecs + [ich // pe, pe]) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_folded_input_shape() def make_shape_compatible_op(self, model): @@ -124,11 +127,11 @@ class AddStreams_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # we need to set output datatype to the next larger int or uint # enhancement: consider specifying w/ explicit outputDataType attribute @@ -139,14 +142,14 @@ class AddStreams_Batch(HLSCustomOp): else: return DataType.get_smallest_possible(2 * idt.max()) - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def 
get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") @@ -357,3 +360,14 @@ class AddStreams_Batch(HLSCustomOp): swidth = self.get_instream_width_padded() intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + "in1": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py index 3ed76db2982e411b711be5bd78e39dd866332714..46adca680d3c96695eeb5a91be53ea158fc78f1f 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py @@ -102,9 +102,6 @@ class ChannelwiseOp_Batch(HLSCustomOp): "inputDataType": ("s", True, ""), "paramDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - # input and output FIFO depths - "inFIFODepth": ("i", False, 0), - "outFIFODepth": ("i", False, 0), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -221,23 +218,23 @@ class ChannelwiseOp_Batch(HLSCustomOp): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() 
return i_bits * self.get_nodeattr("PE") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") fold = ich // pe @@ -245,17 +242,17 @@ class ChannelwiseOp_Batch(HLSCustomOp): folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # same shape as input return self.get_folded_input_shape() - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [ich]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py index bde285eb0dd1b3818926c1feb7ac8d5de69a4be6..c927c07df21faf40ccbf9ddbe47e3f2f2ca61c89 100644 --- a/src/finn/custom_op/fpgadataflow/checksum.py +++ b/src/finn/custom_op/fpgadataflow/checksum.py @@ -77,31 +77,31 @@ class CheckSum(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # here same as input data type return DataType[self.get_nodeattr("inputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): dtype = DataType[self.get_nodeattr("inputDataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() return in_width - def 
get_outstream_width(self): + def get_outstream_width(self, ind=0): return self.get_instream_width() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): # derive normal shape from folded shape # checksum nodes are inserted in between fpgadataflow nodes # the folded shape could be for example (1, nf, pe) @@ -127,7 +127,7 @@ class CheckSum(HLSCustomOp): def get_ap_int_max_w(self): return max(super().get_ap_int_max_w(), 32) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py index 5fcf9cf96cbacd4e444af0b90618a19eefb9bfe2..4437bcd1984c5194b0a19b43d692babb7e3cd158 100644 --- a/src/finn/custom_op/fpgadataflow/concat.py +++ b/src/finn/custom_op/fpgadataflow/concat.py @@ -74,12 +74,12 @@ class StreamingConcat(HLSCustomOp): def get_folded_input_shape(self, ind=0): return self.get_normal_input_shape(ind) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): total_elems = self.get_total_elems() vecs = list(self.get_nodeattr("numInputVectors")) return tuple(vecs + [total_elems]) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_normal_output_shape() def make_shape_compatible_op(self, model): @@ -106,7 +106,7 @@ class StreamingConcat(HLSCustomOp): # input dt identical for all inputs return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): return self.get_input_datatype() def get_instream_width(self, ind=0): @@ -115,7 +115,7 @@ class StreamingConcat(HLSCustomOp): ibits = 
self.get_input_datatype().bitwidth() return elems * ibits - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() total_elems = self.get_total_elems() out_width = total_elems * obits diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 251a9882c58a3cf94449701795b72c8a6adab318..1566445999a2c568b5c5a112d436bf05fd89aca5 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -99,13 +99,13 @@ class ConvolutionInputGenerator(HLSCustomOp): assert ret[0] == ret[1] == 1, "Only dilation=1 supported" return ret - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") @@ -114,7 +114,7 @@ class ConvolutionInputGenerator(HLSCustomOp): folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -126,7 +126,7 @@ class ConvolutionInputGenerator(HLSCustomOp): oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -158,15 +158,15 @@ class ConvolutionInputGenerator(HLSCustomOp): def verify_node(self): pass - def 
get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns stream width, input and output stream width are equal for the sliding window function""" ibits = self.get_input_datatype().bitwidth() @@ -176,7 +176,7 @@ class ConvolutionInputGenerator(HLSCustomOp): in_width = simd * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns stream width, input and output stream width are equal for the sliding window function, so the function to determine the input stream width can be reused.""" diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py index aba74baecc0f40571fa288459a04ad42e167ccf6..f1c84662cc06e89df5bd7c0762ac47b8c5723502 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py @@ -91,13 +91,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") @@ -106,7 +106,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp): folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape - def get_normal_output_shape(self): 
+ def get_normal_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -118,7 +118,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp): oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -153,15 +153,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -169,7 +169,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp): in_width = simd * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): if self.use_parallel_window_output(): # feed all window pixels in parallel k_h, k_w = self.get_nodeattr("ConvKernelDim") diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py new file mode 100755 index 0000000000000000000000000000000000000000..5424050a8ed0a353894721d5bba28c1d45e62771 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -0,0 +1,834 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import math +import numpy as np +import os +from math import copysign +from qonnx.core.datatype import DataType +from qonnx.custom_op.general import im2col +from qonnx.custom_op.general.im2col import compute_conv_output_dim + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +# RTL Convolution Input Generator / Sliding Window Generator (SWG) +# Matches and extends the functionality of all ConvolutionInputGenerator_* functions +# in finn-hlslib by generating HDL code for two different implementation styles: +# - Addressable cyclic buffer: to be used when out_width <= in_width +# - Parallel registers + line buffers: to be used when out_width > in_width +# Supports non-square, 1D, strided, dilated, and depthwise convolutions. +# Note: the actual data layout produced is different for depthwise and non-depthwise: +# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) + +# NOTE: "Parallel" implementation style not yet implemented in this version! + + +class ConvolutionInputGenerator_rtl(HLSCustomOp): + """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. 
Generates an RTL ConvolutionInputGenerator + implementation based on (System-)Verilog templates, defined in finn-rtllib/swg.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] + "IFMChannels": ("i", True, 0), + "IFMDim": ("ints", True, []), # [H, W] = [Y, X] + "OFMDim": ("ints", True, []), # [H, W] = [Y, X] + "SIMD": ("i", True, 0), + # additional parallelization parameter - not yet implemented + "M": ("i", False, 1), + # alternative implementation style - not yet implemented + "parallel_window": ("i", False, 0, {0}), + "Stride": ("ints", True, []), # [H, W] = [Y, X] + "Dilation": ("ints", True, []), # [H, W] = [Y, X] + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + "depthwise": ("i", False, 0, {0, 1}), + # FPGA resource type for ConvolutionInputGenerator input buffer + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use URAM + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + wf = int(ifm_ch / simd) + folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, 
ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + if self.get_nodeattr("parallel_window"): + wf = int((ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + else: + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) + return folded_oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." 
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + in_width = simd * ibits + return in_width + + def get_outstream_width(self, ind=0): + if self.get_nodeattr("parallel_window"): + # feed all window pixels in parallel + k_h, k_w = self.get_nodeattr("ConvKernelDim") + return self.get_instream_width() * k_h * k_w + else: + # if parallel variant not in use: same width for output and input stream + return self.get_instream_width() + + def get_number_input_values(self): + folded_ishape = self.get_folded_input_shape() + num_input_elems = np.prod(folded_ishape[:-1]) + return num_input_elems + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + num_output_elems = np.prod(folded_oshape[:-1]) + return num_output_elems + + def get_1d_conv_attrs_normalized(self): + # normalize FM dimensions so that: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension. 
+ ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + + def get_buffer_depth(self): + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + impl_style = self.select_impl_style() + if impl_style == "default": + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + # add additional buffer space in case of stride > 1 + # this minimizes cycle count as it allows an earlier pre-load of inputs + buffer_depth = ( + buffer_min_size + + max( + 0, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + + max( + 0, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + ) + else: + buffer_depth = 0 + raise Exception("Requested impl. 
style not implemented") + return buffer_depth + + def get_exp_cycles(self): + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h, ofm_dim_w = ofm_dim + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + channel_factor = int(ifm_ch / simd) + + if ifm_dim_h == 1 or ifm_dim_w == 1: + # 1D case + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + if depthwise: + exp_cycles = ( + +ofm_dim_w * k_w * channel_factor + + channel_factor * (k_w - 1) * (stride_w - 1) + - (k_w - 1) + + 2 + ) + else: + exp_cycles = ofm_dim_w * k_w * channel_factor + 2 + else: + # 2D case + buffer_min_size = ( + (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor + cycles_read_block = stride_w * ifm_dim_w * channel_factor + max_cycles = max(cycles_write_block, cycles_read_block) + if depthwise: + max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1) + exp_cycles = buffer_min_size + ofm_dim_h * max_cycles # initial buffering + if depthwise: + exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor + + return int(exp_cycles) + + def bram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + + # NOTE: Actual BRAM usage might be lower in some cases. + # This does not account for the exact Vivado behavior yet. 
+ buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + if ram_style == "block" or ram_style == "auto": + if buffer_depth <= 512: + ram_width = 36 + elif buffer_depth <= 1024: + ram_width = 18 + elif buffer_depth <= 2048: + ram_width = 9 + elif buffer_depth <= 4096: + ram_width = 4 + elif buffer_depth <= 8192: + ram_width = 2 + else: + ram_width = 1 + + ram_cascade_depth = math.ceil(buffer_depth / 16384) + ram_cascade_width = math.ceil(buffer_width / ram_width) + cascade_savings = 0 + if buffer_depth > 16384: + remainder_depth = buffer_depth % 16384 + if remainder_depth <= 512: + remainder_width = 36 + elif remainder_depth <= 1024: + remainder_width = 18 + elif remainder_depth <= 2048: + remainder_width = 9 + elif remainder_depth <= 4096: + remainder_width = 4 + elif remainder_depth <= 8192: + remainder_width = 2 + else: + remainder_width = 1 + + remainder_cascade_width = math.ceil(buffer_width / remainder_width) + cascade_savings = ram_cascade_width - remainder_cascade_width + + return int(ram_cascade_depth * ram_cascade_width - cascade_savings) + else: + return 0 + + def lut_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + if ram_style == "distributed": + ram_luts = int(buffer_width * math.ceil(buffer_depth / 38)) + else: + ram_luts = 0 + return 300 + ram_luts + + def uram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + + if ram_style == "ultra": + ram_depth = 4096 + ram_width = 72 + ram_cascade_depth = math.ceil(buffer_depth / ram_depth) + ram_cascade_width = math.ceil(buffer_width / ram_width) + return int(ram_cascade_depth * ram_cascade_width) + else: + return 0 + + def execute_node(self, context, graph): + mode 
= self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + raise Exception( + "cppsim not possible for RTL SWG, please set exec_mode to rtlsim" + ) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + # binary -> bipolar if needed + if self.get_output_datatype() 
== DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" + + def prepare_codegen_default(self): + # Default implementation style for MMV_out = 1: addressable cyclic buffer + # Computing incremental addressing scheme directly.. + template_path = ( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_default.sv" + ) + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. 
dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, wrap logic doesn't account for this" + assert not ( + abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, wrap logic doesn't account for this" + + # set certain threshold indices to detect when reading/writing finishes + code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] + code_gen_dict["$LAST_WRITE_ELEM$"] = [ + str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1) + ] + + # default controller loop structure: # iterations (counters) map directly + loop_h_iterations = out_dim_h + loop_w_iterations = out_dim_w + loop_kh_iterations = k_h + loop_kw_iterations = 
k_w + loop_simd_iterations = channel_factor + + if depthwise and channel_factor > 1: + # re-arrange existing controller loop structure for depthwise convolutions + loop_kh_iterations = channel_factor + loop_kw_iterations = k_h + loop_simd_iterations = k_w + addr_incr_end_simd_ = addr_incr_end_simd + addr_incr_end_simd = addr_incr_end_window_elem + addr_incr_end_window_elem = addr_incr_end_window_row + addr_incr_end_window_row = addr_incr_end_simd_ + elem_per_window = k_h * k_w + + tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor + tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["1"] + else: + # depthwise output format is equivalent to non-depthwise if SIMD=C + elem_per_window = k_h * k_w * channel_factor + + tail_incr_w = addr_incr_end_window + buffer_min_size - 1 + tail_incr_h = addr_incr_end_row + buffer_min_size - 1 + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["0"] + + code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)] + code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)] + code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)] + + # support SIMD = IFMChannels and k_w = 1 cases + # for k = [k_h, k_w] = [1, k_w], no adjustment is needed + # for k = [k_h, k_w] = [1, 1], do not use this impl. 
style (mmv_out=K=1) + # innermost loop is executed at least once -> adjust if needed + if loop_simd_iterations == 1: + # skip innermost SIMD loop completely + if loop_kw_iterations == 1: + # skip innermost KW loop completely + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"] + loop_kh_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"] + loop_kw_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"] + loop_simd_iterations -= 1 # -1 because state is initial state + + code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 1)] + code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 1)] + code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 1)] + code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 1)] + code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 1)] + + incr_bitwidth = 1 + math.ceil( + math.log2( + max( + abs(addr_incr_end_simd) + 1, + abs(addr_incr_end_window_elem) + 1, + abs(addr_incr_end_window_row) + 1, + abs(addr_incr_end_window) + 1, + abs(addr_incr_end_row) + 1, + abs(tail_incr_w) + 1, + abs(tail_incr_h) + 1, + abs(tail_incr_last_window) + 1, + ) + ) + ) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$ADDR_INCREMENT_MAP$"] = [ + "'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format( + incr_bitwidth, + int(copysign(incr_bitwidth, addr_incr_end_simd)), + abs(addr_incr_end_simd), + int(copysign(incr_bitwidth, addr_incr_end_window_elem)), + abs(addr_incr_end_window_elem), + int(copysign(incr_bitwidth, addr_incr_end_window_row)), + abs(addr_incr_end_window_row), + int(copysign(incr_bitwidth, addr_incr_end_window)), + abs(addr_incr_end_window), + int(copysign(incr_bitwidth, addr_incr_end_row)), + abs(addr_incr_end_row), + ) + ] + + code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)] + code_gen_dict["$SIMD$"] = [str(simd)] 
+ code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] + + return template_path, code_gen_dict + + def select_impl_style(self): + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + k_h, k_w = k + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + # check for valid configuration + assert ( + kernel_height <= ifm_dim_h + and kernel_width <= ifm_dim_w + and stride_h <= ifm_dim_h + and stride_w <= ifm_dim_w + ), "Illegal conv configuration: kernel or stride > FM dimension" + + # init folding config + if self.get_nodeattr("parallel_window"): + # mmv_in = M * 1 + mmv_out = M * k_h * k_w + assert ( + ifm_ch == simd + ), "Constraint violated: SIMD must be equal to IFMChannels" + else: + # mmv_in = 1 + mmv_out = 1 + assert ( + ifm_ch % simd == 0 + ), "Constraint violated: SIMD must divide IFMChannels" + + # choose implementation style + if mmv_out > 1 or (k_h == 1 and k_w == 1): + impl_style = "parallel" + assert ( + ifm_ch == simd + ), "Constraint violated: SIMD must be equal to IFMChannels" + else: + impl_style = "default" + + assert ( + impl_style == "default" + ), "ERROR: Parallel window mode not yet implemented" + return impl_style + + def generate_hdl(self): + impl_style = self.select_impl_style() + + # prepare code generation by filling out dictionaries + if impl_style == "default": + template_path, code_gen_dict = self.prepare_codegen_default() + else: + raise Exception("Requested impl. 
style not implemented") + + # add general parameters to dictionary + code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] + ram_style = self.get_nodeattr("ram_style") + if ram_style == "auto": + code_gen_dict["$RAM_STYLE$"] = [""] + else: + code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)] + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + with open( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_wrapper.v", "r" + ) as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template = template.replace(key, code_gen_line) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ), + "w", + ) as f: + f.write(template) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + "w", + ) as f: + f.write(template_wrapper) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise 
ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + self.get_nodeattr("gen_top_module") + "_wrapper.v", + self.get_nodeattr("gen_top_module") + "_impl.sv", + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + cmd = [ + "add_files -norecurse %s" + % ( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ) + ), + "add_files -norecurse %s" + % ( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ) + ), + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name), + ] + + return cmd + + def code_generation_ipgen(self, model, fpgapart, clk): + """Normally: Generates C++ code and tcl script for IP generation. 
+ Here: Generates (System-)Verilog code for IP generation.""" + self.generate_hdl() + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index da29a524b6bba7ce0c7a71bc64a44ae128d91709..b7efaff440dd5cc2160fbfb8050b30924460ffe6 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -36,7 +36,7 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy class DownSampler(HLSCustomOp): - """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function. + """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. 
Basically performs a down sampling of the image removing rows and columns.""" def __init__(self, onnx_node): @@ -55,6 +55,10 @@ class DownSampler(HLSCustomOp): "inputDataType": ("s", True, ""), # Batch size "numInputVectors": ("i", False, 1), + # 1D (True) or 2D (False) spatial data + "is1D": ("i", False, 0), + # for 1D only: (D, 1) (True) or (1, D) dims + "is1D_unitx": ("i", False, 1), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -66,28 +70,46 @@ class DownSampler(HLSCustomOp): return int(np.floor((idim - 1) / stride) + 1) def get_exp_cycles(self): + is_1D = self.get_nodeattr("is1D") idim = self.get_nodeattr("ImgDim") + idim_total = idim if is_1D else idim * idim channels = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = channels / simd * batch_size * idim * idim + exp_cycles = channels / simd * batch_size * idim_total return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): + is_1D = self.get_nodeattr("is1D") + is_1D_unitx = self.get_nodeattr("is1D_unitx") idim = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - ishape = (batch, idim, idim, num_ch) + if is_1D: + if is_1D_unitx: + ishape = (batch, idim, 1, num_ch) + else: + ishape = (batch, 1, idim, num_ch) + else: + ishape = (batch, idim, idim, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): + is_1D = self.get_nodeattr("is1D") + is_1D_unitx = self.get_nodeattr("is1D_unitx") odim = self.get_downsampled_odim() num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - oshape = (batch, odim, odim, num_ch) + if is_1D: + if is_1D_unitx: + oshape = (batch, odim, 1, num_ch) + else: + oshape = (batch, 1, odim, num_ch) + else: + oshape = (batch, odim, odim, num_ch) return oshape - def get_folded_input_shape(self): + def 
get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -96,7 +118,7 @@ class DownSampler(HLSCustomOp): folded_ishape = normal_ishape[:-1] + [fold, simd] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -129,21 +151,21 @@ class DownSampler(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. (Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return ibits * simd - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return obits * simd @@ -204,8 +226,9 @@ class DownSampler(HLSCustomOp): ) def docompute(self): + dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D" self.code_gen_dict["$DOCOMPUTE$"] = [ - """ConvolutionInputGenerator_kernel1<IFMChannels, Input_precision, + f"""ConvolutionInputGenerator_{dim_var}_kernel1<IFMChannels, Input_precision, IFMDim, SIMD,Stride> (in0, out, numReps);""" ] diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py index 04ca45e7f1c1844a9976d46392be46f6cffc2167..93cde15ca7d42dbed12417837916359fdcc71b67 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ 
-61,13 +61,13 @@ class DuplicateStreams_Batch(HLSCustomOp): def get_num_output_streams(self): return self.get_nodeattr("NumOutputStreams") - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ch]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -138,22 +138,22 @@ class DuplicateStreams_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("inputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") @@ -408,3 +408,13 @@ class DuplicateStreams_Batch(HLSCustomOp): ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) ) return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out0": [], "out1": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py new file mode 100644 index 
0000000000000000000000000000000000000000..d6284750c73026c09fb7986ffc2517ed9ae3b153 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/eltwise.py @@ -0,0 +1,466 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import os +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingEltwise(HLSCustomOp): + """Class that corresponds to finn-hlslib StreamingEltwise function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_eltwise_op_lambda(self): + eltwise_op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + tin0 = idt0.get_hls_datatype_str() + tin1 = idt1.get_hls_datatype_str() + tout = odt.get_hls_datatype_str() + eltwise_ops = { + # "Add": "[](auto a, auto b) { return a + b; }", + # "Sub": "[](auto a, auto b) { return a - b; }", + # "AbsDiff": "[](auto a, auto b) { return a>b? 
a-b : b-a; }", + "Add": f"add<{tin0}, {tin1}, {tout}>()", + "Sub": f"sub<{tin0}, {tin1}, {tout}>()", + "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", + } + return eltwise_ops[eltwise_op] + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." 
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt0 = model.get_tensor_datatype(node.input[0]) + if idt0 != self.get_input_datatype(0): + warn_str = "inputDataType0 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(0)), + str(idt0), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType0", idt0.name) + idt1 = model.get_tensor_datatype(node.input[1]) + if idt1 != self.get_input_datatype(1): + warn_str = "inputDataType1 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(1)), + str(idt1), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType1", idt1.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType0") + self.get_nodeattr("inputDataType1") + self.get_nodeattr("eltwiseOp") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required StreamingEltwise attributes do not exist.""" + ) + + return info_messages + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType" + str(ind))] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = 
self.get_input_datatype(1) + assert idt0.signed() == idt1.signed(), ( + "%s: Inputs must have same signedness" % self.onnx_node.name + ) + idt0_min, idt0_max = idt0.min(), idt0.max() + idt1_min, idt1_max = idt1.min(), idt1.max() + cands = [ + idt0_min - idt1_min, + idt0_min - idt1_max, + idt0_max - idt1_min, + idt0_max - idt1_max, + ] + largest_magnitude = max(map(abs, cands)) + if op == "Add": + if idt0.signed(): + return DataType.get_smallest_possible(idt0.min() + idt1.min()) + else: + return DataType.get_smallest_possible(idt0.max() + idt1.max()) + elif op == "Sub": + return DataType.get_smallest_possible(-largest_magnitude) + elif op == "AbsDiff": + return DataType.get_smallest_possible(largest_magnitude) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype(ind).bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input0 shape doesn't match expected shape .""" + export_idt0 = self.get_input_datatype(0) + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + # exact same thing for input1 + inp = context[node.input[1]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input1 shape doesn't match expected shape .""" + export_idt1 = self.get_input_datatype(1) + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits0 = self.get_instream_width(0) + nbits1 = self.get_instream_width(1) + rtlsim_inp0 = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0 + ) + rtlsim_inp1 = npy_to_rtlsim_input( + "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1 + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + 
rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = [ + '#include "eltwise.hpp"', + '#include "interpret.hpp"', + ] + + self.code_gen_dict["$GLOBALS$"].extend( + [ + "template<typename TI1, typename TI2, typename TO>", + "struct absdiff {", + "TO operator()(TI1 const &a, TI2 const &b) const {", + "#pragma HLS inline", + "return a>b? a-b : b-a;", + "}", + "};", + "template<typename TI1, typename TI2, typename TO>", + "struct sub {", + "TO operator()(TI1 const &a, TI2 const &b) const {", + "#pragma HLS inline", + "return a-b;", + "}", + "};", + "template<typename TI1, typename TI2, typename TO>", + "struct add {", + "TO operator()(TI1 const &a, TI2 const &b) const {", + "#pragma HLS inline", + "return a+b;", + "}", + "};", + ] + ) + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + elem_bits_0 = idt0.bitwidth() + elem_bits_1 = idt1.bitwidth() + packed_bits_0 = self.get_instream_width(0) + packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0 + packed_bits_1 = self.get_instream_width(1) + packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1 + elem_hls_type_0 = idt0.get_hls_datatype_str() + elem_hls_type_1 = idt1.get_hls_datatype_str() + npy_type = "float" + self.code_gen_dict["$READNPYDATA$"] = [] + npy_in = "%s/input_0.npy" % code_gen_dir + 
self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type_0, elem_hls_type_0, elem_bits_0, npy_type, npy_in) + ) + npy_in = "%s/input_1.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in1);' + % (packed_hls_type_1, elem_hls_type_1, elem_bits_1, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width(0)) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width(1)) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + elem_hls_type_0 = idt0.get_hls_datatype_str() + elem_hls_type_1 = idt1.get_hls_datatype_str() + out_hls_type = odt.get_hls_datatype_str() + slice_in0 = "Slice<%s>" % elem_hls_type_0 + slice_in1 = "Slice<%s>" % elem_hls_type_1 + slice_out = "Slice<%s>" % out_hls_type + eltwise_op_str = self.get_eltwise_op_lambda() + "%sEltwiseFunction<%s, %s, %s>()" % ( + op, + elem_hls_type_0, + elem_hls_type_1, + out_hls_type, + ) + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, {}, {}, {}, {}, {}>(in0, in1, out, {});""".format( + "StreamingEltwise", + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PE"), + self.get_number_output_values(), + slice_in0, + slice_in1, + slice_out, + eltwise_op_str, + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = 
dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1, + hls::stream<ap_uint<{}>> &out)""".format( + self.onnx_node.name, + self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(), + self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(), + self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py index d69ea471ea8ae1d58f97d056936b505cc2a2806b..dfc55d283fa664e3b60fc7c4d5a056f53a119292 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -47,10 +47,6 @@ class 
FMPadding_Batch(HLSCustomOp): # spatial size of input images "ImgDim": ("ints", True, []), # [H, W] = [Y, X] # total padding (per dimension) to apply - # NOTE: Current padding scheme that is applied tries to pad the same - # amount of zeros in front and behind the image for each dimension. - # As an example, a padding scheme such as [1, x, 3, x] is equal - # to [2, x, 2, x] "Padding": ( "ints", True, @@ -62,10 +58,6 @@ class FMPadding_Batch(HLSCustomOp): "SIMD": ("i", False, 1), # FINN input datatype "inputDataType": ("s", True, ""), - # controls distribution of padded pixels - # in case of uneven padding -- see FMPadding fxn - # in hlslib - "PaddingStyle": ("i", False, 2, {2, 1}), # shape describing input vecs per execution "numInputVectors": ("i", False, 1), } @@ -90,20 +82,20 @@ class FMPadding_Batch(HLSCustomOp): exp_cycles = (channels / simd) * batch_size * odim_h * odim_w return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): idim_h, idim_w = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") ishape = (1, idim_h, idim_w, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): odim_h, odim_w = self.get_padded_odim() num_ch = self.get_nodeattr("NumChannels") oshape = (1, odim_h, odim_w, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -112,7 +104,7 @@ class FMPadding_Batch(HLSCustomOp): folded_ishape = normal_ishape[:-1] + [fold, simd] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -144,7 +136,7 @@ class FMPadding_Batch(HLSCustomOp): def verify_node(self): pass - def 
get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] # the hlslib op always pads with zeros, so ensure that the DataType @@ -152,16 +144,16 @@ class FMPadding_Batch(HLSCustomOp): assert ret.allowed(0), "FMPadding_Batch DataType must support zero" return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. (Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return ibits * simd - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() simd = self.get_nodeattr("SIMD") return obits * simd @@ -179,23 +171,21 @@ class FMPadding_Batch(HLSCustomOp): pad = self.get_nodeattr("Padding") pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] - is_square = idim_h == idim_w + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w - if is_square: - assert ( - pad_h == pad_w - ), "Only equal padding along the dimensions for square images is supported" + if is_square_img and is_square_pad: self.code_gen_dict["$DEFINES$"] = [ """#define ImgDim1 {}\n#define OutputDim1 {}\n - #define Padding1 {}\n#define NumChannels1 {}\n - #define SIMD1 {}\n#define PaddingStyle1 {}\n + #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n + #define NumChannels1 {}\n#define SIMD1 {}\n #define numReps {}\n""".format( idim_h, odim_h, - pad_h, + pad[0], + pad[2], self.get_nodeattr("NumChannels"), self.get_nodeattr("SIMD"), - self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), ) ] @@ -204,20 +194,22 @@ class FMPadding_Batch(HLSCustomOp): """ #define OutputDim1_x {}\n #define OutputDim1_y {}\n - #define Padding1_x {}\n - #define Padding1_y {}\n + #define PaddingLeft1 {}\n + #define PaddingRight1 {}\n + 
#define PaddingTop1 {}\n + #define PaddingBottom1 {}\n #define NumChannels1 {}\n #define SIMD1 {}\n - #define PaddingStyle1 {}\n #define numReps {}\n """.format( odim_w, odim_h, - pad_w, - pad_h, + pad[1], + pad[3], + pad[0], + pad[2], self.get_nodeattr("NumChannels"), self.get_nodeattr("SIMD"), - self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), ) ] @@ -254,21 +246,26 @@ class FMPadding_Batch(HLSCustomOp): node = self.onnx_node idim_h, idim_w = self.get_nodeattr("ImgDim") - is_square = idim_h == idim_w + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w - if is_square: + if is_square_img and is_square_pad: hls_call = node.op_type self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1, - {}, PaddingStyle1> (in0, out, numReps);""".format( + """{}<ImgDim1, OutputDim1, PaddingBefore1, PaddingBehind1, NumChannels1, SIMD1, + {}> (in0, out, numReps);""".format( hls_call, in_t ) ] else: hls_call = "FMPadding_nonsquare_Batch" self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<OutputDim1_x, OutputDim1_y, Padding1_x, Padding1_y, NumChannels1, - SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format( + """{}<OutputDim1_x, OutputDim1_y, PaddingLeft1, PaddingRight1, + PaddingTop1, PaddingBottom1, NumChannels1, + SIMD1, {}> (in0, out, numReps);""".format( hls_call, in_t ) ] diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index adafa7dcf36111e63fa49e0d184594fff54be99d..e7fa5bc0048b54a32ebc61482b96009fa019809e 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -56,13 +56,13 @@ class GlobalAccPool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ch = 
self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ch]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -71,7 +71,7 @@ class GlobalAccPool_Batch(HLSCustomOp): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) if len(vecs) == 1: @@ -80,7 +80,7 @@ class GlobalAccPool_Batch(HLSCustomOp): oshape = tuple([vecs[0]] + [1, 1, ch]) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") unfolded_shape = list(self.get_normal_output_shape()) @@ -139,11 +139,11 @@ class GlobalAccPool_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # determine data type from image size and input type idt = DataType[self.get_nodeattr("inputDataType")] @@ -155,14 +155,14 @@ class GlobalAccPool_Batch(HLSCustomOp): extreme_value = npixels * idt.max() return DataType.get_smallest_possible(extreme_value) - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") diff --git 
a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index b202e95a28a26de3dabc098c2030bafcf840d164..f307be95c30d822dfc517e4c331bd8d82d727997 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -29,8 +29,9 @@ import numpy as np import os import subprocess +import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -107,10 +108,18 @@ class HLSCustomOp(CustomOp): # ID of FPGA device to which this Op is allocated, in # a multi-FPGA setting "device_id": ("i", False, 0), - # input and output FIFO depths - "inFIFODepth": ("i", False, 2), - "outFIFODepth": ("i", False, 2), + # input and output FIFO depths for multi-I/O nodes + "inFIFODepths": ("ints", False, [2]), + "outFIFODepths": ("ints", False, [2]), "output_hook": ("s", False, ""), + # accumulated characteristic function over two periods + "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)), + "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)), + # the period for which the characterization was run + "io_chrc_period": ("i", False, 0), + # amount of zero padding inserted during chrc. + "io_chrc_pads_in": ("ints", False, []), + "io_chrc_pads_out": ("ints", False, []), } def get_verilog_top_module_name(self): @@ -138,6 +147,7 @@ class HLSCustomOp(CustomOp): intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())] intf_names["aximm"] = [] intf_names["axilite"] = [] + intf_names["ap_none"] = [] return intf_names def get_verilog_top_filename(self): @@ -687,40 +697,48 @@ compilation transformations? 
HLSCustomOp class but has to be filled by every node.""" pass - def get_normal_input_shape(self): + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input stream ind.""" + raise Exception("get_input_datatype not implemented for this op") + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output stream ind.""" + raise Exception("get_output_datatype not implemented for this op") + + def get_normal_input_shape(self, ind=0): """Returns normal input shape if implemented.""" raise Exception("get_normal_input_shape not implemented for this op") - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): """Returns folded output shape if implemented.""" raise Exception("get_normal_output_shape not implemented for this op") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): """Returns folded input shape (according to synapse folding), if implemented.""" raise Exception("get_folded_input_shape not implemented for this op") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): """Returns folded output shape (according to neuron folding), if implemented.""" raise Exception("get_folded_output_shape not implemented for this op") - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width, if implemented.""" raise Exception("get_instream_width not implemented for this op") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width, if implemented.""" raise Exception("get_outstream_width not implemented for this op") - def get_instream_width_padded(self): + def get_instream_width_padded(self, ind=0): """Returns input stream width padded to a multiple of 8. 
This is required by the AXI Stream spec.""" - in_width = self.get_instream_width() + in_width = self.get_instream_width(ind=ind) return roundup_to_integer_multiple(in_width, 8) - def get_outstream_width_padded(self): + def get_outstream_width_padded(self, ind=0): """Returns output stream width padded to a multiple of 8. This is required by the AXI Stream spec.""" - out_width = self.get_outstream_width() + out_width = self.get_outstream_width(ind=ind) return roundup_to_integer_multiple(out_width, 8) def get_ap_int_max_w(self): @@ -733,3 +751,119 @@ compilation transformations? "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret ) return ret + + def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): + """Return the unconstrained characteristic functions for this node.""" + # ensure rtlsim is ready + assert self.get_nodeattr("rtlsim_so") != "", ( + "rtlsim not ready for " + self.onnx_node.name + ) + if self.get_nodeattr("io_chrc_period") > 0: + warnings.warn( + "Skipping node %s: already has FIFO characteristic" + % self.onnx_node.name + ) + return + exp_cycles = self.get_exp_cycles() + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + n_outs = np.prod(self.get_folded_output_shape()[:-1]) + if exp_cycles == 0: + # try to come up with an optimistic estimate + exp_cycles = min(n_inps, n_outs) + assert ( + exp_cycles <= period + ), "Period %d too short to characterize %s : expects min %d cycles" % ( + period, + self.onnx_node.name, + exp_cycles, + ) + sim = self.get_rtlsim() + # signal name + sname = "_" + self.hls_sname() + "_" + if override_rtlsim_dict is not None: + io_dict = override_rtlsim_dict + else: + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + + # extra dicts to keep track of cycle-by-cycle transaction behavior + # note that we restrict key names to filter out weight streams etc + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} + txns_out = 
{ + key: [] for (key, value) in io_dict["outputs"].items() if "out" in key + } + + def monitor_txns(sim_obj): + for inp in txns_in: + in_ready = _read_signal(sim, inp + sname + "TREADY") == 1 + in_valid = _read_signal(sim, inp + sname + "TVALID") == 1 + if in_ready and in_valid: + txns_in[inp].append(1) + else: + txns_in[inp].append(0) + for outp in txns_out: + if ( + _read_signal(sim, outp + sname + "TREADY") == 1 + and _read_signal(sim, outp + sname + "TVALID") == 1 + ): + txns_out[outp].append(1) + else: + txns_out[outp].append(0) + + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + n_outs, + sname=sname, + liveness_threshold=period, + hook_preclk=monitor_txns, + ) + assert ( + total_cycle_count <= period + ), """Total cycle count from rtl simulation is higher than + specified period, please set the period higher than {}""".format( + total_cycle_count + ) + self.set_nodeattr("io_chrc_period", period) + + def accumulate_char_fxn(chrc): + p = len(chrc) + ret = [] + for t in range(2 * p): + if t == 0: + ret.append(chrc[0]) + else: + ret.append(ret[-1] + chrc[t % p]) + return np.asarray(ret, dtype=np.int32) + + all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + all_pad_in = [] + all_pad_out = [] + for in_idx, in_strm_nm in enumerate(txns_in.keys()): + txn_in = txns_in[in_strm_nm] + if len(txn_in) < period: + pad_in = period - len(txn_in) + txn_in += [0 for x in range(pad_in)] + txn_in = accumulate_char_fxn(txn_in) + all_txns_in[in_idx, :] = txn_in + all_pad_in.append(pad_in) + + for out_idx, out_strm_nm in enumerate(txns_out.keys()): + txn_out = txns_out[out_strm_nm] + if len(txn_out) < period: + pad_out = period - len(txn_out) + txn_out += [0 for x in range(pad_out)] + txn_out = accumulate_char_fxn(txn_out) + all_txns_out[out_idx, :] = txn_out + all_pad_out.append(pad_out) + + self.set_nodeattr("io_chrc_in", all_txns_in) + 
self.set_nodeattr("io_chrc_out", all_txns_out) + self.set_nodeattr("io_chrc_pads_in", all_pad_in) + self.set_nodeattr("io_chrc_pads_out", all_pad_out) diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index 33ee1d359c7b82494e1b5ce1b83aa5d0199f8153..65683079fc6a648de31148e398ea498f38b8d3d9 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -100,16 +100,16 @@ class IODMA(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): vecs = list(self.get_nodeattr("numInputVectors")) num_ch = self.get_nodeattr("NumChannels") ishape = tuple(vecs + [num_ch]) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): if self.get_nodeattr("direction") == "in": raise ValueError("Folded input shape not defined for input IODMA") else: @@ -126,7 +126,7 @@ class IODMA(HLSCustomOp): shape.append(elems_per_word) return tuple(shape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): if self.get_nodeattr("direction") == "out": raise ValueError("Folded output shape not defined for output IODMA") else: @@ -166,15 +166,15 @@ class IODMA(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. 
(Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): if self.get_nodeattr("direction") == "in": return self.get_nodeattr("intfWidth") elif self.get_nodeattr("direction") == "out": @@ -182,7 +182,7 @@ class IODMA(HLSCustomOp): else: raise ValueError("Invalid IODMA direction, please set to in or out") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): if self.get_nodeattr("direction") == "out": return self.get_nodeattr("intfWidth") elif self.get_nodeattr("direction") == "in": diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py index 3e27ee01113392174c1206fc10e1c9abe82fdfe7..03f89bd7ecac69a9097f4f35c42bd528be709515 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py +++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py @@ -70,13 +70,13 @@ class LabelSelect_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): nlabels = self.get_nodeattr("Labels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [nlabels]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): nlabels = self.get_nodeattr("Labels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -85,13 +85,13 @@ class LabelSelect_Batch(HLSCustomOp): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k = self.get_nodeattr("K") vecs = list(self.get_nodeattr("numInputVectors")) oshape = tuple(vecs + [k]) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k = self.get_nodeattr("K") vecs = list(self.get_nodeattr("numInputVectors")) oshape = tuple(vecs + [k, 1]) @@ -152,24 +152,24 @@ class 
LabelSelect_Batch(HLSCustomOp): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" ret = DataType[self.get_nodeattr("outputDataType")] return ret - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" return self.get_output_datatype().bitwidth() diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index d90fa0f05ab2a92391f610ae1c4516a95a881ce4..fd3e2b5b1cfa74eb4f957df4b568e6c46da47617 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -75,21 +75,21 @@ class Lookup(HLSCustomOp): exp_cycles = int(n_inputs) return exp_cycles - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): return self.get_nodeattr("InputShape") - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ishape = self.get_normal_input_shape() emb_dim = self.get_nodeattr("EmbeddingDim") oshape = list(ishape) + [emb_dim] return tuple(oshape) - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ishape = self.get_normal_input_shape() folded_ishape = list(ishape) + [1] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ishape = self.get_normal_input_shape() mem_mode = self.get_nodeattr("mem_mode") emb_dim = self.get_nodeattr("EmbeddingDim") @@ -135,19 +135,19 @@ class Lookup(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def 
get_input_datatype(self, ind=0): ret = DataType[self.get_nodeattr("InputType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): ret = DataType[self.get_nodeattr("EmbeddingType")] return ret - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() return ibits - def get_outstream_width(self): + def get_outstream_width(self, ind=0): folded_oshape = self.get_folded_output_shape() obits = self.get_output_datatype().bitwidth() return obits * folded_oshape[-1] @@ -159,8 +159,8 @@ class Lookup(HLSCustomOp): def global_includes(self): mem_mode = self.get_nodeattr("mem_mode") global_incls = [] + global_incls.append('#include "lookup.hpp"') if mem_mode == "const": - global_incls.append('#include "lookup.hpp"') global_incls.append('#include "embeddings.hpp"') self.code_gen_dict["$GLOBALS$"] = global_incls @@ -258,17 +258,10 @@ class Lookup(HLSCustomOp): InputType, EmbeddingType >(in0, out, embeddings);""" ] elif mem_mode == "external": - hls_impl = """ - if(!in0.empty()) { - ap_uint<T_SRC::width+EmbeddingAlign> const base = - (in0.read(), ap_uint<EmbeddingAlign>(0)); - for(unsigned j = 0; j < EmbeddingSize; j++) { -#pragma HLS PIPELINE II=1 - out.write(mem[base+j]); - } - } - """ - self.code_gen_dict["$DOCOMPUTE$"] = [hls_impl] + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup_ext<EmbeddingSize>(in0, out, mem, size, oob_count, + oob_irq);""" + ] def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") @@ -286,7 +279,8 @@ class Lookup(HLSCustomOp): "void " + self.onnx_node.name + "(hls::stream<T_SRC> &in0, hls::stream<T_DST> &out, " - + "T_DST const *const mem)" + + "T_DST const *const mem, unsigned const size, " + + "unsigned &oob_count, bool &oob_irq)" ] def pragmas(self): @@ -305,6 +299,13 @@ class Lookup(HLSCustomOp): elif mem_mode == "external": my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") my_pragmas.append("#pragma HLS INTERFACE 
s_axilite port=mem bundle=control") + my_pragmas.append( + "#pragma HLS INTERFACE s_axilite port=size bundle=control" + ) + my_pragmas.append( + "#pragma HLS INTERFACE s_axilite port=oob_count bundle=control" + ) + my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") else: raise Exception("Unrecognized mem_mode: " + mem_mode) self.code_gen_dict["$PRAGMAS$"] = my_pragmas @@ -475,4 +476,5 @@ class Lookup(HLSCustomOp): if mem_mode == "external": intf_names["axilite"] = ["s_axi_control"] intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))] + intf_names["ap_none"] = ["oob_irq"] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 9d2717dc8c65ddb5329816880067b81b10db2c02..69763fbea8a6079c7b0a61e14da37a3af69dfdfb 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -409,16 +409,16 @@ class MatrixVectorActivation(HLSCustomOp): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() in_width = i_bits * self.get_nodeattr("SIMD") return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width @@ -474,7 +474,7 @@ class MatrixVectorActivation(HLSCustomOp): return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): mh = self.get_nodeattr("MH") pe = self.get_nodeattr("PE") nf = mh // pe @@ -482,13 +482,13 @@ class MatrixVectorActivation(HLSCustomOp): folded_output_shape = 
tuple(vecs + [nf, pe]) return folded_output_shape - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): mw = self.get_nodeattr("MW") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [mw]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): mh = self.get_nodeattr("MH") vecs = list(self.get_nodeattr("numInputVectors")) normal_output_shape = tuple(vecs + [mh]) @@ -1227,8 +1227,11 @@ class MatrixVectorActivation(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) - in_fifo_depth = self.get_nodeattr("inFIFODepth") - out_fifo_depth = self.get_nodeattr("outFIFODepth") + # TODO can we deprecate this entirely? this looks like legacy code + # that does not really serve a purpose - FIFO sizes are not typically + # allocated at this point; at best they are set to 2 as the default + in_fifo_depth = 2 + out_fifo_depth = 2 # insert depth pragmas only if specified if in_fifo_depth != 0: self.code_gen_dict["$PRAGMAS$"].append( @@ -1462,3 +1465,20 @@ class MatrixVectorActivation(HLSCustomOp): thres_count = out_features ret_dict[thres_param_type] = thres_count return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index 
3bf187fa9a78ed2c812f042a29079ee1e3163d74..91cd537baeff0c7666bbf3596b46a7412ec2fe4e 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -74,11 +74,11 @@ class Pool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("InputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" fxn = self.get_nodeattr("Function") odt = DataType[self.get_nodeattr("OutputDataType")] @@ -98,7 +98,7 @@ class Pool_Batch(HLSCustomOp): return odt - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_ch = self.get_nodeattr("Channels") odims = self.get_nodeattr("OutImgDims") batch_size = self.get_nodeattr("BatchSize") @@ -107,7 +107,7 @@ class Pool_Batch(HLSCustomOp): ishape = (batch_size, *odims, k_prod * ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -116,14 +116,14 @@ class Pool_Batch(HLSCustomOp): folded_ishape = normal_ishape[:-1] + [fold, pe] return tuple(folded_ishape) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ofm_ch = self.get_nodeattr("Channels") odims = self.get_nodeattr("OutImgDims") batch_size = self.get_nodeattr("BatchSize") oshape = (batch_size, *odims, ofm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -147,13 +147,13 @@ class Pool_Batch(HLSCustomOp): exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size return int(exp_cycles) - def 
get_instream_width(self): + def get_instream_width(self, ind=0): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = int(dt_bits * pe) return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): dt_bits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") out_width = int(dt_bits * pe) diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py index 1e6b72e4d54ede639e797f32f51fb7705ec8ce4b..a3aa9d570d0efcbe82090d19a151d4f5b12078b6 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py @@ -60,19 +60,19 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("dataType")] - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ishape = self.get_nodeattr("shape") return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): oshape = self.get_nodeattr("shape") return oshape @@ -97,7 +97,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): Please adjust PE and SIMD values so that OutWidth % InWidth = 0 or alternatively use impl_style = vivado""" - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): self.check_divisible_iowidths() iwidth = self.get_nodeattr("inWidth") ishape = self.get_normal_input_shape() @@ -117,7 +117,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): dummy_t = dummy_t.reshape(new_shape) return dummy_t.shape - def 
get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): self.check_divisible_iowidths() owidth = self.get_nodeattr("outWidth") oshape = self.get_normal_output_shape() @@ -142,11 +142,11 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def get_instream_width(self): + def get_instream_width(self, ind=0): in_width = self.get_nodeattr("inWidth") return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): out_width = self.get_nodeattr("outWidth") return out_width diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index a7c3cd0be59db4ba8665f8fba5be72282339b8c8..40d016de43820a37e8c7894a3e1f30146c667e59 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -68,11 +68,29 @@ class StreamingFIFO(HLSCustomOp): "auto", {"auto", "block", "distributed", "ultra"}, ), + # whether depth monitoring is enabled (impl_style=rtl only) + "depth_monitor": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs + def get_adjusted_depth(self): + impl = self.get_nodeattr("impl_style") + depth = self.get_nodeattr("depth") + if impl == "vivado": + old_depth = depth + # round up depth to nearest power-of-2 + # Vivado FIFO impl may fail otherwise + depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth + if old_depth != depth: + warnings.warn( + "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" + % (self.onnx_node.name, old_depth, depth) + ) + + return depth + def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() @@ -97,6 +115,14 @@ class StreamingFIFO(HLSCustomOp): def verify_node(self): pass + def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = 
self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + def get_verilog_top_module_name(self): "Return the Verilog top module name for this node." @@ -180,10 +206,8 @@ class StreamingFIFO(HLSCustomOp): self.set_nodeattr("ip_vlnv", vlnv) self.code_gen_dict.clear() - def get_normal_input_shape(self): - depth = self.get_nodeattr("depth") - # depth has to be between 2 and 256 with the current - # StreamingFIFO implementation + def get_normal_input_shape(self, ind=0): + depth = self.get_adjusted_depth() assert depth >= 2, """Depth is too low""" if depth > 256 and self.get_nodeattr("impl_style") == "rtl": warnings.warn( @@ -211,22 +235,22 @@ class StreamingFIFO(HLSCustomOp): return normal_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_instream_width(self): + def get_instream_width(self, ind=0): dtype = DataType[self.get_nodeattr("dataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): dtype = DataType[self.get_nodeattr("dataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() @@ -328,7 +352,7 @@ class StreamingFIFO(HLSCustomOp): elif impl_style == "vivado": cmd = [] node_name = self.onnx_node.name - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() ram_style = self.get_nodeattr("ram_style") # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] 
@@ -393,7 +417,7 @@ class StreamingFIFO(HLSCustomOp): """Calculates resource estimation for BRAM""" impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() if impl == "rtl" or (impl == "vivado" and ram_type != "block"): @@ -418,7 +442,7 @@ class StreamingFIFO(HLSCustomOp): impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() if impl == "rtl" or (impl == "vivado" and ram_type != "ultra"): @@ -428,7 +452,7 @@ class StreamingFIFO(HLSCustomOp): return (math.ceil(depth / 4096)) * (math.ceil(W / 72)) def bram_efficiency_estimation(self): - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() bram16_est = self.bram_estimation() if bram16_est == 0: @@ -441,7 +465,7 @@ class StreamingFIFO(HLSCustomOp): """Calculates resource estimations for LUTs""" impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() address_luts = 2 * math.ceil(math.log(depth, 2)) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py index 882b40a0aaf542e6dcaf427ca3567ae78394ede5..a0e60931edd8590aaebc0560c4bd28d61d62e8ea 100755 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -57,11 +57,11 @@ class StreamingMaxPool_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): 
"""Returns FINN DataType of output.""" return DataType[self.get_nodeattr("dataType")] @@ -82,13 +82,13 @@ class StreamingMaxPool_Batch(HLSCustomOp): ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() return (ifm_dim[0] == 1) and (k[0] == 1) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") ifm_ch = self.get_nodeattr("NumChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") ifm_ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") @@ -99,7 +99,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") k_h, k_w = tuple(self.get_nodeattr("PoolDim")) ifm_ch = self.get_nodeattr("NumChannels") @@ -116,7 +116,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # even though there is no folding in the current hlslib op, # insert a time multiplexing axis to remain compatible with the # shapes produced by the rest of the dataflow pipeline @@ -155,7 +155,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): # TODO: adjust inaccurate formula return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) - def get_instream_width(self): + def get_instream_width(self, ind=0): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") ifm_ch = self.get_nodeattr("NumChannels") @@ -165,7 +165,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): in_width = int(dt_bits * ifm_ch) return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """For streaming maxpool out stream width is the same as 
in stream width""" return self.get_instream_width() diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -319,6 +319,7 @@ module $TOPNAME$( ap_clk, ap_rst_n, count, +maxcount, in0_$HLS_SNAME$_TDATA, in0_$HLS_SNAME$_TVALID, in0_$HLS_SNAME$_TREADY, @@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY input ap_clk; input ap_rst_n; output $COUNT_RANGE$ count; +output $COUNT_RANGE$ maxcount; input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; input in0_$HLS_SNAME$_TVALID; output in0_$HLS_SNAME$_TREADY; @@ -346,6 +348,7 @@ $LAYER_NAME$ .clock(ap_clk), .reset(!ap_rst_n), .count(count), + .maxcount(maxcount), .i_d(in0_$HLS_SNAME$_TDATA), .i_v(in0_$HLS_SNAME$_TVALID), .i_r(in0_$HLS_SNAME$_TREADY), diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index 5383cc1f4bdf9eb88c7d7bd69c25231282f11c6f..f2cc64668d62ef15446772309577e9b15a378ef5 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -75,9 +75,6 @@ class Thresholding_Batch(HLSCustomOp): "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - # input and output FIFO depths - "inFIFODepth": ("i", False, 0), - "outFIFODepth": ("i", False, 0), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -185,11 +182,11 @@ class Thresholding_Batch(HLSCustomOp): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def 
get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] @@ -221,11 +218,11 @@ class Thresholding_Batch(HLSCustomOp): self.set_nodeattr("weightDataType", tdt.name) return DataType[self.get_nodeattr("weightDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() return i_bits * self.get_nodeattr("PE") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") @@ -251,7 +248,7 @@ class Thresholding_Batch(HLSCustomOp): weightstream = self.get_weightstream_width() return max([weightstream, temp_value]) - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") fold = ich // pe @@ -259,17 +256,17 @@ class Thresholding_Batch(HLSCustomOp): folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # same shape as input return self.get_folded_input_shape() - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [ich]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() @@ -960,3 +957,20 @@ class Thresholding_Batch(HLSCustomOp): "Return a list of extra tcl directives for HLS synthesis." 
return ["config_compile -pipeline_style frp"] + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_tmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 7386aa7e6311754b653e94f8d2e9b2a910a1370b..1bd32442a1986d6a86571e85a09322d6c15d8a78 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -218,21 +218,21 @@ class TLastMarker(HLSCustomOp): def get_number_output_values(self): return self.get_nodeattr("NumIters") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") elem_width = self.get_nodeattr("ElemWidth") n_packed_elems = stream_width // elem_width n_iters = self.get_nodeattr("NumIters") return (1, n_iters, n_packed_elems) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_folded_input_shape() - def get_instream_width(self): + def get_instream_width(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") return stream_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") return stream_width diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index eb51fe39fc6e7ec84204f9d541a0e47c333bbf43..a018fd35aac4d63b365e97464dab0fd4a5fa13f2 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ 
b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -73,7 +73,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): exp_cycles = OFMDim * reps return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): IFMDim = self.get_nodeattr("IFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") @@ -84,7 +84,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): ishape = (batch, IFMDim, 1, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): OFMDim = self.get_nodeattr("OFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") @@ -95,11 +95,11 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): oshape = (batch, OFMDim, 1, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) return tuple(normal_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) return tuple(normal_oshape) @@ -129,21 +129,21 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. 
(Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() ifm_ch = self.get_nodeattr("NumChannels") return ibits * ifm_ch - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() ifm_ch = self.get_nodeattr("NumChannels") return obits * ifm_ch diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index bc332b59449a6627a309e3e0b7fb18024b2af14f..16a51a3c909c76497bd8b60c372c589b441a1f01 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -29,6 +29,7 @@ import math import numpy as np import os +import textwrap import warnings from qonnx.core.datatype import DataType from qonnx.util.basic import ( @@ -41,6 +42,7 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, + pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) @@ -68,6 +70,36 @@ class VectorVectorActivation(HLSCustomOp): "accDataType": ("s", False, "INT32"), # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), + # memory mode for the layer weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. 
+ # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # use xnor-popcount for binary weights/inputs, thus treating them + # as bipolar + "binaryXnorMode": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -178,7 +210,7 @@ class VectorVectorActivation(HLSCustomOp): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] @@ -186,11 +218,11 @@ class VectorVectorActivation(HLSCustomOp): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") if simd > 1: @@ -200,12 +232,12 @@ class VectorVectorActivation(HLSCustomOp): in_width = i_bits * simd * pe return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width - 
def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") @@ -216,10 +248,19 @@ class VectorVectorActivation(HLSCustomOp): pe = self.get_nodeattr("PE") sf = k_h * k_w // simd nf = ch // pe - folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe]) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple([1, sf * nf, pe]) + else: + raise Exception("Undefined input shape for requested input") + return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe @@ -227,14 +268,14 @@ class VectorVectorActivation(HLSCustomOp): folded_output_shape = tuple([1, dim_h, dim_w, nf, pe]) return folded_output_shape - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") k_h, k_w = self.get_nodeattr("Kernel") normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ch = self.get_nodeattr("Channels") dim_h, dim_w = self.get_nodeattr("Dim") normal_output_shape = tuple([1, dim_h, dim_w, ch]) @@ -263,13 +304,31 @@ class VectorVectorActivation(HLSCustomOp): ret = dict() inp_hls_str = self.get_input_datatype().get_hls_datatype_str() out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = 
self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # fill in TSrcI and TWeightI - # TODO handle bipolar inputs - if inp_is_bipolar or wt_is_bipolar: - raise Exception("VVAU node doesn't support bipolar values yet.") - else: + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast<XnorMul>" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast<Binary>" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast<Binary>" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): ret["TSrcI"] = "Slice<%s>" % inp_hls_str ret["TWeightI"] = "Identity" @@ -299,6 +358,13 @@ class VectorVectorActivation(HLSCustomOp): return ret def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for bipolar weights&inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") tmem = self.calc_tmem() @@ -308,14 +374,33 @@ class VectorVectorActivation(HLSCustomOp): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] + 
inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + if inp_is_bipolar and wt_is_bipolar: + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix # workaround for vivado_hls threshold bug - if ret[0][0] == 0: + if ret[0][0] == 0 and n_thres_steps == 1: ret = np.copy(ret) ret[0][0] = 1 warnings.warn( "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" ) + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (ch, 1)) + assert ( + ret.shape[0] == ch + ), "Channels of threshold matrix are not as expected (ch)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -332,44 +417,175 @@ class VectorVectorActivation(HLSCustomOp): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def generate_params(self, model, path): - # weights - weights = model.get_initializer(self.onnx_node.input[1]) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights in appropriate format for this + layer. This file can be used for either synthesis or run-time reconfig + of weights. 
+ + Arguments: + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) - wdt = self.get_weight_datatype() - code_gen_dir = path - - """Saves weights into params.h""" - weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True) - # write weights into params.h - f_weights = open("{}/params.h".format(code_gen_dir), "w") - - if wdt.bitwidth() != 1: - f_weights.write( - "const FixedPointWeights<{},{},{},{}> weights = ".format( - self.get_nodeattr("SIMD"), - wdt.get_hls_datatype_str(), - self.get_nodeattr("PE"), - self.calc_wmem(), + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + if weight_file_mode == "hls_header": + weight_hls_code = numpy_to_hls_code( + weight_tensor, export_wdt, "weights", True, True + ) + # write weights into C++ header file as dictated by finn-hlslib + f_weights = open(weight_file_name, "w") + if export_wdt.bitwidth() != 1: + f_weights.write( + "const FixedPointWeights<{},{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), + export_wdt.get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) ) + else: + f_weights.write( + "const BinaryWeights<{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + f_weights.write(weight_hls_code) + f_weights.close() + elif "decoupled" in weight_file_mode: + # create a weight stream for various flavors of decoupled mode: + # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) + weight_tensor_unflipped = np.transpose(weight_tensor, (0, 
2, 1, 3)) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape + pe = self.get_nodeattr("PE") + simd = 1 + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( + 1, -1, pe * simd ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() + # flipped + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + 
weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Unknown weight_file_mode") + else: - f_weights.write( - "const BinaryWeights<{},{},{}> weights = ".format( - self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), self.calc_wmem() + raise Exception("Unknown weight_file_mode") + + def generate_params(self, model, path): + mem_mode = self.get_nodeattr("mem_mode") + code_gen_dir = path + # weights, if not external + weights = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "const": + # save hlslib-compatible weights in params.h + weight_filename = "{}/params.h".format(code_gen_dir) + self.make_weight_file(weights, "hls_header", weight_filename) + elif mem_mode == "decoupled" or mem_mode == "external": + weight_filename_sim = "{}/weights.npy".format(code_gen_dir) + # save decoupled weights for cppsim + self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # note that we provide two different .dat files, one for synth + # and one for synthesis. 
this is because URAM-based weights always + # need zero weights for synthesis, otherwise they get inferred + # as BRAM + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( + code_gen_dir ) + weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) + # sim weights are always the true weights + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl_sim + ) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + # UltraRAM must have no memory initializer, or only zeroes + # otherwise BRAM will be inferred instead of URAM + # as a workaround we provide a zero-weight init here + synth_weights = np.zeros_like(weights, dtype=np.float32) + else: + synth_weights = weights + self.make_weight_file( + synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - f_weights.write(weight_hls_code) - f_weights.close() # save thresholds in thresh.h if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # use UINT32 threshold export for bipolar times bipolar + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # get computed threshold datatype from attribute tdt = DataType[self.get_nodeattr("accDataType")] + assert np.vectorize(tdt.allowed)( threshold_tensor 
).all(), "Thresholds in %s can't be expressed with type %s" % ( @@ -382,8 +598,11 @@ class VectorVectorActivation(HLSCustomOp): # write thresholds into thresh.h f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") tdt_hls = tdt.get_hls_datatype_str() - odt = self.get_output_datatype() - odt_hls = odt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() f_thresh.write( "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ = ".format( @@ -401,6 +620,7 @@ class VectorVectorActivation(HLSCustomOp): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node # TODO ensure codegen dir exists @@ -454,7 +674,28 @@ class VectorVectorActivation(HLSCustomOp): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -480,6 +721,12 @@ class VectorVectorActivation(HLSCustomOp): def 
global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) if self.calc_tmem() != 0: self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] @@ -488,6 +735,8 @@ class VectorVectorActivation(HLSCustomOp): numReps = 1 * dim_h * dim_w k_h, k_w = self.get_nodeattr("Kernel") innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$DEFINES$"] = [ """#define Channels1 {}\n #define InnerProdDim {}\n #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( @@ -498,6 +747,11 @@ class VectorVectorActivation(HLSCustomOp): numReps, ) ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append( + "#define WP1 {}\n".format(wdt.bitwidth()) + ) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -515,7 +769,23 @@ class VectorVectorActivation(HLSCustomOp): % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$STREAMDECLARATIONS$"] = [] 
self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) @@ -523,8 +793,15 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> weights ("weights");'.format( + self.get_weightstream_width() + ) + ) def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") map_to_hls_mult_style = { "auto": "ap_resource_dflt()", "lut": "ap_resource_lut()", @@ -536,16 +813,42 @@ class VectorVectorActivation(HLSCustomOp): threshs = "PassThroughActivation<%s>()" % odtype_hls_str else: threshs = "threshs" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + 
wdtype_hls_str, + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -576,25 +879,49 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<{}>> &in0, - hls::stream<ap_uint<{}>> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &weights, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_weightstream_width(), + self.get_outstream_width(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" ) - ] def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$PRAGMAS$"] = [ "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) - in_fifo_depth = self.get_nodeattr("inFIFODepth") - out_fifo_depth = self.get_nodeattr("outFIFODepth") + # TODO can we deprecate this entirely? 
this looks like legacy code + # that does not really serve a purpose - FIFO sizes are not typically + # allocated at this point; at best they are set to 2 as the default + in_fifo_depth = 2 + out_fifo_depth = 2 # insert depth pragmas only if specified if in_fifo_depth != 0: self.code_gen_dict["$PRAGMAS$"].append( @@ -608,12 +935,30 @@ class VectorVectorActivation(HLSCustomOp): "#pragma HLS INTERFACE ap_ctrl_none port=return" ) - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint<ch*prec> [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint<ch*prec> [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=weights.m_weights " + "complete dim=1" + ) + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights name=weights_" + + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS stream depth=8 variable=weights" + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds self.code_gen_dict["$PRAGMAS$"].append( @@ -629,6 +974,157 @@ class VectorVectorActivation(HLSCustomOp): ) ) + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append( + ("weights_" + sname, 
self.get_weightstream_width_padded()) + ) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.NSTREAMS {1} " + "CONFIG.MEM_DEPTH {%d} 
" + "CONFIG.MEM_WIDTH {%d} " + "CONFIG.MEM_INIT {%s} " + "CONFIG.RAM_STYLE {%s} " + "CONFIG.STRM0_DEPTH {%d} " + "CONFIG.STRM0_WIDTH {%d} " + "CONFIG.STRM0_OFFSET {0} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("ram_style"), + self.calc_wmem(), + self.get_weightstream_width_padded(), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + 
cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const" or mem_mode == "external": + # base class impl sufficient for const/external modes + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for VectorVectorActivation") + return cmd + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = 1 + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + def bram_estimation(self): """Calculates resource estimation for BRAM""" # TODO add in/out FIFO contributions @@ -639,7 +1135,13 @@ class VectorVectorActivation(HLSCustomOp): # assuming SDP mode RAMB18s (see UG573 Table 1-10) # since this is HLS memory, not using the full width of a BRAM # assuming memories up to 128 deep get implemented in LUTs - if self.calc_wmem() <= 128: + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): return 0 if W == 1: @@ -686,8 +1188,12 @@ class VectorVectorActivation(HLSCustomOp): c0 = 300 c1 = 1.1 c2 = 0 - if self.calc_wmem() <= 128: - c2 = P * W * math.ceil(self.calc_wmem() / 64) + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * W) * math.ceil(self.calc_wmem() / 64) # multiplication res_type = self.get_nodeattr("resType") @@ -725,6 +1231,25 @@ class 
VectorVectorActivation(HLSCustomOp): mult_dsp = 0 return int(mult_dsp) + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled and external modes.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * wp + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled and external modes.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels") @@ -748,3 +1273,20 @@ class VectorVectorActivation(HLSCustomOp): thres_count = fm ret_dict[thres_param_type] = thres_count return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/qnn-data/build_dataflow/expected_output.npy b/src/finn/qnn-data/build_dataflow/expected_output.npy index a8d09384633791b7e3760dc8a2d1ba88a05d526d..98037351bb4ee49985a98631750f18e9b86965b1 100644 Binary files a/src/finn/qnn-data/build_dataflow/expected_output.npy and b/src/finn/qnn-data/build_dataflow/expected_output.npy differ diff --git a/src/finn/qnn-data/build_dataflow/input.npy b/src/finn/qnn-data/build_dataflow/input.npy index
edd24de05a33a15ebc330cdab31f3d77d2c47196..8bece67b7daf5b7668ff5e7515f15a891146b00b 100644 Binary files a/src/finn/qnn-data/build_dataflow/input.npy and b/src/finn/qnn-data/build_dataflow/input.npy differ diff --git a/src/finn/qnn-data/testcase/residual_testcase.onnx b/src/finn/qnn-data/testcase/residual_testcase.onnx new file mode 100644 index 0000000000000000000000000000000000000000..c96e8c694e3a39cdb9e5d984e1c069ceb55b3f2a Binary files /dev/null and b/src/finn/qnn-data/testcase/residual_testcase.onnx differ diff --git a/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh new file mode 100644 index 0000000000000000000000000000000000000000..1c8b6403e8628e3647810ca5fca65ca1122eaf9d --- /dev/null +++ b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh @@ -0,0 +1,346 @@ +// (c) Copyright 2011-2013 Xilinx, Inc. All rights reserved. +// +// This file contains confidential and proprietary information +// of Xilinx, Inc. and is protected under U.S. and +// international copyright and other intellectual property +// laws. +// +// DISCLAIMER +// This disclaimer is not a license and does not grant any +// rights to the materials distributed herewith. 
Except as +// otherwise provided in a valid license issued to you by +// Xilinx, and to the maximum extent permitted by applicable +// law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +// WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +// AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +// BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +// INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +// (2) Xilinx shall not be liable (whether in contract or tort, +// including negligence, or under any other theory of +// liability) for any loss or damage of any kind or nature +// related to, arising under or in connection with these +// materials, including for any direct, or any indirect, +// special, incidental, or consequential loss or damage +// (including loss of data, profits, goodwill, or any type of +// loss or damage suffered as a result of any action brought +// by a third party) even if such damage or loss was +// reasonably foreseeable or Xilinx had been advised of the +// possibility of the same. +// +// CRITICAL APPLICATIONS +// Xilinx products are not designed or intended to be fail- +// safe, or for use in any application requiring fail-safe +// performance, such as life-support or safety devices or +// systems, Class III medical devices, nuclear facilities, +// applications related to the deployment of airbags, or any +// other applications that could lead to death, personal +// injury, or severe property or environmental damage +// (individually and collectively, "Critical +// Applications"). Customer assumes the sole risk and +// liability of any use of Xilinx products in Critical +// Applications, subject only to applicable laws and +// regulations governing limitations on product liability. +// +// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +// PART OF THIS FILE AT ALL TIMES. 
+//----------------------------------------------------------------------------- +// +// Generic Functions used by AXIS-Interconnect and Infrastrucutre Modules +// +// Verilog-standard: Verilog 2001 +//-------------------------------------------------------------------------- +// Global Parameters: +// +// Functions: +// f_clogb2 +// f_gcd +// f_lcm +// f_get_tdata_indx +// f_get_tstrb_indx +// f_get_tkeep_indx +// f_get_tlast_indx +// f_get_tid_indx +// f_get_tdest_indx +// f_get_tuser_indx +// f_payload_width +// Tasks: +// t_display_tdata_error +//-------------------------------------------------------------------------- +/////////////////////////////////////////////////////////////////////////////// +// BEGIN Global Parameters +/////////////////////////////////////////////////////////////////////////////// +// Define Signal Set indices +localparam G_INDX_SS_TREADY = 0; +localparam G_INDX_SS_TDATA = 1; +localparam G_INDX_SS_TSTRB = 2; +localparam G_INDX_SS_TKEEP = 3; +localparam G_INDX_SS_TLAST = 4; +localparam G_INDX_SS_TID = 5; +localparam G_INDX_SS_TDEST = 6; +localparam G_INDX_SS_TUSER = 7; +localparam G_MASK_SS_TREADY = 32'h1 << G_INDX_SS_TREADY; +localparam G_MASK_SS_TDATA = 32'h1 << G_INDX_SS_TDATA; +localparam G_MASK_SS_TSTRB = 32'h1 << G_INDX_SS_TSTRB; +localparam G_MASK_SS_TKEEP = 32'h1 << G_INDX_SS_TKEEP; +localparam G_MASK_SS_TLAST = 32'h1 << G_INDX_SS_TLAST; +localparam G_MASK_SS_TID = 32'h1 << G_INDX_SS_TID ; +localparam G_MASK_SS_TDEST = 32'h1 << G_INDX_SS_TDEST; +localparam G_MASK_SS_TUSER = 32'h1 << G_INDX_SS_TUSER; + +// Task DRC error levels +localparam G_TASK_SEVERITY_ERR = 2; +localparam G_TASK_SEVERITY_WARNING = 1; +localparam G_TASK_SEVERITY_INFO = 0; + +/////////////////////////////////////////////////////////////////////////////// +// BEGIN Functions +/////////////////////////////////////////////////////////////////////////////// +// ceiling logb2 + function integer f_clogb2 (input integer size); + integer s; + begin + s = size; + s = s 
- 1; + for (f_clogb2=1; s>1; f_clogb2=f_clogb2+1) + s = s >> 1; + end + endfunction // clogb2 + + // Calculates the Greatest Common Divisor between two integers using the + // euclidean algorithm. + function automatic integer f_gcd ( + input integer a, + input integer b + ); + begin : main + integer A, B, done, swap; + A = a; + B = b; + done = 0; + while(!done) + begin + if (A < B ) begin + swap = A; + A = B; + B = swap; + end else if ( B != 0 ) begin + A = A - B; + end else begin + done = 1; + end + end + + f_gcd = A; + end + endfunction + + + // Calculates the Least Common Multiple between two integers + function integer f_lcm ( + input integer a, + input integer b + ); + begin : main + f_lcm = ( a / f_gcd(a, b)) * b; + end + endfunction + + // Returns back the index to the TDATA portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tdata_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + f_get_tdata_indx = 0; + end + endfunction + + // Returns back the index to the tstrb portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tstrb_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdata_indx(DAW, IDW, DEW, USW, SST); + // If TDATA exists, then add its width to its base to get the tstrb index + f_get_tstrb_indx = SST[G_INDX_SS_TDATA] ? cur_indx + DAW : cur_indx; + end + endfunction + + // Returns back the index to the tkeep portion of TPAYLOAD, returns 0 if the + // signal is not enabled.
+ function integer f_get_tkeep_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tstrb_indx(DAW, IDW, DEW, USW, SST); + f_get_tkeep_indx = SST[G_INDX_SS_TSTRB] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tlast portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tlast_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tkeep_indx(DAW, IDW, DEW, USW, SST); + f_get_tlast_indx = SST[G_INDX_SS_TKEEP] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tid portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tid_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tlast_indx(DAW, IDW, DEW, USW, SST); + f_get_tid_indx = SST[G_INDX_SS_TLAST] ? cur_indx + 1 : cur_indx; + end + endfunction + + // Returns back the index to the tdest portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tdest_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tid_indx(DAW, IDW, DEW, USW, SST); + f_get_tdest_indx = SST[G_INDX_SS_TID] ? 
cur_indx + IDW : cur_indx; + end + endfunction + + // Returns back the index to the tuser portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tuser_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdest_indx(DAW, IDW, DEW, USW, SST); + f_get_tuser_indx = SST[G_INDX_SS_TDEST] ? cur_indx + DEW : cur_indx; + end + endfunction + + // Payload is the sum of all the AXIS signals present except for + // TREADY/TVALID + function integer f_payload_width ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tuser_indx(DAW, IDW, DEW, USW, SST); + f_payload_width = SST[G_INDX_SS_TUSER] ? cur_indx + USW : cur_indx; + // Ensure that the return value is never less than 1 + f_payload_width = (f_payload_width < 1) ? 1 : f_payload_width; + end + endfunction + + task t_check_tdata_width( + input integer data_width, + input [8*80-1:0] var_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tdata_width + if (data_width%8 != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s (%2d) must be a multiple of 8.", var_name, data_width); + $display(" AXI4-Stream data width is only defined for byte multiples. 
See the "); + $display(" AMBA4 AXI4-Stream Protocol Specification v1.0 Section 2.1 for more"); + $display(" information."); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask + + task t_check_tuser_width( + input integer tuser_width, + input [8*80-1:0] tuser_name, + input integer tdata_width, + input [8*80-1:0] tdata_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tuser_width + integer tdata_bytes; + tdata_bytes = tdata_width/8; + if ((tuser_width%tdata_bytes) != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s == %2d is not the recommended value of 'an integer ", tuser_name, tuser_width); + $display(" multiple of the width of the interface (%s == %2d) in bytes.' AXI4-Stream", tdata_name, tdata_width); + $display(" TUSER width in this module is only defined when the TUSER is the"); + $display(" recommended value. See the AMBA4 AXI4-Stream Protocol Specification v1.0"); + $display(" Section 2.1, 2.3.3 and 2.8 for more information. 
"); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 429bc34ffc59b5d98bb559f36ac557de4dbba92f..7e4ab34af79c52a08e737f57b2fc8f017940bcf5 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -48,6 +48,10 @@ from finn.transformation.fpgadataflow.minimize_accumulator_width import ( class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" + def __init__(self, use_rtl_variant=False): + super().__init__() + self.use_rtl_variant = use_rtl_variant + def apply(self, model): graph = model.graph node_ind = 0 @@ -128,105 +132,144 @@ class InferConvInpGen(Transformation): ) graph.node.insert(node_ind, padding_node) - # Ensure that only supported HLS nodes are inserted + is_kernel_pointwise = k_h == 1 and k_w == 1 is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w is_square_kernel = k_h == k_w - is_kernel_pointwise = k_h == 1 and k_w == 1 is_equal_stride = stride_h == stride_w is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( k_h > 1 and k_w == 1 and ifm_dim_w == 1 ) - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - assert is_square_image, ( - "%s : DownSampler currently only supports square input images." - % n.name - ) - assert is_equal_stride, ( - """%s : DownSampler currently only supports equal stride value - along different axes.""" - % n.name - ) - ConvInpGen_idim = ConvInpGen_idim_h - stride = stride_h - # create DownSampler node + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + if is_kernel_pointwise: + is_rtl_variant_compatible = False + if self.use_rtl_variant: + warnings.warn( + """%s : RTL ConvInpGen requested for unsupported + configuration. 
Falling back to HLS implementation.""" + % n.name + ) + + if self.use_rtl_variant and is_rtl_variant_compatible: + ConvInpGen_node = helper.make_node( - "DownSampler", + "ConvolutionInputGenerator_rtl", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ImgDim=ConvInpGen_idim, - NumChannels=ifm_ch, + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], SIMD=ifm_ch, - Stride=stride, + M=1, + parallel_window=0, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], inputDataType=dt.name, - name="DownSampler_" + n.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_rtl_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: - # create equivalent ConvolutionInputGenerator node - if ( - is_square_image and is_square_kernel - ): # square images and square kernels - assert is_equal_stride, ( - """%s: Non-equal strides along different axes is not supported - for (non-)square convolutions""" - % n.name - ) - assert dilation_h == 1 and dilation_w == 1, ( - """%s: Dilation value != 1 is not supported - for square convolutions""" - % n.name + # Ensure that only supported HLS nodes are inserted + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + is1D_unitx = ifm_dim_w == 1 + downsample_2D = ( + (not downsample_1D) and is_square_image and is_equal_stride ) + if not (downsample_1D or downsample_2D): + warnings.warn( + f"Couldn't infer Downsample from {n.name},check config." 
+ ) + continue + ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) + stride = max(stride_h, stride_w) + # create DownSampler node ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", + "DownSampler", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], + Stride=stride, inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_" + n.name, + name="DownSampler_" + n.name, + is1D=downsample_1D, + is1D_unitx=is1D_unitx, ) - else: # 1D images and/or kernels - assert is_1d_convolution, ( - "%s: ConvolutionInputGenerator1D works only for 1D convs" - % n.name - ) - if dilation_h > 1 or dilation_w > 1: - assert depthwise == 1, ( - """%s: Dilation value > 1 is only supported for - 1D depthwise separable convolutions""" + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + else: + # create equivalent ConvolutionInputGenerator node + if ( + is_square_image and is_square_kernel + ): # square images and square kernels + assert is_equal_stride, ( + """%s: Non-equal strides along different axes is not supported + for (non-)square convolutions""" % n.name ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator1D", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator1D_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, 
ConvInpGen_node) + assert dilation_h == 1 and dilation_w == 1, ( + """%s: Dilation value != 1 is not supported + for square convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_" + n.name, + ) + else: # 1D images and/or kernels + assert is_1d_convolution, ( + """%s: ConvolutionInputGenerator1D works only + for 1D convs""" + % n.name + ) + if dilation_h > 1 or dilation_w > 1: + assert depthwise == 1, ( + """%s: Dilation value > 1 is only supported for + 1D depthwise separable convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator1D", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator1D_" + n.name, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) graph_modified = True @@ -870,6 +913,10 @@ class InferVectorVectorActivation(Transformation): a depthwise convolution. 
Any immediately following MultiThreshold layers will also be absorbed into the VVAU.""" + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + def apply(self, model): graph = model.graph node_ind = 0 @@ -970,6 +1017,7 @@ class InferVectorVectorActivation(Transformation): ActVal=actval, noActivation=0, name="VectorVectorActivation_" + n.name, + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -1234,6 +1282,7 @@ class InferDuplicateStreamsLayer(Transformation): inputDataType=dt.name, numInputVectors=vecs, NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, name="DuplicateStreams_Batch_" + node.name, ) @@ -1661,6 +1710,7 @@ class InferConcatLayer(Transformation): ElemsPerStream=elems_per_stream, inputDataType=dt0.name, numInputVectors=inp_vec, + inFIFODepths=[2] * len(node.input), ) graph.node.insert(node_ind, new_node) # remove old node @@ -1671,3 +1721,95 @@ class InferConcatLayer(Transformation): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferStreamingEltwise(Transformation): + """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer + with SubEltwise or AbsDiffEltwise op.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Sub": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) + + # skip conversion for layers with float input + if not (idt0.is_integer() and idt1.is_integer()): + continue + + eltwiseOp = "Sub" + nodes_to_remove = [node] + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer 
is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + + # create and insert new Eltwise node + new_node = helper.make_node( + "StreamingEltwise", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType0=idt0.name, + inputDataType1=idt1.name, + eltwiseOp=eltwiseOp, + numInputVectors=in0_shape[:-1], + name="StreamingEltwise_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old nodes + for nd in nodes_to_remove: + graph.node.remove(nd) + graph_modified = True + + # if graph_modified: + # model = model.transform(InferShapes()) + # model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 892ab09fdf41947f86e2bf122e057e94585dfa8c..52e4e88b409766f0764d3ce7666dbf1971713575 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ 
b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -228,6 +228,22 @@ class CreateStitchedIP(Transformation): ) self.s_axis_idx += 1 + def connect_ap_none_external(self, node): + inst_name = node.name + node_inst = getCustomOp(node) + input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"] + # make external + for i in range(len(input_intf_names)): + input_intf_name = input_intf_names[i] + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" + % (inst_name, input_intf_name) + ) + self.connect_cmds.append( + "set_property name %s [get_bd_ports %s_0]" + % (input_intf_name, input_intf_name) + ) + def insert_signature(self, checksum_count): signature_vlnv = "AMD:user:axi_info_top:1.0" signature_name = "axi_info_top0" @@ -275,7 +291,7 @@ class CreateStitchedIP(Transformation): "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name ) self.connect_cmds.append( - "set_property name s_axis_info [get_bd_intf_ports s_axi_0]" + "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]" ) self.connect_cmds.append("assign_bd_address") @@ -305,6 +321,7 @@ class CreateStitchedIP(Transformation): ip_dirs += [ip_dir_value] self.create_cmds += node_inst.code_generation_ipi() self.connect_clk_rst(node) + self.connect_ap_none_external(node) self.connect_axi(node) for i in range(len(node.input)): if not is_external_input(model, node, i): @@ -387,6 +404,7 @@ class CreateStitchedIP(Transformation): wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name) tcl.append("add_files -norecurse %s" % wrapper_filename) model.set_metadata_prop("wrapper_filename", wrapper_filename) + tcl.append("set_property top finn_design_wrapper [current_fileset]") # synthesize to DCP and export stub, DCP and constraints if self.vitis: tcl.append( @@ -565,6 +583,10 @@ class CreateStitchedIP(Transformation): if os.path.isfile(wrapper_filename_alt): model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise 
Exception("CreateStitchedIP failed, no wrapper HDL found.") + raise Exception( + """CreateStitchedIP failed, no wrapper HDL found under %s or %s. + Please check logs under the parent directory.""" + % (wrapper_filename, wrapper_filename_alt) + ) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py new file mode 100644 index 0000000000000000000000000000000000000000..822679721036c7832241db4642911ff804fb9dff --- /dev/null +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -0,0 +1,190 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import qonnx.custom_op.registry as registry +import warnings +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.base import NodeLocalTransformation + +from finn.util.fpgadataflow import is_fpgadataflow_node + + +class DeriveCharacteristic(NodeLocalTransformation): + """For each node in the graph, run rtlsim to obtain the i/o + characteristic function for FIFO sizing and set the attribute. + It is assumed that the PrepareRTLSim transformation was already + called on the graph. + + This transformation performs rtlsim for each node, so it will run for + some time (minutes to hours depending on configuration). + + * period (int) desired period over which the characteristic function + will be derived. + + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + """ + + def __init__(self, period, num_workers=None, manual_bypass=False): + super().__init__(num_workers=num_workers) + self.period = period + self.manual_bypass = manual_bypass + + def applyNodeLocal(self, node): + op_type = node.op_type + if is_fpgadataflow_node(node) is True: + try: + # lookup op_type in registry of CustomOps + inst = registry.getCustomOp(node) + inst.derive_characteristic_fxns(period=self.period) + except KeyError: + # exception if op_type is not supported + raise Exception( + "Custom op_type %s is currently not supported." 
% op_type + ) + return (node, False) + + def apply(self, model: ModelWrapper): + (model, run_again) = super().apply(model) + if not self.manual_bypass: + return (model, run_again) + # apply manual fix for DuplicateStreams and AddStreams for + # simple residual reconvergent paths with bypass + addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + for addstrm_node in addstrm_nodes: + # we currently only support the case where one branch is + # a bypass + b0 = model.find_producer(addstrm_node.input[0]) + b1 = model.find_producer(addstrm_node.input[1]) + if (b0 is None) or (b1 is None): + warnings.warn("Found unsupported AddStreams, skipping") + return (model, run_again) + b0_is_bypass = b0.op_type == "DuplicateStreams_Batch" + b1_is_bypass = b1.op_type == "DuplicateStreams_Batch" + if (not b0_is_bypass) and (not b1_is_bypass): + warnings.warn("Found unsupported AddStreams, skipping") + return (model, run_again) + ds_node = b0 if b0_is_bypass else b1 + comp_branch_last = b1 if b0_is_bypass else b0 + + ds_comp_bout = ds_node.output[0] if b0_is_bypass else ds_node.output[1] + comp_branch_first = model.find_consumer(ds_comp_bout) + if comp_branch_first is None or comp_branch_last is None: + warnings.warn("Found unsupported DuplicateStreams, skipping") + return (model, run_again) + comp_branch_last = registry.getCustomOp(comp_branch_last) + comp_branch_first = registry.getCustomOp(comp_branch_first) + # for DuplicateStreams, use comp_branch_first's input characterization + # for AddStreams, use comp_branch_last's output characterization + period = comp_branch_first.get_nodeattr("io_chrc_period") + comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[ + : 2 * period + ] + comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[ + 2 * period : + ] + ds_node_inst = registry.getCustomOp(ds_node) + addstrm_node_inst = registry.getCustomOp(addstrm_node) + ds_node_inst.set_nodeattr("io_chrc_period", period) + 
ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2) + addstrm_node_inst.set_nodeattr("io_chrc_period", period) + addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2) + warnings.warn( + f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}" + ) + warnings.warn( + f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}" + ) + return (model, run_again) + + +class DeriveFIFOSizes(NodeLocalTransformation): + """Prerequisite: DeriveCharacteristic already called on graph. + For each node in the graph, use the accumulated I/O characteristic function + to perform FIFO sizing, setting the in/outFIFODepth attributes of HLSCustomOp + nodes. + + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + """ + + def __init__(self, num_workers=None): + super().__init__(num_workers=num_workers) + + def applyNodeLocal(self, node): + op_type = node.op_type + if is_fpgadataflow_node(node) is True: + try: + # lookup op_type in registry of CustomOps + prod = registry.getCustomOp(node) + assert op_type != "StreamingFIFO", "Found existing FIFOs" + period = prod.get_nodeattr("io_chrc_period") + prod_chrc = prod.get_nodeattr("io_chrc_out")[0] + assert ( + len(prod_chrc) == 2 * period + ), "Found unexpected characterization attribute" + if any([x > 2 for x in prod.get_nodeattr("outFIFODepths")]): + # FIFO depth already set, can skip this node + return (node, False) + + # find consumers + model = self.ref_input_model + out_fifo_depths = [] + for output_name in node.output: + cons_node = model.find_consumer(output_name) + if cons_node is None: + # could be final node, will be overridden if so + # need an entry in the list anyway + out_fifo_depths.append(2) + continue + cons = registry.getCustomOp(cons_node) + cons_chrc = cons.get_nodeattr("io_chrc_in")[0] + # find minimum phase shift satisfying the constraint + pshift_min = period - 1 + for pshift_cand in 
range(period): + prod_chrc_part = prod_chrc[pshift_cand:period] + cons_chrc_part = cons_chrc[: period - pshift_cand] + if (prod_chrc_part >= cons_chrc_part).all(): + pshift_min = pshift_cand + break + prod_chrc_part = prod_chrc[pshift_min : (pshift_min + period)] + cons_chrc_part = cons_chrc[:period] + fifo_depth = int((prod_chrc_part - cons_chrc_part).max()) + out_fifo_depths.append(fifo_depth) + # set output FIFO depth for this (producing) node + # InsertFIFO looks at the max of (outFIFODepth, inFIFODepth) + # for each tensor + prod.set_nodeattr("outFIFODepths", out_fifo_depths) + + except KeyError: + # exception if op_type is not supported + raise Exception( + "Custom op_type %s is currently not supported." % op_type + ) + return (node, False) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..efc179923545eb06e4d173c683b0941887f8bb79 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -81,6 +81,12 @@ class InsertDWC(Transformation): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() + larger_width = max(dwc_in_width, dwc_out_width) + smaller_width = min(dwc_in_width, dwc_out_width) + if larger_width % smaller_width == 0: + impl_style = "hls" + else: + impl_style = "vivado" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -105,6 +111,7 @@ class InsertDWC(Transformation): inWidth=dwc_in_width, outWidth=dwc_out_width, dataType=str(dtype.name), + impl_style=impl_style, ) # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 78200b280960ad53e3e84d44394c10296c432ba5..79bd717a5d96e7a9839740d73254db53e5133e13 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ 
b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -70,16 +70,26 @@ class InsertFIFO(Transformation): node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth' of the subsequent node. max() of these two values sets the FIFO depth. - Normally, shallow-depth (<=2) FIFOs won't be created since HLS streaming - interfaces already have a degree of buffering. You can set - create_shallow_fifos=True to override this default behavior. + Constructor arguments: + - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of + Verilog FIFOs (Q_srl.v) + - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for + large FIFOs implemented by Vivado + - create_shallow_fifos : Normally, shallow-depth (<=2) FIFOs won't be created since + HLS streaming interfaces already have a degree of buffering. + Override with this parameter. + The other node attributes necessary to create a FIFO node are taken from the node the FIFO node is inserted after: 'folded_shape' and 'dtype'""" - def __init__(self, create_shallow_fifos=False): + def __init__( + self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto" + ): super().__init__() self.create_shallow_fifos = create_shallow_fifos + self.max_qsrl_depth = max_qsrl_depth + self.vivado_ram_style = vivado_ram_style def apply(self, model): graph = model.graph @@ -88,8 +98,8 @@ class InsertFIFO(Transformation): for first_node in graph.node: node_ind += 1 if _suitable_node(first_node): - for n_output in first_node.output: - consumers = model.find_consumers(n_output) + for idx_out, output_name in enumerate(first_node.output): + consumers = model.find_consumers(output_name) if consumers == []: continue if len(consumers) > 1: @@ -108,11 +118,9 @@ class InsertFIFO(Transformation): # input of the second node is equal n1 = getCustomOp(consumer) for idx, inp in enumerate(consumer.input): - if inp == n_output: - if idx == 0: - fld_shape_2 = n1.get_folded_input_shape() - else: - 
fld_shape_2 = n1.get_folded_input_shape(ind=idx) + if inp == output_name: + fld_shape_2 = n1.get_folded_input_shape(ind=idx) + idx_inp = idx assert _suitable_folded_shapes( fld_shape, fld_shape_2 ), """The @@ -122,12 +130,10 @@ class InsertFIFO(Transformation): # check if outFIFOdepth attribute of first node # and inFIFOdepth attribute of consumer node is equal - n0_depth = n0.get_nodeattr("outFIFODepth") - n1_depth = n1.get_nodeattr("inFIFODepth") - if n0_depth == n1_depth: - fifo_depth = n0_depth - elif n0_depth != n1_depth: - fifo_depth = max(n0_depth, n1_depth) + n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out] + n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp] + + fifo_depth = max(n0_depth, n1_depth) if fifo_depth > 2 or self.create_shallow_fifos: # assumption: HLS streaming components already have @@ -143,25 +149,40 @@ class InsertFIFO(Transformation): graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) + if ( + self.max_qsrl_depth is None + or fifo_depth <= self.max_qsrl_depth + ): + impl_style = "rtl" + else: + impl_style = "vivado" + fifo_node = oh.make_node( "StreamingFIFO", - [n_output], + [output_name], [fifo_output_tensor.name], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, ) # insert fifo graph.node.insert(node_ind + 1, fifo_node) # set fifo output tensor as new input tensor of second node for idx, inp in enumerate(consumer.input): - if inp == n_output: + if inp == output_name: consumer.input[idx] = fifo_output_tensor.name # ensure created FIFO depth is reflected on both sides - n0.set_nodeattr("outFIFODepth", fifo_depth) - n1.set_nodeattr("inFIFODepth", fifo_depth) + odepths = n0.get_nodeattr("outFIFODepths") + odepths[idx_out] = fifo_depth + n0.set_nodeattr("outFIFODepths", odepths) + idepths = n1.get_nodeattr("inFIFODepths") + idepths[idx_inp] = 
fifo_depth + n1.set_nodeattr("inFIFODepths", idepths) + graph_modified = True if graph_modified is False: @@ -177,13 +198,9 @@ class InsertFIFO(Transformation): n_input = first_node.input[inp_ind] n0 = getCustomOp(first_node) # determine fifo node attributes - if inp_ind == 0: - fld_shape = n0.get_folded_input_shape() - dtype = n0.get_input_datatype() - else: - fld_shape = n0.get_folded_input_shape(inp_ind) - dtype = n0.get_input_datatype(inp_ind) - fifo_depth = n0.get_nodeattr("inFIFODepth") + fld_shape = n0.get_folded_input_shape(inp_ind) + dtype = n0.get_input_datatype(inp_ind) + fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] if fifo_depth <= 2: warnings.warn("Overriding input FIFO depth to 32") @@ -198,6 +215,11 @@ class InsertFIFO(Transformation): graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) + if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth: + impl_style = "rtl" + else: + impl_style = "vivado" + fifo_node = oh.make_node( "StreamingFIFO", [n_input], @@ -207,6 +229,8 @@ class InsertFIFO(Transformation): depth=fifo_depth, folded_shape=fld_shape, dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, ) # insert fifo graph.node.insert(0, fifo_node) @@ -227,10 +251,11 @@ class InsertFIFO(Transformation): ), """Insert tlast marker should be done after inserting the FIFOs""" n0 = getCustomOp(final_node) + out_ind = list(final_node.output).index(graph_out_name) # determine fifo node attributes - fld_shape = n0.get_folded_output_shape() - dtype = n0.get_output_datatype() - fifo_depth = n0.get_nodeattr("outFIFODepth") + fld_shape = n0.get_folded_output_shape(out_ind) + dtype = n0.get_output_datatype(out_ind) + fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] if fifo_depth <= 2: warnings.warn("Overriding output FIFO depth to 32") @@ -245,6 +270,11 @@ class InsertFIFO(Transformation): graph.value_info.append(fifo_input_tensor) 
model.set_tensor_datatype(fifo_input_tensor.name, dtype) + if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth: + impl_style = "rtl" + else: + impl_style = "vivado" + fifo_node = oh.make_node( "StreamingFIFO", [fifo_input_tensor.name], @@ -254,6 +284,8 @@ class InsertFIFO(Transformation): depth=fifo_depth, folded_shape=fld_shape, dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, ) # insert fifo graph.node.append(fifo_node) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index a589cb039c825ff97c11df7ffa57109df27f3fd0..f48566326e576f4d39d81359fe7f28a12645a635 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -45,7 +45,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.util.basic import make_build_dir, pynq_part_map +from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map from . 
import templates @@ -320,6 +320,7 @@ class ZynqBuild(Transformation): ): super().__init__() self.fpga_part = pynq_part_map[platform] + self.axi_port_width = pynq_native_port_width[platform] self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug @@ -330,7 +331,7 @@ class ZynqBuild(Transformation): model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels prep_transforms = [ - InsertIODMA(64), + InsertIODMA(self.axi_port_width), InsertDWC(), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..f715aaeffb6d4d00f2e14c5fb25ec931443d5d97 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -192,10 +192,11 @@ class InsertAndSetFIFODepths(Transformation): - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of Verilog FIFOs (Q_srl.v) - max_depth : how deep the "max"-sized FIFOs initially inserted will be + if set to None, use the tensor size as the depth - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs smaller where appropriate - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado + large FIFOs implemented by Vivado afterwards Assumed input graph properties: - all nodes are fpgadataflow nodes @@ -210,7 +211,7 @@ class InsertAndSetFIFODepths(Transformation): necessary to insert FIFOs between them to prevent stalls due to bursty behavior. 
The sizes of those FIFOs are hard to predict analytically, so we do the following: - - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes + - insert deep (=tensor size) FIFOs between all fpgadataflow nodes - create stitched design - run through rtlsim with stream of multiple random input images (to fill pipeline) - keep track of observed maximum occupancy for each FIFO during rtlsim @@ -223,7 +224,7 @@ class InsertAndSetFIFODepths(Transformation): fpgapart, clk_ns=10.0, max_qsrl_depth=256, - max_depth=2**14, + max_depth=None, swg_exception=True, vivado_ram_style="auto", ): @@ -236,6 +237,9 @@ class InsertAndSetFIFODepths(Transformation): self.vivado_ram_style = vivado_ram_style def apply(self, model): + # these optypes may potentially use external weights + # we'll temporarily change them to use decoupled mode for FIFO sizing + extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] @@ -246,9 +250,22 @@ class InsertAndSetFIFODepths(Transformation): ) assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node" node = getCustomOp(node) - node.set_nodeattr("inFIFODepth", self.max_depth) - node.set_nodeattr("outFIFODepth", self.max_depth) - if node.onnx_node.op_type == "MatrixVectorActivation": + ifd = node.get_nodeattr("inFIFODepths") + ofd = node.get_nodeattr("outFIFODepths") + if self.max_depth is not None: + ifd = [self.max_depth] * len(ifd) + ofd = [self.max_depth] * len(ofd) + else: + # set each FIFO to its tensor size + # (except stream width hence the :-1) + for i in range(len(ifd)): + ifd[i] = np.prod(node.get_folded_input_shape(i)[:-1]) + for o in range(len(ofd)): + ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1]) + node.set_nodeattr("inFIFODepths", ifd) + node.set_nodeattr("outFIFODepths", ofd) + + if node.onnx_node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if 
mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -267,13 +284,17 @@ class InsertAndSetFIFODepths(Transformation): # gather FIFO names, check they are of expected depth fifos = {} - for node in model.graph.node: - if node.op_type == "StreamingFIFO": - fifos[node.name] = 0 - node = getCustomOp(node) - # check depths and fix as necessary - if node.get_nodeattr("depth") != self.max_depth: - node.set_nodeattr("depth", self.max_depth) + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + for node in fifo_nodes: + fifos[node.name] = 0 + node = getCustomOp(node) + node.set_nodeattr("depth_monitor", 1) + node.set_nodeattr("impl_style", "rtl") + # check depths and fix as necessary + if (self.max_depth is not None) and ( + node.get_nodeattr("depth") != self.max_depth + ): + node.set_nodeattr("depth", self.max_depth) # insert FIFOs and do all transformations for RTLsim model = model.transform(AnnotateCycles()) @@ -324,21 +345,6 @@ class InsertAndSetFIFODepths(Transformation): else: set_signal(sim, "tvalid", 0) - # check/update all fifo counts - for key in fifos: - current_state = sim.internals["finn_design_i"][key]["inst"][ - key + "_" + key - ]["state"] - current_addr = sim.internals["finn_design_i"][key]["inst"][ - key + "_" + key - ]["addr"] - if current_state == 2: - current_count = current_addr + 2 - else: - current_count = current_state - if current_count > fifos[key]: - fifos[key] = current_count - # since latency estimation is very pessimistic, detect first output # and fast-forward the sim if get_signal(sim, "tvalid") != 0 and not output_detected: @@ -352,6 +358,12 @@ class InsertAndSetFIFODepths(Transformation): "No output detected, calculated FIFO depths may not be correct" ) + for ind, node in enumerate(fifo_nodes): + maxcount_name = "maxcount_%d" % ind + if ind == 0: + maxcount_name = "maxcount" + fifos[node.name] = sim[maxcount_name] + # Apply depths back into the model; # also set in/outFIFODepth to zero for non-FIFO # nodes, 
preventing further FIFO insertion @@ -364,6 +376,7 @@ class InsertAndSetFIFODepths(Transformation): depth = optimize_depth(fifos[node.name]) node_inst = getCustomOp(node) node_inst.set_nodeattr("depth", depth) + node_inst.set_nodeattr("depth_monitor", 0) # Set FIFO implementation/ram styles if depth > self.max_qsrl_depth: node_inst.set_nodeattr("impl_style", "vivado") @@ -374,11 +387,14 @@ class InsertAndSetFIFODepths(Transformation): reset_implementation(node_inst) del fifos[node.name] else: - getCustomOp(node).set_nodeattr("inFIFODepth", 0) - getCustomOp(node).set_nodeattr("outFIFODepth", 0) - # for every FC node we changed from external to decoupled, + inst = getCustomOp(node) + ifd = inst.get_nodeattr("inFIFODepths") + ofd = inst.get_nodeattr("outFIFODepths") + inst.set_nodeattr("inFIFODepths", [0] * len(ifd)) + inst.set_nodeattr("outFIFODepths", [0] * len(ofd)) + # for every extw node we changed from external to decoupled, # change back and reset implementation - if node.op_type == "MatrixVectorActivation": + if node.op_type in extw_optypes: if node.name in modified_fc_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("mem_mode", "external") diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 23943084ab99d6ab880a69975e0b4a49756905a7..e24e24f1f8ebb2873c81617884cd333311d8aea9 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -109,6 +109,7 @@ class SetFolding(Transformation): "FMPadding_Batch", "ConvolutionInputGenerator", "ConvolutionInputGenerator1D", + "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring @@ -171,10 +172,7 @@ class SetFolding(Transformation): "Expected SWU on DW op input, found " + swu_node.op_type ) elif op_type in simd_ops: - if op_type in [ - "ConvolutionInputGenerator", - 
"ConvolutionInputGenerator1D", - ]: + if op_type.startswith("ConvolutionInputGenerator"): depthwise = node_inst.get_nodeattr("depthwise") if depthwise == 0: max_simd = node_inst.get_nodeattr("IFMChannels") diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 3e815c1537353cc2be970a2068d4ded30cc48bc8..29eefacc32370598ddcd39283d022f5eb61f3f0c 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -725,6 +725,77 @@ class MakeMaxPoolNHWC(Transformation): return (model, graph_modified) +class MakeScaleResizeNHWC(Transformation): + """ + Converts the inputs and outputs for all scales Resize and Upsample nodes + from NCHW to NHWC. + """ + + def apply(self, model): + graph = model.graph + node_ind = 0 + for n in graph.node: + node_ind += 1 + if n.op_type == "Upsample" or n.op_type == "Resize": + if model.get_tensor_layout(n.input[0]) != DataLayout.NCHW: + warnings.warn( + "%s: Input not NCHW. Can't operate transformation on node." 
+ % n.name + ) + continue + consumer = model.find_consumer(n.output[0]) + producer = model.find_producer(n.input[0]) + if n.op_type == "Upsample": + scales_ind = 1 + else: + scales_ind = 2 + if producer is not None and producer.op_type == "Transpose": + perms = list(get_by_name(producer.attribute, "perm").ints) + if perms == [0, 3, 1, 2]: + old_value = model.get_initializer(n.input[scales_ind]) + new_value = np.array( + [old_value[idx] for idx in (0, 2, 3, 1)], + dtype=np.dtype("float32"), + ) + model.set_initializer(n.input[scales_ind], new_value) + start_name = producer.input[0] + mid_name = n.input[0] + end_name = n.output[0] + (b, hi, wi, c) = model.get_tensor_shape(start_name) + (b, c, ho, wo) = model.get_tensor_shape(end_name) + producer.input[0] = mid_name + producer.output[0] = end_name + n.input[0] = start_name + n.output[0] = mid_name + model.set_tensor_shape(mid_name, (b, ho, wo, c)) + model.set_tensor_shape(end_name, (b, c, ho, wo)) + graph.node.remove(producer) + graph.node.insert(node_ind, producer) + elif consumer is not None and consumer.op_type == "Transpose": + perms = list(get_by_name(consumer.attribute, "perm").ints) + if perms == [0, 2, 3, 1]: + old_value = model.get_initializer(n.input[scales_ind]) + new_value = np.array( + [old_value[idx] for idx in (0, 2, 3, 1)], + dtype=np.dtype("float32"), + ) + model.set_initializer(n.input[scales_ind], new_value) + start_name = n.input[0] + mid_name = consumer.input[0] + end_name = consumer.output[0] + (b, c, hi, wi) = model.get_tensor_shape(start_name) + (b, c, ho, wo) = model.get_tensor_shape(mid_name) + consumer.input[0] = start_name + consumer.output[0] = mid_name + n.input[0] = mid_name + n.output[0] = end_name + model.set_tensor_shape(mid_name, (b, hi, wi, c)) + model.set_tensor_shape(end_name, (b, ho, wo, c)) + graph.node.remove(consumer) + graph.node.insert(node_ind - 1, consumer) + return (model, False) + + class MoveOpPastFork(Transformation): """Move node operations past graph forks. 
Used when a node before a fork can be merged with nodes in the branches diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index f6a51da8e44ea60ae5693cdd033b39bdf51376ac..d7ed3e261fe024b7f054382f12184628d3f3e94c 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -26,7 +26,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pkg_resources as pk + import os +import shutil from pyverilator import PyVerilator from finn.util.basic import get_rtlsim_trace_depth, make_build_dir @@ -74,14 +77,35 @@ def pyverilate_stitched_ip( # are identical but in multiple directories (regslice_core.v) # remove duplicates from list by doing list -> set -> list - all_verilog_files = list( - set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs)) + src_exts = [".v", ".sv"] + + all_verilog_src_files = list( + set( + filter( + lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs + ) + ) + ) + + verilog_header_dir = make_build_dir("pyverilator_vh_") + # use custom version of axis infrastructure vh + # to enable Verilator to simulate AMD/Xilinx components (e.g DWC) + custom_vh = pk.resource_filename( + "finn.qnn-data", "verilog/custom_axis_infrastructure.vh" ) + shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh") + for fn in all_verilog_srcs: + if fn.endswith(".vh"): + if "axis_infrastructure_v1_1_0.vh" in fn: + # skip, we use a custom version for this file without recursive gcd + continue + else: + shutil.copy(fn, verilog_header_dir) # remove all but one instances of regslice_core.v filtered_verilog_files = [] remove_entry = False - for vfile in all_verilog_files: + for vfile in all_verilog_src_files: if "regslice_core" in vfile: if not remove_entry: filtered_verilog_files.append(vfile) @@ -94,7 +118,12 @@ def pyverilate_stitched_ip( for vfile in 
filtered_verilog_files: with open(vfile) as rf: wf.write("//Added from " + vfile + "\n\n") - wf.write(rf.read()) + lines = rf.read() + for line in lines.split("\n"): + # break down too-long lines, Verilator complains otherwise + if len(line) > 20000: + line = line.replace("&", "\n&") + wf.write("\n" + line) verilator_args = [] # disable common verilator warnings that should be harmless but commonly occur @@ -108,10 +137,20 @@ def pyverilate_stitched_ip( # force inlining of all submodules to ensure we can read internal signals properly if read_internal_signals: verilator_args += ["--inline-mult", "0"] + # add defines to make certain XPM src files work with Verilator + verilator_args.append("-DDISABLE_XPM_ASSERTIONS") + verilator_args.append("-DOBSOLETE") + verilator_args.append("-DONESPIN") + verilator_args.append("--bbox-unsup") + vivado_path = os.environ["VIVADO_PATH"] + # additional SystemVerilog modules to make XPMs work with Verilator + xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv" + xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv" + xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv" sim = PyVerilator.build( - top_module_file_name, - verilog_path=[vivado_stitch_proj_dir], + [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc], + verilog_path=[vivado_stitch_proj_dir, verilog_header_dir], build_dir=build_dir, trace_depth=get_rtlsim_trace_depth(), top_module_name=top_module_name, diff --git a/src/finn/util/test.py b/src/finn/util/test.py index f5d3b1c30b8b7b439eae1c684ad84b33a3401c7c..bfe4aa0bb826c73f6a7c67f025e24764da8c36cc 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -180,6 +180,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=Fa sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) sdp_node.set_nodeattr("model", child_path) + sdp_node.set_nodeattr("return_full_exec_context", 1 if return_full_ctx 
else 0) ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True) if return_full_ctx: return ret diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 103f18b514c23c4e1ad35a85d020dc0481aa9c47..5f787d1f889645d04884aed9b89a0b1c91d1f418 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -569,8 +569,8 @@ class TestEnd2End: for node in hls_layers: if node.op_type != "StreamingFIFO": op_inst = getCustomOp(node) - assert op_inst.get_nodeattr("inFIFODepth") == 0 - assert op_inst.get_nodeattr("outFIFODepth") == 0 + assert op_inst.get_nodeattr("inFIFODepths") == [0] + assert op_inst.get_nodeattr("outFIFODepths") == [0] model.save( get_checkpoint_name( topology, wbits, abits, QONNX_export, "fifodepth_" + kind diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py index 5bbaefac2d3e5f800fbb9471df6469235271c2f3..7b3e20616410f54e4718290baec9a510a0d49c0d 100644 --- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py @@ -66,11 +66,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode ], ) @pytest.mark.parametrize("depthwise", [False, True]) +@pytest.mark.parametrize("use_rtl_swg", [False, True]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): +def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): pad, kernel_size, stride, dilation = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -84,6 +85,9 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] + if use_rtl_swg and exec_mode == "cppsim": + pytest.skip("cppsim not supported for RTL SWG") + if depthwise is True: 
group = out_chn = in_chn conv_param_shape = [out_chn, 1, k_h, k_w] @@ -139,7 +143,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen()) + new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) if depthwise is True: new_model = new_model.transform(to_hls.InferVectorVectorActivation()) else: diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index 55dc77cafb898ead28a7cbb9641e0b40db276919..8c9f110c315089ec03354863bf2213963197217a 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -57,11 +57,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)] ) @pytest.mark.parametrize("depthwise", [False, True]) +@pytest.mark.parametrize("use_rtl_swg", [False, True]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): +def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): kernel_size, stride, pad = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -69,6 +70,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): in_feature_dim = 7 in_chn = 16 + if use_rtl_swg and exec_mode == "cppsim": + pytest.skip("cppsim not supported for RTL SWG") + + if use_rtl_swg and kernel_size == 1: + pytest.skip("1x1 kernel not supported by current RTL SWG") + if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, kernel_size, kernel_size] @@ -122,7 +129,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): model 
= model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen()) + new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) if depthwise is True: new_model = new_model.transform(to_hls.InferVectorVectorActivation()) else: @@ -156,6 +163,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): x = gen_finn_dt_tensor(idt, input_shape) inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) + if kernel_size == 1 and stride > 1 and pad == 0: assert new_model.graph.node[1].op_type == "DownSampler" if exec_mode == "rtlsim": diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py new file mode 100644 index 0000000000000000000000000000000000000000..5fd1439bd055782692bac404622137e166ef5e07 --- /dev/null +++ b/tests/fpgadataflow/test_fifosizing.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import pytest + +import json +import shutil +from brevitas.export.onnx.generic.manager import BrevitasONNXManager + +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import make_build_dir +from finn.util.test import get_trained_network_and_ishape + + +def fetch_test_model(topology, wbits=2, abits=2): + tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology) + (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits) + chkpt_name = tmp_output_dir + "/model.onnx" + BrevitasONNXManager.export(model, ishape, chkpt_name) + return tmp_output_dir + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fifosizing_linear(): + tmp_output_dir = fetch_test_model("tfc") + cfg = build_cfg.DataflowBuildConfig( + output_dir=tmp_output_dir, + auto_fifo_depths=True, + auto_fifo_strategy="characterize", + target_fps=10000, + synth_clk_period_ns=10.0, + board="Pynq-Z1", + rtlsim_batch_size=100, + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + ], + default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, + ) + build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) + with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: + est_data = 
json.load(f) + with open(tmp_output_dir + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + assert ( + float(sim_data["throughput[images/s]"]) + / float(est_data["estimated_throughput_fps"]) + > 0.9 + ) + shutil.rmtree(tmp_output_dir) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py new file mode 100755 index 0000000000000000000000000000000000000000..007360a5fd0b74ee49d54c84f332061dd5f3a114 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -0,0 +1,260 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + im2col_node = helper.make_node( + "Im2Col", + ["inp"], + ["outp"], + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], + pad_amount=[0, 0, 0, 0], + pad_value=0, + ) + graph = helper.make_graph( + nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] + ) + 
+ model = helper.make_model(graph, producer_name="im2col-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def make_single_slidingwindow_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 +): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + SlidingWindow_node = helper.make_node( + "ConvolutionInputGenerator_rtl", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ifm_dim_h, ifm_dim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=simd, + M=m, + parallel_window=parallel_window, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=idt.name, + outputDataType=odt.name, + depthwise=dw, + ) + graph = helper.make_graph( + nodes=[SlidingWindow_node], + name="slidingwindow_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="slidingwindow-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["UINT4"]]) +# kernel size +@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]]) +# input dimension +@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [6]) +# Stride +@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) +# Dilation 
+@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1, 2, 3, 6]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_slidingwindow_rtl( + idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip +): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) + if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( + k_w == 1 and (stride_w != 1 or dilation_w != 1) + ): + pytest.skip( + """Illegal convolution configuration: + stride or dilation defined for unitary kernel dim""" + ) + if k_h == 1 and k_w == 1 and simd != ifm_ch: + pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)") + if parallel_window and simd != ifm_ch: + pytest.skip("Parallel window requires SIMD=C") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + model = make_single_slidingwindow_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + simd=simd, + m=m, + parallel_window=parallel_window, + stride=stride, + dilation=dilation, + idt=idt, + dw=dw, + ) + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareRTLSim()) + + # prepare input data + input_dict = prepare_inputs(x) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + golden = make_single_im2col_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + stride=stride, + dilation=dilation, + idt=idt, + ) + y_expected = oxe.execute_onnx(golden, input_dict)["outp"] + + if dw == 0: + assert (y_produced == 
y_expected).all() + else: + y_expected = y_expected.reshape( + 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd + ) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) + assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py new file mode 100644 index 0000000000000000000000000000000000000000..64da0a2368a69d6037c681d88391eef2844dae2c --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022, Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import onnx.parser as oprs +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False): + np.random.seed(0) + out_dim = compute_conv_output_dim(in_dim, k, stride, 2 * pad_half) + ifm = 8 + ofm = 16 + if is_1d: + if flip_1d: + shape_in = [1, ifm, 1, in_dim] + shape_out = [1, ofm, 1, out_dim] + shape_k = [1, 
k] + shape_s = [1, stride] + shape_p = [0, pad_half, 0, pad_half] + else: + shape_in = [1, ifm, in_dim, 1] + shape_out = [1, ofm, out_dim, 1] + shape_k = [k, 1] + shape_s = [stride, 1] + shape_p = [pad_half, 0, pad_half, 0] + else: + shape_in = [1, ifm, in_dim, in_dim] + shape_out = [1, ofm, out_dim, out_dim] + shape_k = [k, k] + shape_s = [stride, stride] + shape_p = [pad_half, pad_half, pad_half, pad_half] + shape_w = [ofm, ifm] + shape_k + + sstr_in = str(shape_in) + sstr_out = str(shape_out) + sstr_k = str(shape_k) + sstr_s = str(shape_s) + sstr_p = str(shape_p) + sstr_w = str(shape_w) + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{sstr_in} in0) => (float{sstr_out} out0) + < + float{sstr_w} param_w_conv0 + > + {{ + out0 = Conv<kernel_shape={sstr_k}, group=1, pads={sstr_p}, + strides={sstr_s}>(in0, param_w_conv0) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model.set_tensor_datatype("in0", dt_in) + model.set_tensor_datatype("param_w_conv0", dt_w) + model.set_initializer("param_w_conv0", gen_finn_dt_tensor(dt_w, shape_w)) + model = model.transform(InferShapes()) + model = model.transform(LowerConvsToMatMul()) + model = model.transform(InferShapes()) + return model + + +@pytest.mark.parametrize("is_1d", [True, False]) +@pytest.mark.parametrize("flip_1d", [True, False]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): + if flip_1d and not is_1d: + pytest.skip("flip_1d only applicable for is_1d") + in_dim = 32 + k = 1 + stride = 2 + dt_in = DataType["UINT8"] + dt_w = DataType["INT2"] + model = build_model( + is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d + ) + inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0")) + idict = {"in0": inp} + y_expected = execute_onnx(model, idict)["out0"] + model = 
model.transform(to_hls.InferConvInpGen()) + assert len(model.get_nodes_by_op_type("DownSampler")) == 1 + if exec_mode == "cppsim": + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all() + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("DownSampler")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + # small adjustment for 2D testcase due to how rtlsim works: + # output is finished before all pixels are read, since last + # row is dropped (rtlsim finishes based on # of expected + # pixels) + if not is_1d: + exp_cycles = exp_cycles - in_dim + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py new file mode 100644 index 0000000000000000000000000000000000000000..6028a9b9f0fb4a04d0f53fd8c4fae3aac3ae686e --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import numpy as np +import onnx.parser as oprs +import qonnx.core.data_layout as dl +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def build_model(shp, dt0, dt1, do_abs): + np.random.seed(0) + shp_str = str(shp) + if do_abs: + graph = """ + sub_out = Sub(in0, in1) + out0 = Abs(sub_out) + """ + else: + graph = "out0 = Sub(in0, in1)" + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0, float{shp_str} in1) => (float{shp_str} out0) + {{ + {graph} + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model.set_tensor_datatype("in0", dt0) + model.set_tensor_datatype("in1", dt1) + model.set_tensor_layout("in0", dl.NHWC) + model.set_tensor_layout("in1", dl.NHWC) + model = model.transform(InferShapes()) + return model + + +# input datatype for one operand +@pytest.mark.parametrize("dt0", [DataType["UINT4"], DataType["UINT7"]]) +# channels +@pytest.mark.parametrize("ch", [1, 64]) +# folding +@pytest.mark.parametrize("fold", [-1, 2, 1]) +# include Abs output node or not 
+@pytest.mark.parametrize("do_abs", [True, False]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): + if fold == -1: + pe = 1 + else: + pe = max(1, ch // fold) + assert ch % pe == 0 + dt1 = DataType["UINT8"] + shp = [1, 4, 2, ch] + model = build_model(shp, dt0, dt1, do_abs) + in0 = gen_finn_dt_tensor(dt0, shp) + in1 = gen_finn_dt_tensor(dt1, shp) + idict = {"in0": in0, "in1": in1} + y_expected = execute_onnx(model, idict)["out0"] + model = model.transform(to_hls.InferStreamingEltwise()) + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "StreamingEltwise" + getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all(), exec_mode + " failed" + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("StreamingEltwise")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 2e2da0da7a217091d76d0a59a2a36a8e6a28af8e..34928ce45be0fd96d27b153ae28e2128bf306bb5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py 
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -53,12 +53,11 @@ test_fpga_part = pynq_part_map[test_pynq_board] target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style): +def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt): pad_h = padding[0] + padding[2] pad_w = padding[1] + padding[3] idim_h, idim_w = idim - assert pad_style == 2, "only pad_style == 2 supported in hlslib" assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim" odim_h = idim_h + pad_h odim_w = idim_w + pad_w @@ -80,7 +79,6 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty Padding=padding, NumChannels=num_ch, inputDataType=str(idt.name), - PaddingStyle=pad_style, numInputVectors=1, SIMD=simd, ) @@ -101,13 +99,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty # input image dimension @pytest.mark.parametrize("idim", [[8, 8], [10, 8]]) # number of rows and number of cols to add -@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]]) +@pytest.mark.parametrize( + "pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]] +) # number of channels @pytest.mark.parametrize("num_ch", [2, 4]) # Input parallelism @pytest.mark.parametrize("simd", [1, 2]) -# PaddingStyle: selects behavior when (odim-idim)%2 != 0 -@pytest.mark.parametrize("pad_style", [2]) # FINN input datatype @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]]) # execution mode @@ -115,7 +113,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): +def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode): if num_ch % simd != 0: pytest.skip(" num_ch % simd != 0, skipping") @@ -123,19 +121,13 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, 
pad_style, idt, mode): pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] - if idim_h == idim_w and pad_h != pad_w: - pytest.skip( - """Only equal padding along the dimensions for square images - is supported, skipping""" - ) - # generate input data x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch]) input_dict = {"inp": x} odim_h = idim_h + pad_h odim_w = idim_w + pad_w - model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style) + model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) @@ -150,26 +142,8 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): expected_oshape = (1, odim_h, odim_w, num_ch) assert y_produced.shape == expected_oshape - # calculate reference - # calculate correct pad according to parameters - if pad_style == 2: - if pad_h % 2 == 0: - pad_up = pad_h // 2 - else: - pad_up = pad_h // 2 + 1 - if pad_w % 2 == 0: - pad_left = pad_w // 2 - else: - pad_left = pad_w // 2 + 1 - else: - pad_up = pad_h // 2 - pad_left = pad_w // 2 - - pad_down = pad_h - pad_up - pad_right = pad_w - pad_left - y_expected = np.pad( - x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant" + x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant" ) assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index d1895a12675dce69070d280381a9982060e20c21..a7e7eba7ee8de81ec5eebe3e270e8e1d28564a00 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -42,6 +42,7 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from 
finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP @@ -417,3 +418,67 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 + + +# mem_mode: const or decoupled +@pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) +# activation: None or DataType +@pytest.mark.parametrize("act", [DataType["INT4"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["INT4"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +# neuron folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [8]) +# synapse folding, -1 is maximum possible +@pytest.mark.parametrize("sf", [8]) +# HLS matrix width (input features) +@pytest.mark.parametrize("mw", [32]) +# HLS matrix height (output features) +@pytest.mark.parametrize("mh", [32]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): + if nf == -1: + nf = mh + if sf == -1: + sf = mw + pe = mh // nf + simd = mw // sf + assert mh % pe == 0 + assert mw % sf == 0 + # generate weights + W = gen_finn_dt_tensor(wdt, (mw, mh)) + + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + + model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + for node in model.graph.node: + # lookup op_type in registry of CustomOps + inst = getCustomOp(node) + inst.set_nodeattr("mem_mode", mem_mode) + total_fold = nf * sf + exp_total_cycles = total_fold 
+ 10 + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + model = model.transform(DeriveCharacteristic(exp_total_cycles)) + node_inst = getCustomOp(model.graph.node[0]) + period_attr = node_inst.get_nodeattr("io_chrc_period") + assert period_attr == exp_total_cycles + chrc_in = node_inst.get_nodeattr("io_chrc_in") + chrc_out = node_inst.get_nodeattr("io_chrc_out") + assert chrc_in.shape == (1, 2 * exp_total_cycles) + assert chrc_out.shape == (1, 2 * exp_total_cycles) + # first sf cycles should read input continuously + assert (chrc_in[0, :sf] == range(1, sf + 1)).all() + # all outputs should be produced within the exp n of cycles + assert chrc_out[0, exp_total_cycles] == nf diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index f854c997ff2b9d355933e4e2636c0280a47ea762..c54284dee9a99ed1625e7ac803958ec4ca7b4edc 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -33,12 +33,14 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.multithreshold import multithreshold -from qonnx.custom_op.registry import getCustomOp + +# from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer + +# from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim 
import PrepareCppSim @@ -75,7 +77,20 @@ def _calculate_dot_prod_range(dt_a, dt_b, len): def _make_single_vvau_modelwrapper( - W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None + W, + pe, + simd, + k_h, + k_w, + channels, + dim_h, + dim_w, + wdt, + idt, + odt, + T=None, + tdt=None, + mem_mode="const", ): in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] out_shape = [ @@ -114,6 +129,7 @@ def _make_single_vvau_modelwrapper( weightDataType=wdt.name, outputDataType=odt.name, noActivation=no_act, + mem_mode=mem_mode, ) graph = helper.make_graph( @@ -141,7 +157,7 @@ def prepare_inputs(input_tensor): return {"inp": input_tensor} -# mem_mode: const or decoupled +# input datatype @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) @@ -159,13 +175,15 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("k_w", [3, 1]) # Number of input and output channels @pytest.mark.parametrize("channels", [3, 4]) +# memory mode +@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_vvau( - idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, exec_mode + idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode ): if pe == "channels": pe = channels @@ -201,7 +219,7 @@ def test_fpgadataflow_vvau( tdt = DataType["INT32"] model = _make_single_vvau_modelwrapper( - W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt + W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode ) if exec_mode == "cppsim": @@ -242,11 +260,11 @@ def test_fpgadataflow_vvau( assert (y_produced == y_expected).all(), "incorrect result" - if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("VectorVectorActivation")[0] - inst = 
getCustomOp(node) - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - # assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) - # assert exp_cycles != 0 + # if exec_mode == "rtlsim": + # node = model.get_nodes_by_op_type("VectorVectorActivation")[0] + # inst = getCustomOp(node) + # cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + # exp_cycles_dict = model.analysis(exp_cycles_per_layer) + # exp_cycles = exp_cycles_dict[node.name] + # assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + # assert exp_cycles != 0 diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py b/tests/transformation/streamline/test_scale_resize_nhwc.py new file mode 100644 index 0000000000000000000000000000000000000000..f10930f4e7d5aeb98a60630e7e4f48adfc371d59 --- /dev/null +++ b/tests/transformation/streamline/test_scale_resize_nhwc.py @@ -0,0 +1,293 @@ +import pytest + +import numpy as np +import onnx +import onnx.helper as oh +import qonnx.core.data_layout as DataLayout +from onnx import TensorProto +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.streamline.reorder import MakeScaleResizeNHWC + + +def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp_up = oh.make_tensor_value_info( 
+ "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] + ) + + resize_node = oh.make_node( + "Resize", + inputs=["inp", "roi", "scales"], + outputs=["outp_up"], + name="Resize1", + mode=mode, + ) + + transpose_node = onnx.helper.make_node( + "Transpose", + inputs=["outp_up"], + outputs=["outp"], + name="Transpose1", + perm=[0, 2, 3, 1], + ) + + graph = oh.make_graph( + nodes=[resize_node, transpose_node], + name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_up, param, roi], + ) + + model = oh.make_model(graph, producer_name="resize_model1") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + + model.set_tensor_layout("inp", DataLayout.NCHW) + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + + return model + + +def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch] + ) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp_tr = oh.make_tensor_value_info( + "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + + transpose_node = onnx.helper.make_node( + "Transpose", + inputs=["inp"], + outputs=["outp_tr"], + name="Transpose1", + perm=[0, 3, 1, 2], + ) + + resize_node = oh.make_node( + "Resize", + inputs=["outp_tr", "roi", "scales"], + outputs=["outp"], + name="Resize1", + mode=mode, + ) + + graph = oh.make_graph( + nodes=[transpose_node, resize_node], + 
name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_tr, param, roi], + ) + + model = oh.make_model(graph, producer_name="resize_model2") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + model.set_tensor_layout("inp", DataLayout.NHWC) + + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + + return model + + +def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch] + ) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, scales) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp_tr = oh.make_tensor_value_info( + "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + + outp_up = oh.make_tensor_value_info( + "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] + ) + + transpose_node1 = onnx.helper.make_node( + "Transpose", + inputs=["inp"], + outputs=["outp_tr"], + name="Transpose1", + perm=[0, 3, 1, 2], + ) + + resize_node = oh.make_node( + "Resize", + inputs=["outp_tr", "roi", "scales"], + outputs=["outp_up"], + name="Resize1", + mode=mode, + ) + + transpose_node2 = onnx.helper.make_node( + "Transpose", + inputs=["outp_up"], + outputs=["outp"], + name="Transpose2", + perm=[0, 2, 3, 1], + ) + + graph = oh.make_graph( + nodes=[transpose_node1, resize_node, transpose_node2], + name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_up, outp_tr, param, roi], + ) + + model = oh.make_model(graph, producer_name="resize_model3") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + 
model.set_tensor_datatype("outp", idt) + model.set_tensor_layout("inp", DataLayout.NHWC) + + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + + return model + + +def check_transform(model): + graph = model.graph + node_ind = 0 + for n in graph.node: + node_ind += 1 + if n.op_type == "Upsample" or n.op_type == "Resize": + if model.get_tensor_layout(n.output[0]) == DataLayout.NHWC: + return True + return False + + +@pytest.mark.streamline +# input dimension +@pytest.mark.parametrize("ifm_dim", [[2**i, 2**i] for i in range(3, 6)]) +# input channels +@pytest.mark.parametrize("ifm_ch", [3]) +# scales +@pytest.mark.parametrize( + "scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)] +) +# mode +@pytest.mark.parametrize("mode", ["nearest"]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt): + # create models + resize_model1 = create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt) + resize_model2 = create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt) + resize_model3 = create_transpose_resize_transpose( + ifm_dim, ifm_ch, scales, mode, idt + ) + + # set initializers + resize_model1.set_initializer("scales", np.array(scales, dtype=np.float32)) + resize_model2.set_initializer("scales", np.array(scales, dtype=np.float32)) + resize_model3.set_initializer("scales", np.array(scales, dtype=np.float32)) + + # generate input tensor for testing + input_tensor_nchw = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) + input_tensor_nhwc = gen_finn_dt_tensor(idt, [1, ifm_dim[0], ifm_dim[1], ifm_ch]) + input_dict_nchw = {"inp": input_tensor_nchw} + input_dict_nhwc = {"inp": input_tensor_nhwc} + + # execute first model + output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw) + expected1 = output_dict1["outp"] + + # transform Resize into ResizeNHWC + resize_model1 = resize_model1.transform(MakeScaleResizeNHWC()) + 
resize_model1 = resize_model1.transform(InferDataLayouts()) + + # execute transformed model + output_node_name1 = resize_model1.graph.output[0].name + output_dict1 = oxe.execute_onnx( + resize_model1, input_dict_nchw, return_full_exec_context=False + ) + output1 = output_dict1[output_node_name1] + + # compare outputs + assert (expected1 == output1).all() + assert check_transform(resize_model1) + + # execute second model + output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc) + expected2 = output_dict2["outp"] + + # transform Resize into ResizeNHWC + resize_model2 = resize_model2.transform(MakeScaleResizeNHWC()) + resize_model2 = resize_model2.transform(InferDataLayouts()) + + # execute transformed model + output_node_name2 = resize_model2.graph.output[0].name + output_dict2 = oxe.execute_onnx( + resize_model2, input_dict_nhwc, return_full_exec_context=False + ) + output2 = output_dict2[output_node_name2] + + # compare outputs + assert (expected2 == output2).all() + assert check_transform(resize_model2) + + # execute third model + output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc) + expected3 = output_dict3["outp"] + + # transform Resize into ResizeNHWC + resize_model3 = resize_model3.transform(MakeScaleResizeNHWC()) + resize_model3 = resize_model3.transform(InferDataLayouts()) + + # execute transformed model + output_node_name3 = resize_model3.graph.output[0].name + output_dict3 = oxe.execute_onnx( + resize_model3, input_dict_nhwc, return_full_exec_context=False + ) + output3 = output_dict3[output_node_name3] + + # compare outputs + assert (expected3 == output3).all() + assert check_transform(resize_model3) diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index cdf69aebddc4d6af2288774acbff5dd8a52512b3..39f0b0dc89e9388c54a013becb53d9afbfb2ce4e 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -30,6 +30,7 @@ import pkg_resources as pk import pytest +import numpy as np 
import os from shutil import copytree @@ -55,7 +56,6 @@ def test_end2end_build_dataflow_directory(): assert os.path.isfile(output_dir + "/driver/driver.py") assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json") assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json") - assert os.path.isfile(output_dir + "/report/verify_rtlsim.vcd") assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd") assert os.path.isfile( output_dir + "/report/estimate_layer_config_alternatives.json" @@ -68,8 +68,19 @@ def test_end2end_build_dataflow_directory(): assert os.path.isfile(output_dir + "/report/post_synth_resources.xml") assert os.path.isfile(output_dir + "/report/post_route_timing.rpt") # verification outputs - verify_out_dir = output_dir + "/verification_output" - assert os.path.isfile(verify_out_dir + "/verify_initial_python_SUCCESS.npy") - assert os.path.isfile(verify_out_dir + "/verify_streamlined_python_SUCCESS.npy") - assert os.path.isfile(verify_out_dir + "/verify_folded_hls_cppsim_SUCCESS.npy") - assert os.path.isfile(verify_out_dir + "/verify_stitched_ip_rtlsim_SUCCESS.npy") + verif_batchsize = np.load(target_dir + "/input.npy").shape[0] + for i in range(verif_batchsize): + verify_out_dir = output_dir + "/verification_output" + assert os.path.isfile( + verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy" + ) + assert os.path.isfile( + verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy" + ) + assert os.path.isfile( + verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy" + ) + assert os.path.isfile( + verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy" + ) + assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd")