diff --git a/fetch-repos.sh b/fetch-repos.sh index 0026e750b55ede3d72bdf2358447f7edfd123ced..36c9ae55780fe0f945f065d7a0214c683bf513a8 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -32,7 +32,7 @@ FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="79d7c61fbe318bfcd56e3c35bbfb774995a7870c" +HLSLIB_COMMIT="e7f2de91d1a2ddadaaea06b8f4c20e97a575470e" OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv new file mode 100644 index 0000000000000000000000000000000000000000..97517438a0c261e4488b74a677a352f9dc51743b --- /dev/null +++ b/finn-rtllib/swg/swg_template_default.sv @@ -0,0 +1,351 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/ +module $TOP_MODULE_NAME$_controller #( + int unsigned LOOP_H_ITERATIONS = $LOOP_H_ITERATIONS$, + int unsigned LOOP_W_ITERATIONS = $LOOP_W_ITERATIONS$, + int unsigned LOOP_KH_ITERATIONS = $LOOP_KH_ITERATIONS$, + int unsigned LOOP_KW_ITERATIONS = $LOOP_KW_ITERATIONS$, + int unsigned LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$, + + int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$, + bit [INCR_BITWIDTH-1:0] ADDR_INCREMENT_MAP[6] = $ADDR_INCREMENT_MAP$, + + bit IS_DEPTHWISE = $IS_DEPTHWISE$ +)( + input logic clk, + input logic rst_n, + + input logic advance, + output logic [INCR_BITWIDTH-1:0] addr_incr, + output logic [INCR_BITWIDTH-1:0] tail_incr +); + + // state and counters + typedef enum logic [2:0] { + STATE_START, + STATE_LOOP_SIMD, + STATE_LOOP_KW, + STATE_LOOP_KH, + STATE_LOOP_W, + STATE_LOOP_H + } state_e; + state_e State = $INNERMOST_STATE$; + state_e state_next; + + logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS-1; + logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS-1; + logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS-1; + logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS-1; + logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS-1; + + assign addr_incr = ADDR_INCREMENT_MAP[State]; + + // combinational logic for tail_incr generation + uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; + always_comb begin : blkTail + if (tail_incr_inner_condition) + tail_incr = 1; + else if (Counter_loop_w >= 0) + tail_incr = $TAIL_INCR_W$; + else if (Counter_loop_h >= 0) + tail_incr = $TAIL_INCR_H$; + else + tail_incr = $TAIL_INCR_LAST$; + end + + // combinational next state logic + always_comb begin : blkState + state_next = State; + if(State != $INNERMOST_STATE$) state_next = $INNERMOST_STATE$; + else begin + if(Counter_loop_simd < 0) begin + state_next = + (Counter_loop_kw >= 0)? STATE_LOOP_KW : + (Counter_loop_kh >= 0)? STATE_LOOP_KH : + (Counter_loop_w >= 0)? STATE_LOOP_W : + (Counter_loop_h >= 0)? 
STATE_LOOP_H : + /* else */ STATE_START; + end + end + end : blkState + + // sequential logic + always_ff @ (posedge clk) begin + if(!rst_n) begin + State <= $INNERMOST_STATE$; + Counter_loop_h <= LOOP_H_ITERATIONS-1; + Counter_loop_w <= LOOP_W_ITERATIONS-1; + Counter_loop_kh <= LOOP_KH_ITERATIONS-1; + Counter_loop_kw <= LOOP_KW_ITERATIONS-1; + Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1; + end + else if(advance) begin + State <= state_next; + if (State == $INNERMOST_STATE$) begin + if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1; + else begin + Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1; + if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1; + else begin + Counter_loop_kw <= LOOP_KW_ITERATIONS-1; + if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1; + else begin + Counter_loop_kh <= LOOP_KH_ITERATIONS-1; + if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1; + else begin + Counter_loop_w <= LOOP_W_ITERATIONS-1; + if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1; + else Counter_loop_h <= LOOP_H_ITERATIONS-1; + end + end + end + end + end + end + end + +endmodule : $TOP_MODULE_NAME$_controller + +module $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + int unsigned WIDTH, + int unsigned DEPTH +)( + input logic clk, + input logic rst_n, + + input logic write_enable, + input logic [$clog2(DEPTH)-1:0] write_addr, + input logic [WIDTH-1:0] data_in, + + input logic read_enable, + input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) read address of cyclic buffer + output logic [WIDTH-1:0] data_out +); + + $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH]; + logic [WIDTH-1:0] Out = 'x; + always_ff @(posedge clk) begin + if (read_enable) Out <= Ram[read_addr]; + if (write_enable) Ram[write_addr] <= data_in; + end + assign data_out = Out; + +endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable + +module $TOP_MODULE_NAME$_impl #( + int BIT_WIDTH, + int SIMD, + int MMV_IN, + int MMV_OUT, + int LAST_READ_ELEM = $LAST_READ_ELEM$, + int LAST_WRITE_ELEM = $LAST_WRITE_ELEM$, + int BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$, + int ELEM_PER_WINDOW = $ELEM_PER_WINDOW$, + int INCR_BITWIDTH = $INCR_BITWIDTH$ +)( + input logic ap_clk, + input logic ap_rst_n, + + input logic in0_V_V_TVALID, + output logic in0_V_V_TREADY, + input logic [BIT_WIDTH * SIMD * MMV_IN-1:0] in0_V_V_TDATA, + + output logic out_V_V_TVALID, + input logic out_V_V_TREADY, + output logic [BIT_WIDTH * SIMD * MMV_OUT-1:0] out_V_V_TDATA +); + // derived Constants + localparam int unsigned BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; + localparam int unsigned BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; + localparam int unsigned BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + + // main buffer instantiation + uwire [BUF_IN_WIDTH -1:0] window_buffer_in; + uwire [BUF_OUT_WIDTH-1:0] window_buffer_out; + uwire window_buffer_write_enable; + uwire window_buffer_read_enable; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr; + $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + .WIDTH(BUF_IN_WIDTH), + .DEPTH(BUF_ELEM_TOTAL) + ) window_buffer_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + + .write_enable(window_buffer_write_enable), + .write_addr(window_buffer_write_addr), + .data_in(window_buffer_in), + + .read_enable(window_buffer_read_enable), + .read_addr(window_buffer_read_addr), + .data_out(window_buffer_out) + ); + + //controller instantiation + uwire advance_controller; + uwire signed [INCR_BITWIDTH-1:0] addr_incr; + uwire [INCR_BITWIDTH-1:0] 
tail_incr; + $TOP_MODULE_NAME$_controller controller_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + .advance(advance_controller), + .addr_incr(addr_incr), + .tail_incr(tail_incr) + ); + + // Counters/address registers + // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg, + // so we can use automatic sign extension and simplify calculations w/ signed increment. + // Alternatively, we could manually sign-extend and shave off a bit here or there. + logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = 0; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] First_elem_next_window = 0; + logic [$clog2(ELEM_PER_WINDOW) -1:0] Position_in_window = 0; + logic [$clog2(BUF_ELEM_TOTAL)+1 -1:0] Window_buffer_read_addr_reg = 0; + logic [$clog2(BUF_ELEM_TOTAL)-1:0] Window_buffer_write_addr_reg = 0; + + // Control signals/registers + uwire read_cmd = + !reading_done && ( // if there is still an input element left to read + Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride) + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) && + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem) + ) // (over-)write to buffer if oldest buffered element will no longer be needed + ); + uwire read_ok = read_cmd && in0_V_V_TVALID; + uwire reading_done = Newest_buffered_elem == LAST_READ_ELEM; + + uwire fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done; + logic Fetching_done = 0; + + logic Write_cmd = 0; + logic Writing_done = 0; + uwire write_ok = Write_cmd && out_V_V_TREADY; + uwire write_blocked = Write_cmd && !out_V_V_TREADY; + + //assign buffer control + assign window_buffer_write_addr = Window_buffer_write_addr_reg; + assign window_buffer_read_addr = Window_buffer_read_addr_reg; + assign window_buffer_write_enable = read_ok; + assign window_buffer_read_enable = fetch_cmd; + assign advance_controller = fetch_cmd; + + //assign I/O ports + assign window_buffer_in = in0_V_V_TDATA; + assign out_V_V_TDATA = window_buffer_out; + assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed) + assign out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink) + + //main process for advancing counters + always_ff @(posedge ap_clk) begin + if(!ap_rst_n) begin + Newest_buffered_elem <= -1; + Current_elem <= 0; + First_elem_next_window <= 0; + Position_in_window <= 0; + Window_buffer_read_addr_reg <= 0; + Window_buffer_write_addr_reg <= 0; + Fetching_done <= 0; + Write_cmd <= 0; + Writing_done <= 0; + end + else begin + if (read_ok) begin + Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)?
0 : Window_buffer_write_addr_reg + 1; + Newest_buffered_elem <= Newest_buffered_elem+1; + + if (Newest_buffered_elem == LAST_READ_ELEM-1) begin + Window_buffer_write_addr_reg <= 0; + end + //check if this is the last read cycle (reading_done will be true afterwards) + if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin + //start processing of next FM if writing is done already (possible due to unused input elements at the tail end) + //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM) + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Writing_done <= 0; + Fetching_done <= 0; + end + end + + if (fetch_cmd) begin + //count up to track which element index is about to be read from the buffer, and where it is located within the buffer + //use increment value calculated by controller + + // absolute buffer address wrap-around + automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0] ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr); + automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0] ra_correct = + (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL : + (ra < 0)? BUF_ELEM_TOTAL : 0; + Window_buffer_read_addr_reg <= ra + ra_correct; + + //keep track where we are within a window + Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0; + + //update first element of next window to allow buffer overwrite up until that point + if (Position_in_window == 0) + First_elem_next_window <= First_elem_next_window + tail_incr; + + //check if this is the last write cycle (Writing_done will be true afterwards) + if (Current_elem == LAST_WRITE_ELEM) + Fetching_done <= 1; + else + Current_elem <= $signed(Current_elem) + addr_incr; + + // determine if prefetched data will be outstanding in the next cycle + // if we fetch in this cycle -> yes + // if we do not fetch nor write -> do not change + // if we do not fetch but write successfully-> clear outstanding data + Write_cmd <= fetch_cmd; + end + + if (write_ok) + Write_cmd <= fetch_cmd; + + if (write_ok && Fetching_done) begin + //check if this is the last write cycle (Writing_done will be true afterwards) + if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin + //start processing of next FM if reading is done already, or completes in the same cycle + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Fetching_done <= 0; + end else + Writing_done <= 1; + end + end + end + +endmodule : $TOP_MODULE_NAME$_impl diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v new file mode 100644 index 0000000000000000000000000000000000000000..0cc3579a255fddaf1a470d440b9e8ac245abe486 --- /dev/null +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -0,0 +1,75 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +`timescale 1 ns / 1 ps + +module $TOP_MODULE_NAME$ ( +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_rst_n, +input [BUF_IN_WIDTH-1:0] in0_V_TDATA, +input in0_V_TVALID, +output in0_V_TREADY, +output [BUF_OUT_WIDTH-1:0] out_V_TDATA, +output out_V_TVALID, +input out_V_TREADY +); + +// top-level parameters (set via code-generation) +parameter BIT_WIDTH = $BIT_WIDTH$; +parameter SIMD = $SIMD$; +parameter MMV_IN = $MMV_IN$; +parameter MMV_OUT = $MMV_OUT$; + +// derived constants +parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; +parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + +$TOP_MODULE_NAME$_impl +#( + .BIT_WIDTH(BIT_WIDTH), + .SIMD(SIMD), + .MMV_IN(MMV_IN), + .MMV_OUT(MMV_OUT) +) +impl +( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .in0_V_V_TDATA(in0_V_TDATA), + .in0_V_V_TVALID(in0_V_TVALID), + .in0_V_V_TREADY(in0_V_TREADY), + .out_V_V_TDATA(out_V_TDATA), + .out_V_V_TVALID(out_V_TVALID), + .out_V_V_TREADY(out_V_TREADY) +); + +endmodule //TOP_MODULE_NAME diff --git a/requirements.txt b/requirements.txt index 970acc342bb7984e69929d1ef5eaa027b765ced0..9038a5e8170301421529e0b570482316e4fff20a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ onnx==1.11.0 onnxoptimizer onnxruntime==1.11.1 pre-commit==2.9.2 -protobuf==3.20.1 +protobuf==3.20.2 pyscaffold==3.2.1 scipy==1.5.2 setupext-janitor>=1.1.2 diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 49538939d7ddbb4ec1ab4c7920ca25220418d89d..d842d89e234fd59f953a246293d271154d50954a 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -258,6 +258,10 @@ class DataflowBuildConfig: #: Which memory mode will be used for compute layers default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED + #: Force inference of RTL ConvolutionInputGenerator over HLS implementation #: If set to False, falls back to the default behavior of InferConvInpGen() + force_rtl_conv_inp_gen: Optional[bool] = False + #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` #: e.g.
"xilinx_u250_xdma_201830_2" diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index c983432e1e33bd63c6e2d19f4d0fe943de328b2c..8290621056f9e4531693a3266bfb633735a4db33 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -29,6 +29,7 @@ import json import numpy as np import os +import shutil from copy import deepcopy from distutils.dir_util import copy_tree from qonnx.core.modelwrapper import ModelWrapper @@ -121,44 +122,85 @@ def verify_step( verify_out_dir = cfg.output_dir + "/verification_output" intermediate_models_dir = cfg.output_dir + "/intermediate_models" os.makedirs(verify_out_dir, exist_ok=True) - (in_npy, exp_out_npy) = cfg._resolve_verification_io_pair() - if need_parent: - assert ( - cfg.save_intermediate_models - ), "Enable save_intermediate_models for verification" - parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" - child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name - model.save(child_model_fn) - out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name - out_dict = execute_parent( - parent_model_fn, child_model_fn, in_npy, return_full_ctx=True - ) - out_npy = out_dict[out_tensor_name] - else: - inp_tensor_name = model.graph.input[0].name - out_tensor_name = model.graph.output[0].name - inp_dict = {inp_tensor_name: in_npy} - if rtlsim_pre_hook is not None: - out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + (in_npy_all, exp_out_npy_all) = cfg._resolve_verification_io_pair() + bsize_in = in_npy_all.shape[0] + bsize_out = exp_out_npy_all.shape[0] + assert bsize_in == bsize_out, "Batch sizes don't match for verification IO pair" + all_res = True + for b in range(bsize_in): + in_npy = np.expand_dims(in_npy_all[b], axis=0) + exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0) + if need_parent: + assert ( + cfg.save_intermediate_models + ), "Enable save_intermediate_models for verification" + parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" + child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name + model.save(child_model_fn) + parent_model = ModelWrapper(parent_model_fn) + out_tensor_name = parent_model.graph.output[0].name + exp_ishape = parent_model.get_tensor_shape(parent_model.graph.input[0].name) + if in_npy.shape != exp_ishape: + print( + "Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + out_dict = execute_parent( + parent_model_fn, child_model_fn, in_npy, return_full_ctx=True + ) + out_npy = out_dict[out_tensor_name] else: - out_dict = execute_onnx(model, inp_dict, True) - out_npy = out_dict[out_tensor_name] - res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all() - res_to_str = {True: "SUCCESS", False: "FAIL"} - res_str = res_to_str[res] - if cfg.verify_save_full_context: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % ( - step_name, - res_str, - ) - np.savez(verification_output_fn, **out_dict) - else: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % ( - step_name, - res_str, - ) - np.save(verification_output_fn, out_npy) - print("Verification for %s : %s" % (step_name, res_str)) + inp_tensor_name = model.graph.input[0].name + out_tensor_name = model.graph.output[0].name + exp_ishape = model.get_tensor_shape(inp_tensor_name) + if in_npy.shape != exp_ishape: + print( + 
"Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + inp_dict = {inp_tensor_name: in_npy} + if rtlsim_pre_hook is not None: + out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + else: + out_dict = execute_onnx(model, inp_dict, True) + out_npy = out_dict[out_tensor_name] + exp_oshape = exp_out_npy.shape + if out_npy.shape != exp_oshape: + print( + "Verification output has shape %s while model produces %s" + % (str(exp_oshape), str(out_npy.shape)) + ) + print("Attempting to force model shape on verification output") + out_npy = out_npy.reshape(exp_oshape) + + res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all() + all_res = all_res and res + res_to_str = {True: "SUCCESS", False: "FAIL"} + res_str = res_to_str[res] + if cfg.verify_save_full_context: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npz" % ( + step_name, + b, + res_str, + ) + np.savez(verification_output_fn, **out_dict) + else: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npy" % ( + step_name, + b, + res_str, + ) + np.save(verification_output_fn, out_npy) + if cfg.verify_save_rtlsim_waveforms: + vcd_path = model.get_metadata_prop("rtlsim_trace") + if vcd_path is not None and os.path.isfile(vcd_path): + new_vcd_path = vcd_path.replace(".vcd", "_%d.vcd" % b) + shutil.move(vcd_path, new_vcd_path) + print("Verification for %s : %s" % (step_name, res_to_str[all_res])) def prepare_for_stitched_ip_rtlsim(verify_model, cfg): @@ -306,7 +348,10 @@ def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): # needed for convolutions -- TODO always exec? need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0 if need_conv: - model = model.transform(to_hls.InferConvInpGen()) + if cfg.force_rtl_conv_inp_gen: + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + else: + model = model.transform(to_hls.InferConvInpGen()) model = model.transform(to_hls.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 2c7c86c64ea1279cb18cf8342aa20fb2792bdaf5..e5eb483a00f6890f5eeb16c5cec533a4533c9f15 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -36,8 +36,12 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( ConvolutionInputGenerator1D, ) +from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import ( + ConvolutionInputGenerator_rtl, +) from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch +from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch from finn.custom_op.fpgadataflow.iodma import IODMA @@ -67,6 +71,7 @@ custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D +custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl 
custom_op["TLastMarker"] = TLastMarker custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO @@ -85,3 +90,4 @@ custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch custom_op["Lookup"] = Lookup custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum +custom_op["StreamingEltwise"] = StreamingEltwise diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py new file mode 100755 index 0000000000000000000000000000000000000000..399b36e15021af6f449df3e9ba2acdc699a27647 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -0,0 +1,834 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +from math import copysign +from qonnx.core.datatype import DataType +from qonnx.custom_op.general import im2col +from qonnx.custom_op.general.im2col import compute_conv_output_dim + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +# RTL Convolution Input Generator / Sliding Window Generator (SWG) +# Matches and extends the functionality of all ConvolutionInputGenerator_* functions +# in finn-hlslib by generating HDL code for two different implementation styles: +# - Addressable cyclic buffer: to be used when out_width <= in_width +# - Parallel registers + line buffers: to be used when out_width > in_width +# Supports non-square, 1D, strided, dilated, and depthwise convolutions. 
+# Note: the actual data layout produced is different for depthwise and non-depthwise: +# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) + +# NOTE: "Parallel" implementation style not yet implemented in this version! + + +class ConvolutionInputGenerator_rtl(HLSCustomOp): + """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. Generates an RTL ConvolutionInputGenerator + implementation based on (System-)Verilog templates, defined in finn-rtllib/swg.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] + "IFMChannels": ("i", True, 0), + "IFMDim": ("ints", True, []), # [H, W] = [Y, X] + "OFMDim": ("ints", True, []), # [H, W] = [Y, X] + "SIMD": ("i", True, 0), + # additional parallelization parameter - not yet implemented + "M": ("i", False, 1), + # alternative implementation style - not yet implemented + "parallel_window": ("i", False, 0, {0}), + "Stride": ("ints", True, []), # [H, W] = [Y, X] + "Dilation": ("ints", True, []), # [H, W] = [Y, X] + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + "depthwise": ("i", False, 0, {0, 1}), + # FPGA resource type for ConvolutionInputGenerator input buffer + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use URAM + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) + return ishape + + def get_folded_input_shape(self): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + wf = int(ifm_ch / simd) + folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) + return folded_ishape + + def get_normal_output_shape(self): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) + return oshape + + def get_folded_output_shape(self): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + if self.get_nodeattr("parallel_window"): + wf = int((ifm_ch) // simd) + 
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + else: + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) + return folded_oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for ConvInpGen." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + in_width = simd * ibits + return in_width + + def get_outstream_width(self): + if self.get_nodeattr("parallel_window"): + # feed all window pixels in parallel + k_h, k_w = self.get_nodeattr("ConvKernelDim") + return self.get_instream_width() * k_h * k_w + else: + # if parallel variant not in use: same width for output and input stream + return self.get_instream_width() + + def get_number_input_values(self): + folded_ishape = self.get_folded_input_shape() + num_input_elems = np.prod(folded_ishape[:-1]) + return num_input_elems + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + num_output_elems = np.prod(folded_oshape[:-1]) + return num_output_elems + + def get_1d_conv_attrs_normalized(self): + # normalize FM dimensions so that: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension.
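+ # e.g. ifm_dim = [32, 1], k = [4, 1], stride = [2, 1], dilation = [1, 1] + # are flipped to ifm_dim = [1, 32], k = [1, 4], stride = [1, 2], dilation = [1, 1]; + # an input that is already [1, D] passes through unchanged.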
+ ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + + def get_buffer_depth(self): + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + impl_style = self.select_impl_style() + if impl_style == "default": + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + # add additional buffer space in case of stride > 1 + # this minimizes cycle count as it allows an earlier pre-load of inputs + buffer_depth = ( + buffer_min_size + + max( + 0, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + + max( + 0, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + ) + else: + buffer_depth = 0 + raise Exception("Requested impl. style not implemented") + return buffer_depth + + def get_exp_cycles(self): + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h, ofm_dim_w = ofm_dim + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + channel_factor = int(ifm_ch / simd) + + if ifm_dim_h == 1 or ifm_dim_w == 1: + # 1D case + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + if depthwise: + exp_cycles = ( + +ofm_dim_w * k_w * channel_factor + + channel_factor * (k_w - 1) * (stride_w - 1) + - (k_w - 1) + + 2 + ) + else: + exp_cycles = ofm_dim_w * k_w * channel_factor + 2 + else: + # 2D case + buffer_min_size = ( + (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor + cycles_read_block = stride_w * ifm_dim_w * channel_factor + max_cycles = max(cycles_write_block, cycles_read_block) + if depthwise: + max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1) + exp_cycles = buffer_min_size + ofm_dim_h * max_cycles # initial buffering + if depthwise: + exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor + + return int(exp_cycles) + + def bram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + + # NOTE: Actual BRAM usage might be lower in some cases. + # This does not account for the exact Vivado behavior yet. 
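+ # The depth-to-width thresholds below mirror the aspect ratios of a Xilinx + # RAMB18 primitive (512x36, 1Kx18, 2Kx9, 4Kx4, 8Kx2, 16Kx1); treating each + # depth range at its widest usable port is an assumption of this estimate, + # not a guarantee of what synthesis will infer.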
+ buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + if ram_style == "block" or ram_style == "auto": + if buffer_depth <= 512: + ram_width = 36 + elif buffer_depth <= 1024: + ram_width = 18 + elif buffer_depth <= 2048: + ram_width = 9 + elif buffer_depth <= 4096: + ram_width = 4 + elif buffer_depth <= 8192: + ram_width = 2 + else: + ram_width = 1 + + ram_cascade_depth = math.ceil(buffer_depth / 16384) + ram_cascade_width = math.ceil(buffer_width / ram_width) + cascade_savings = 0 + if buffer_depth > 16384: + remainder_depth = buffer_depth % 16384 + if remainder_depth <= 512: + remainder_width = 36 + elif remainder_depth <= 1024: + remainder_width = 18 + elif remainder_depth <= 2048: + remainder_width = 9 + elif remainder_depth <= 4096: + remainder_width = 4 + elif remainder_depth <= 8192: + remainder_width = 2 + else: + remainder_width = 1 + + remainder_cascade_width = math.ceil(buffer_width / remainder_width) + cascade_savings = ram_cascade_width - remainder_cascade_width + + return int(ram_cascade_depth * ram_cascade_width - cascade_savings) + else: + return 0 + + def lut_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + if ram_style == "distributed": + ram_luts = int(buffer_width * math.ceil(buffer_depth / 38)) + else: + ram_luts = 0 + return 300 + ram_luts + + def uram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + + if ram_style == "ultra": + ram_depth = 4096 + ram_width = 72 + ram_cascade_depth = math.ceil(buffer_depth / ram_depth) + ram_cascade_width = math.ceil(buffer_width / ram_width) + return int(ram_cascade_depth * ram_cascade_width) + else: + return 0 + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + raise Exception( + "cppsim not possible for RTL SWG, please set exec_mode to rtlsim" + ) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" + + def prepare_codegen_default(self): + # Default implementation style for MMV_out = 1: addressable cyclic buffer + # Computing incremental addressing scheme directly. + template_path = ( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_default.sv" + ) + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl.
dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, wrap logic doesn't account for this" + assert not ( + abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, wrap logic doesn't account for this" + + # set certain threshold indices to detect when reading/writing finishes + code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] + code_gen_dict["$LAST_WRITE_ELEM$"] = [ + str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1) + ] + + # default controller loop structure: # iterations (counters) map directly + loop_h_iterations = out_dim_h + loop_w_iterations = out_dim_w + loop_kh_iterations = k_h + loop_kw_iterations = k_w + loop_simd_iterations = channel_factor + + if depthwise and channel_factor > 1: + # re-arrange existing controller loop structure for depthwise convolutions + loop_kh_iterations = channel_factor + loop_kw_iterations = k_h + loop_simd_iterations = k_w + addr_incr_end_simd_ = addr_incr_end_simd + addr_incr_end_simd = addr_incr_end_window_elem + addr_incr_end_window_elem = addr_incr_end_window_row + addr_incr_end_window_row = addr_incr_end_simd_ + elem_per_window = k_h * k_w + + tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor + tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["1"] + else: + # depthwise output format is equivalent to non-depthwise if SIMD=C + elem_per_window = k_h * k_w * channel_factor + + tail_incr_w = addr_incr_end_window + buffer_min_size - 1 + tail_incr_h = addr_incr_end_row + buffer_min_size - 1 + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["0"] + + code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)] + code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)] + code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)] + + # support SIMD = IFMChannels and k_w = 1 cases + # for k = [k_h, k_w] = [1, k_w], no adjustment is needed + # for k = [k_h, k_w] = [1, 1], do not use this impl. 
style (mmv_out=K=1) + # innermost loop is executed at least once -> adjust if needed + if loop_simd_iterations == 1: + # skip innermost SIMD loop completely + if loop_kw_iterations == 1: + # skip innermost KW loop completely + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"] + loop_kh_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"] + loop_kw_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"] + loop_simd_iterations -= 1 # -1 because state is initial state + + code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 1)] + code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 1)] + code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 1)] + code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 1)] + code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 1)] + + incr_bitwidth = 1 + math.ceil( + math.log2( + max( + abs(addr_incr_end_simd) + 1, + abs(addr_incr_end_window_elem) + 1, + abs(addr_incr_end_window_row) + 1, + abs(addr_incr_end_window) + 1, + abs(addr_incr_end_row) + 1, + abs(tail_incr_w) + 1, + abs(tail_incr_h) + 1, + abs(tail_incr_last_window) + 1, + ) + ) + ) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$ADDR_INCREMENT_MAP$"] = [ + "'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format( + incr_bitwidth, + int(copysign(incr_bitwidth, addr_incr_end_simd)), + abs(addr_incr_end_simd), + int(copysign(incr_bitwidth, addr_incr_end_window_elem)), + abs(addr_incr_end_window_elem), + int(copysign(incr_bitwidth, addr_incr_end_window_row)), + abs(addr_incr_end_window_row), + int(copysign(incr_bitwidth, addr_incr_end_window)), + abs(addr_incr_end_window), + int(copysign(incr_bitwidth, addr_incr_end_row)), + abs(addr_incr_end_row), + ) + ] + + code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)] + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] + + return template_path, code_gen_dict + + def select_impl_style(self): + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + k_h, k_w = k + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + # check for valid configuration + assert ( + kernel_height <= ifm_dim_h + and kernel_width <= ifm_dim_w + and stride_h <= ifm_dim_h + and stride_w <= ifm_dim_w + ), "Illegal conv configuration: kernel or stride > FM dimension" + + # init folding config + if self.get_nodeattr("parallel_window"): + # mmv_in = M * 1 + mmv_out = M * k_h * k_w + assert ( + ifm_ch == simd + ), "Constraint violated: SIMD must be equal to IFMChannels" + else: + # mmv_in = 1 + mmv_out = 1 + assert ( + ifm_ch % simd == 0 + ), "Constraint violated: SIMD must divide IFMChannels" + + # choose implementation style + if mmv_out > 1 or (k_h == 1 and k_w == 1): + impl_style = "parallel" + assert ( + ifm_ch == simd + ), "Constraint violated: SIMD must be equal to IFMChannels" + else: + impl_style = "default" + + assert ( + impl_style == "default" + ), "ERROR: Parallel window mode not yet implemented" + return impl_style + + def generate_hdl(self): + impl_style = self.select_impl_style() + + # prepare code generation by filling out dictionaries + if impl_style == "default": + template_path, code_gen_dict = self.prepare_codegen_default() + else: + raise Exception("Requested impl. style not implemented") + + # add general parameters to dictionary + code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] + ram_style = self.get_nodeattr("ram_style") + if ram_style == "auto": + code_gen_dict["$RAM_STYLE$"] = [""] + else: + code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)] + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + with open( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_wrapper.v", "r" + ) as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template = template.replace(key, code_gen_line) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ), + "w", + ) as f: + f.write(template) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + "w", + ) as f: + f.write(template_wrapper) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stitch_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + self.get_nodeattr("gen_top_module") + "_wrapper.v", + self.get_nodeattr("gen_top_module") + "_impl.sv", + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_"
+ self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + cmd = [ + "add_files -norecurse %s" + % ( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ) + ), + "add_files -norecurse %s" + % ( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ) + ), + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name), + ] + + return cmd + + def code_generation_ipgen(self, model, fpgapart, clk): + """Normally: Generates C++ code and tcl script for IP generation. + Here: Generates (System-)Verilog code for IP generation.""" + self.generate_hdl() + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index da29a524b6bba7ce0c7a71bc64a44ae128d91709..e9009e1856a2b379911969a69d258163e67c1197 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -36,7 +36,7 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy class DownSampler(HLSCustomOp): - """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function. + """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. 
Basically performs a downsampling of the image, removing rows and columns.""" def __init__(self, onnx_node): @@ -55,6 +55,10 @@ class DownSampler(HLSCustomOp): "inputDataType": ("s", True, ""), # Batch size "numInputVectors": ("i", False, 1), + # 1D (True) or 2D (False) spatial data + "is1D": ("i", False, 0), + # for 1D only: (D, 1) (True) or (1, D) dims + "is1D_unitx": ("i", False, 1), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -66,25 +70,43 @@ class DownSampler(HLSCustomOp): return int(np.floor((idim - 1) / stride) + 1) def get_exp_cycles(self): + is_1D = self.get_nodeattr("is1D") idim = self.get_nodeattr("ImgDim") + idim_total = idim if is_1D else idim * idim channels = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = channels / simd * batch_size * idim * idim + exp_cycles = channels / simd * batch_size * idim_total return int(exp_cycles) def get_normal_input_shape(self): + is_1D = self.get_nodeattr("is1D") + is_1D_unitx = self.get_nodeattr("is1D_unitx") idim = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - ishape = (batch, idim, idim, num_ch) + if is_1D: + if is_1D_unitx: + ishape = (batch, idim, 1, num_ch) + else: + ishape = (batch, 1, idim, num_ch) + else: + ishape = (batch, idim, idim, num_ch) return ishape def get_normal_output_shape(self): + is_1D = self.get_nodeattr("is1D") + is_1D_unitx = self.get_nodeattr("is1D_unitx") odim = self.get_downsampled_odim() + num_ch = self.get_nodeattr("NumChannels") + batch = self.get_nodeattr("numInputVectors") - oshape = (batch, odim, odim, num_ch) + if is_1D: + if is_1D_unitx: + oshape = (batch, odim, 1, num_ch) + else: + oshape = (batch, 1, odim, num_ch) + else: + oshape = (batch, odim, odim, num_ch) return oshape def get_folded_input_shape(self): @@ -204,8 +226,9 @@ class DownSampler(HLSCustomOp): ) def docompute(self): + dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D" self.code_gen_dict["$DOCOMPUTE$"] = [ - """ConvolutionInputGenerator_kernel1<IFMChannels, Input_precision, + f"""ConvolutionInputGenerator_{dim_var}_kernel1<IFMChannels, Input_precision, IFMDim, SIMD,Stride> (in0, out, numReps);""" ] diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py new file mode 100644 index 0000000000000000000000000000000000000000..a29e871fabbc01f0accd6858d69c0a96a5a8c495 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/eltwise.py @@ -0,0 +1,462 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingEltwise(HLSCustomOp): + """Class that corresponds to finn-hlslib StreamingEltwise function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_eltwise_op_lambda(self): + eltwise_op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + tin0 = idt0.get_hls_datatype_str() + tin1 = idt1.get_hls_datatype_str() + tout = odt.get_hls_datatype_str() + eltwise_ops = { + # "Add": "[](auto a, auto b) { return a + b; }", + # "Sub": "[](auto a, auto b) { return a - b; }", + # "AbsDiff": "[](auto a, auto b) { return a>b? a-b : b-a; }", + "Add": f"add<{tin0}, {tin1}, {tout}>()", + "Sub": f"sub<{tin0}, {tin1}, {tout}>()", + "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", + } + return eltwise_ops[eltwise_op] + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self): + return self.get_normal_input_shape() + + def get_folded_output_shape(self): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." 
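The shape helpers above fold the channel dimension across PE lanes; a minimal sketch, assuming made-up values NumChannels=64, PE=8 and numInputVectors=[1, 4, 4], shows the normal-to-folded mapping:

    # Sketch of get_normal_input_shape()/get_folded_input_shape(): the channel
    # dimension is split into ich // pe groups of pe elements, and one group
    # streams through the datapath per cycle.
    ich, pe, vecs = 64, 8, [1, 4, 4]  # assumed example values
    assert ich % pe == 0, "PE must divide NumChannels"
    normal_shape = tuple(vecs + [ich])            # (1, 4, 4, 64)
    folded_shape = tuple(vecs + [ich // pe, pe])  # (1, 4, 4, 8, 8)
    assert folded_shape == (1, 4, 4, 8, 8)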
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt0 = model.get_tensor_datatype(node.input[0]) + if idt0 != self.get_input_datatype(0): + warn_str = "inputDataType0 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(0)), + str(idt0), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType0", idt0.name) + idt1 = model.get_tensor_datatype(node.input[1]) + if idt1 != self.get_input_datatype(1): + warn_str = "inputDataType1 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(1)), + str(idt1), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType1", idt1.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType0") + self.get_nodeattr("inputDataType1") + self.get_nodeattr("eltwiseOp") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required StreamingEltwise attributes do not exist.""" + ) + + return info_messages + + def get_input_datatype(self, id=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType" + str(id))] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + assert idt0.signed() == idt1.signed(), ( + "%s: Inputs must have same signedness" % self.onnx_node.name + ) + idt0_min, idt0_max = idt0.min(), idt0.max() + idt1_min, idt1_max = idt1.min(), idt1.max() + cands = [ + idt0_min - idt1_min, + idt0_min - idt1_max, + idt0_max - idt1_min, + idt0_max - idt1_max, + ] + largest_magnitude = max(map(abs, cands)) + if op == "Add": + if idt0.signed(): + return DataType.get_smallest_possible(idt0.min() + idt1.min()) + else: + return DataType.get_smallest_possible(idt0.max() + idt1.max()) + elif op == "Sub": + return DataType.get_smallest_possible(-largest_magnitude) + elif op == "AbsDiff": + return DataType.get_smallest_possible(largest_magnitude) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype(ind).bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + 
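The worst-case range analysis in get_output_datatype above can be sanity-checked in isolation. The helper below is a hypothetical standalone rewrite: it works on plain (min, max) tuples instead of qonnx DataTypes and returns the exact output range rather than the smallest enclosing type:

    def eltwise_out_range(op, r0, r1):
        # enumerate the four corner differences, as the cands list above does
        cands = [a - b for a in r0 for b in r1]
        mag = max(abs(c) for c in cands)
        if op == "Add":
            return (r0[0] + r1[0], r0[1] + r1[1])
        elif op == "Sub":
            return (-mag, mag)
        elif op == "AbsDiff":
            return (0, mag)
        raise ValueError(op)

    # INT4 = (-8, 7) against UINT4 = (0, 15): both Sub and AbsDiff must be
    # able to represent a magnitude of 23
    assert eltwise_out_range("Sub", (-8, 7), (0, 15)) == (-23, 23)
    assert eltwise_out_range("AbsDiff", (-8, 7), (0, 15)) == (0, 23)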
exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+                has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input0 shape doesn't match expected shape."""
+        export_idt0 = self.get_input_datatype(0)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        # exact same thing for input1
+        inp = context[node.input[1]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input1 shape doesn't match expected shape."""
+        export_idt1 = self.get_input_datatype(1)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits0 = self.get_instream_width(0)
+            nbits1 = self.get_instream_width(1)
+            rtlsim_inp0 = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0
+            )
+            rtlsim_inp1 = npy_to_rtlsim_input(
+                "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+                has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include "eltwise.hpp"',
+            '#include "interpret.hpp"',
+        ]
+
+        self.code_gen_dict["$GLOBALS$"].extend(
+            [
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct absdiff {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a>b? a-b : b-a;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct sub {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a-b;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct add {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a+b;",
+                "}",
+                "};",
+            ]
+        )
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        elem_bits_0 = idt0.bitwidth()
+        elem_bits_1 = idt1.bitwidth()
+        packed_bits_0 = self.get_instream_width(0)
+        packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0
+        packed_bits_1 = self.get_instream_width(1)
+        packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type_0, elem_hls_type_0, elem_bits_0, npy_type, npy_in)
+        )
+        npy_in = "%s/input_1.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in1);'
+            % (packed_hls_type_1, elem_hls_type_1, elem_bits_1, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width(0))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width(1))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        odt = self.get_output_datatype()
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        out_hls_type = odt.get_hls_datatype_str()
+        slice_in0 = "Slice<%s>" % elem_hls_type_0
+        slice_in1 = "Slice<%s>" % elem_hls_type_1
+        slice_out = "Slice<%s>" % out_hls_type
+        eltwise_op_str = self.get_eltwise_op_lambda()
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<{}, {}, {}, {}, {}, {}>(in0, in1, out, {});""".format(
+                "StreamingEltwise",
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PE"),
+                self.get_number_output_values(),
+                slice_in0,
+                slice_in1,
+                slice_out,
+                eltwise_op_str,
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1, + hls::stream<ap_uint<{}>> &out)""".format( + self.onnx_node.name, + self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(), + self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(), + self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 27b23dd32835c265759a8cabfd2a3412844077ca..b0c05d1ad6c74ceaaaa2c932f4add3f0076bda51 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -29,6 +29,7 @@ import math import numpy as np import os +import textwrap import warnings from qonnx.core.datatype import DataType from qonnx.util.basic import ( @@ -41,6 +42,7 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, + pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) @@ -67,6 +69,36 @@ class VectorVectorActivation(HLSCustomOp): "accDataType": ("s", False, "INT32"), # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), + # memory mode for the layer weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. 
+ "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # use xnor-popcount for binary weights/inputs, thus treating them + # as bipolar + "binaryXnorMode": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -198,14 +230,23 @@ class VectorVectorActivation(HLSCustomOp): out_width = o_bits * self.get_nodeattr("PE") return out_width - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") sf = k_h * k_w dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe - folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple([1, sf * nf, pe]) + else: + raise Exception("Undefined input shape for requested input") + return folded_input_shape def get_folded_output_shape(self): @@ -251,13 +292,31 @@ class VectorVectorActivation(HLSCustomOp): ret = dict() inp_hls_str = self.get_input_datatype().get_hls_datatype_str() out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # fill in TSrcI and TWeightI - # TODO handle bipolar inputs - if inp_is_bipolar or wt_is_bipolar: - raise Exception("VVAU node doesn't support bipolar values yet.") - else: + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast<XnorMul>" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast<Binary>" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast<Binary>" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): ret["TSrcI"] = "Slice<%s>" % inp_hls_str ret["TWeightI"] = "Identity" @@ -286,6 +345,13 @@ class VectorVectorActivation(HLSCustomOp): return ret def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for bipolar weights&inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape 
into (PE, TMEM, n_thres_steps) and return + """ ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") tmem = self.calc_tmem() @@ -295,14 +361,33 @@ class VectorVectorActivation(HLSCustomOp): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + if inp_is_bipolar and wt_is_bipolar: + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix # workaround for vivado_hls threshold bug - if ret[0][0] == 0: + if ret[0][0] == 0 and n_thres_steps == 1: ret = np.copy(ret) ret[0][0] = 1 warnings.warn( "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" ) + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (ch, 1)) + assert ( + ret.shape[0] == ch + ), "Channels of threshold matrix are not as expected (ch)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -319,43 +404,173 @@ class VectorVectorActivation(HLSCustomOp): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def generate_params(self, model, path): - # weights - weights = model.get_initializer(self.onnx_node.input[1]) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights in appropriate format for this + layer. This file can be used for either synthesis or run-time reconfig + of weights. 
+ + Arguments: + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) - wdt = self.get_weight_datatype() - code_gen_dir = path - - """Saves weights into params.h""" - weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True) - # write weights into params.h - f_weights = open("{}/params.h".format(code_gen_dir), "w") - - if wdt.bitwidth() != 1: - f_weights.write( - "const FixedPointWeights<1,{},{},{}> weights = ".format( - wdt.get_hls_datatype_str(), - self.get_nodeattr("PE"), - self.calc_wmem(), + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + if weight_file_mode == "hls_header": + weight_hls_code = numpy_to_hls_code( + weight_tensor, export_wdt, "weights", True, True + ) + # write weights into C++ header file as dictated by finn-hlslib + f_weights = open(weight_file_name, "w") + if export_wdt.bitwidth() != 1: + f_weights.write( + "const FixedPointWeights<1,{},{},{}> weights = ".format( + export_wdt.get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) ) + else: + f_weights.write( + "const BinaryWeights<1,{},{}> weights = ".format( + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + f_weights.write(weight_hls_code) + f_weights.close() + elif "decoupled" in weight_file_mode: + # create a weight stream for various flavors of decoupled mode: + # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) + weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape + pe = self.get_nodeattr("PE") + simd = 1 + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() + # flipped + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( + 1, -1, pe * simd ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + 
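The decoupled weight layout above is easiest to see on a toy tensor. Below is a sketch assuming PE=2, WMEM=3 and SIMD=1 (SIMD is fixed to 1 for this layer), followed by the 32-bit word splitting that the decoupled_runtime branch right after this applies to each padded hex line:

    import textwrap
    import numpy as np

    w = np.arange(6).reshape(1, 2, 3, 1)          # (1, PE, WMEM, SIMD)
    w_unflipped = np.transpose(w, (0, 2, 1, 3))   # -> (1, WMEM, PE, SIMD)
    w_pe_flipped = np.flip(w_unflipped, axis=-2)  # PE order reversed for .dat
    dat_lines = w_pe_flipped.reshape(1, -1, 2 * 1)
    assert dat_lines.shape == (1, 3, 2)           # one line per WMEM address

    # decoupled_runtime: each padded line is emitted as 32-bit words,
    # least-significant word first (this is what textwrap.wrap + reverse do)
    hex_line = "0123456789abcdef"                 # assumed 64-bit memory line
    words_32b = textwrap.wrap(hex_line, 8)        # ["01234567", "89abcdef"]
    words_32b.reverse()
    assert words_32b == ["89abcdef", "01234567"]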
words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown weight_file_mode")
+
+        else:
-            f_weights.write(
-                "const BinaryWeights<1,{},{}> weights = ".format(
-                    self.get_nodeattr("PE"), self.calc_wmem()
+            raise Exception("Unknown weight_file_mode")
+
+    def generate_params(self, model, path):
+        mem_mode = self.get_nodeattr("mem_mode")
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode == "const":
+            # save hlslib-compatible weights in params.h
+            weight_filename = "{}/params.h".format(code_gen_dir)
+            self.make_weight_file(weights, "hls_header", weight_filename)
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
+            # save decoupled weights for cppsim
+            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                # note that we provide two different .dat files, one for
+                # simulation and one for synthesis. this is because URAM-based
+                # weights always need zero weights for synthesis, otherwise
+                # they get inferred as BRAM
+                weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(
+                    code_gen_dir
+                )
+                weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
+                # sim weights are always the true weights
+                self.make_weight_file(
+                    weights, "decoupled_verilog_dat", weight_filename_rtl_sim
                 )
+                ram_style = self.get_nodeattr("ram_style")
+                if ram_style == "ultra":
+                    # UltraRAM must have no memory initializer, or only zeroes
+                    # otherwise BRAM will be inferred instead of URAM
+                    # as a workaround we provide a zero-weight init here
+                    synth_weights = np.zeros_like(weights, dtype=np.float32)
+                else:
+                    synth_weights = weights
+                self.make_weight_file(
+                    synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
+                )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
-        f_weights.write(weight_hls_code)
-        f_weights.close()
         # save thresholds in thresh.h
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
             if thresholds is not None:
                 threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                # use UINT32 threshold export for bipolar times bipolar
+                inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+                wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+                # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+                inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+                wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+                bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+                inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+                wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
                 # get computed threshold
datatype from attribute tdt = DataType[self.get_nodeattr("accDataType")] + assert np.vectorize(tdt.allowed)( threshold_tensor ).all(), "Thresholds in %s can't be expressed with type %s" % ( @@ -368,8 +583,11 @@ class VectorVectorActivation(HLSCustomOp): # write thresholds into thresh.h f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") tdt_hls = tdt.get_hls_datatype_str() - odt = self.get_output_datatype() - odt_hls = odt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() f_thresh.write( "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ = ".format( @@ -387,6 +605,7 @@ class VectorVectorActivation(HLSCustomOp): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node # TODO ensure codegen dir exists @@ -440,7 +659,28 @@ class VectorVectorActivation(HLSCustomOp): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -466,6 +706,12 @@ class VectorVectorActivation(HLSCustomOp): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) if self.calc_tmem() != 0: self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] @@ -474,6 +720,8 @@ class VectorVectorActivation(HLSCustomOp): numReps = 1 * dim_h * dim_w k_h, k_w = self.get_nodeattr("Kernel") innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$DEFINES$"] = [ """#define Channels1 {}\n #define InnerProdDim {}\n #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( @@ -483,6 +731,11 @@ class VectorVectorActivation(HLSCustomOp): numReps, ) ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append( + "#define WP1 {}\n".format(wdt.bitwidth()) + ) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -500,7 +753,23 @@ class VectorVectorActivation(HLSCustomOp): % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + 
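For the decoupled and external modes, the weight stream width and the rtlsim weight replay follow directly from the node attributes; a sketch, assuming PE=3, 4-bit weights and a 2x2 output feature map:

    # get_weightstream_width() (defined further below) is PE * weight bits;
    # the padded variant rounds up to a byte multiple, as required by the
    # AXI-Stream spec
    pe, wbits = 3, 4                        # assumed example values
    w_width = pe * wbits                    # 12 bits
    w_width_padded = -(-w_width // 8) * 8   # rounds up to 16
    assert (w_width, w_width_padded) == (12, 16)

    # in rtlsim the weight words are replayed once per output pixel, which is
    # what "weights": wei * num_w_reps expresses above
    dim_h, dim_w = 2, 2
    wei = ["w0", "w1"]                      # one full pass of weight words
    assert len(wei * (dim_h * dim_w)) == 8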
elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) @@ -508,8 +777,15 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> weights ("weights");'.format( + self.get_weightstream_width() + ) + ) def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") map_to_hls_mult_style = { "auto": "ap_resource_dflt()", "lut": "ap_resource_lut()", @@ -521,16 +797,42 @@ class VectorVectorActivation(HLSCustomOp): threshs = "PassThroughActivation<%s>()" % odtype_hls_str else: threshs = "threshs" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -561,17 +863,38 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<{}>> &in0, - hls::stream<ap_uint<{}>> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + 
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &weights, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_weightstream_width(), + self.get_outstream_width(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" ) - ] def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$PRAGMAS$"] = [ "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() ] @@ -593,12 +916,30 @@ class VectorVectorActivation(HLSCustomOp): "#pragma HLS INTERFACE ap_ctrl_none port=return" ) - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint<ch*prec> [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint<ch*prec> [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=weights.m_weights " + "complete dim=1" + ) + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights name=weights_" + + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS stream depth=8 variable=weights" + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds self.code_gen_dict["$PRAGMAS$"].append( @@ -614,6 +955,157 @@ class VectorVectorActivation(HLSCustomOp): ) ) + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append( + ("weights_" + sname, self.get_weightstream_width_padded()) + ) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin 
-mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.NSTREAMS {1} " + "CONFIG.MEM_DEPTH {%d} " + "CONFIG.MEM_WIDTH {%d} " + "CONFIG.MEM_INIT {%s} " + "CONFIG.RAM_STYLE {%s} " + "CONFIG.STRM0_DEPTH {%d} " + "CONFIG.STRM0_WIDTH {%d} " + "CONFIG.STRM0_OFFSET {0} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("ram_style"), + self.calc_wmem(), + self.get_weightstream_width_padded(), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const" or mem_mode == "external": + # base class impl sufficient for const/external modes + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for VectorVectorActivation") + return cmd + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = 1 + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + def bram_estimation(self): 
"""Calculates resource estimation for BRAM""" # TODO add in/out FIFO contributions @@ -624,7 +1116,13 @@ class VectorVectorActivation(HLSCustomOp): # assuming SDP mode RAMB18s (see UG573 Table 1-10) # since this is HLS memory, not using the full width of a BRAM # assuming memories up to 128 deep get implemented in LUTs - if self.calc_wmem() <= 128: + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): return 0 if W == 1: @@ -671,8 +1169,12 @@ class VectorVectorActivation(HLSCustomOp): c0 = 300 c1 = 1.1 c2 = 0 - if self.calc_wmem() <= 128: - c2 = P * W * math.ceil(self.calc_wmem() / 64) + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * W) * math.ceil(self.calc_wmem() / 64) # multiplication res_type = self.get_nodeattr("resType") @@ -710,6 +1212,25 @@ class VectorVectorActivation(HLSCustomOp): mult_dsp = 0 return int(mult_dsp) + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * wp + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels") diff --git a/src/finn/qnn-data/build_dataflow/expected_output.npy b/src/finn/qnn-data/build_dataflow/expected_output.npy index a8d09384633791b7e3760dc8a2d1ba88a05d526d..98037351bb4ee49985a98631750f18e9b86965b1 100644 Binary files a/src/finn/qnn-data/build_dataflow/expected_output.npy and b/src/finn/qnn-data/build_dataflow/expected_output.npy differ diff --git a/src/finn/qnn-data/build_dataflow/input.npy b/src/finn/qnn-data/build_dataflow/input.npy index edd24de05a33a15ebc330cdab31f3d77d2c47196..8bece67b7daf5b7668ff5e7515f15a891146b00b 100644 Binary files a/src/finn/qnn-data/build_dataflow/input.npy and b/src/finn/qnn-data/build_dataflow/input.npy differ diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 429bc34ffc59b5d98bb559f36ac557de4dbba92f..b7db49eb22e0ccb6e3ffbf8ccad44d4274cb2154 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -48,6 +48,10 @@ from finn.transformation.fpgadataflow.minimize_accumulator_width import ( class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" + def __init__(self, use_rtl_variant=False): + super().__init__() + self.use_rtl_variant = use_rtl_variant + def apply(self, model): graph = model.graph node_ind = 0 @@ -128,105 +132,144 @@ class InferConvInpGen(Transformation): ) graph.node.insert(node_ind, padding_node) - # Ensure that only supported HLS nodes are inserted + is_kernel_pointwise = k_h == 1 and k_w == 1 is_square_image = ConvInpGen_idim_h == 
ConvInpGen_idim_w is_square_kernel = k_h == k_w - is_kernel_pointwise = k_h == 1 and k_w == 1 is_equal_stride = stride_h == stride_w is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( k_h > 1 and k_w == 1 and ifm_dim_w == 1 ) - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - assert is_square_image, ( - "%s : DownSampler currently only supports square input images." - % n.name - ) - assert is_equal_stride, ( - """%s : DownSampler currently only supports equal stride value - along different axes.""" - % n.name - ) - ConvInpGen_idim = ConvInpGen_idim_h - stride = stride_h - # create DownSampler node + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + if is_kernel_pointwise: + is_rtl_variant_compatible = False + if self.use_rtl_variant: + warnings.warn( + """%s : RTL ConvInpGen requested for unsupported + configuration. Falling back to HLS implementation.""" + % n.name + ) + + if self.use_rtl_variant and is_rtl_variant_compatible: + ConvInpGen_node = helper.make_node( - "DownSampler", + "ConvolutionInputGenerator_rtl", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ImgDim=ConvInpGen_idim, - NumChannels=ifm_ch, + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], SIMD=ifm_ch, - Stride=stride, + M=1, + parallel_window=0, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], inputDataType=dt.name, - name="DownSampler_" + n.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_rtl_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: - # create equivalent ConvolutionInputGenerator node - if ( - is_square_image and is_square_kernel - ): # square images and square kernels - assert is_equal_stride, ( - """%s: Non-equal strides along different axes is not supported - for (non-)square convolutions""" - % n.name - ) - assert dilation_h == 1 and dilation_w == 1, ( - """%s: Dilation value != 1 is not supported - for square convolutions""" - % n.name + # Ensure that only supported HLS nodes are inserted + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + is1D_unitx = ifm_dim_w == 1 + downsample_2D = ( + (not downsample_1D) and is_square_image and is_equal_stride ) + if not (downsample_1D or downsample_2D): + warnings.warn( + f"Couldn't infer Downsample from {n.name},check config." 
+ ) + continue + ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) + stride = max(stride_h, stride_w) + # create DownSampler node ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", + "DownSampler", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], + Stride=stride, inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_" + n.name, + name="DownSampler_" + n.name, + is1D=downsample_1D, + is1D_unitx=is1D_unitx, ) - else: # 1D images and/or kernels - assert is_1d_convolution, ( - "%s: ConvolutionInputGenerator1D works only for 1D convs" - % n.name - ) - if dilation_h > 1 or dilation_w > 1: - assert depthwise == 1, ( - """%s: Dilation value > 1 is only supported for - 1D depthwise separable convolutions""" + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + else: + # create equivalent ConvolutionInputGenerator node + if ( + is_square_image and is_square_kernel + ): # square images and square kernels + assert is_equal_stride, ( + """%s: Non-equal strides along different axes is not supported + for (non-)square convolutions""" % n.name ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator1D", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator1D_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + assert dilation_h == 1 and dilation_w == 1, ( + """%s: Dilation value != 1 is not supported + for square convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_" + n.name, + ) + else: # 1D images and/or kernels + assert is_1d_convolution, ( + """%s: ConvolutionInputGenerator1D works only + for 1D convs""" + % n.name + ) + if dilation_h > 1 or dilation_w > 1: + assert depthwise == 1, ( + """%s: Dilation value > 1 is only supported for + 1D depthwise separable convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator1D", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator1D_" + n.name, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) 
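The DownSampler eligibility test above distinguishes the 1D and 2D cases purely from the feature-map geometry; a sketch, assuming a (1, 32) input with stride 2 along the width:

    ifm_dim_h, ifm_dim_w = 1, 32  # assumed 1D feature map
    stride_h, stride_w = 1, 2
    downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1)
    is1D_unitx = ifm_dim_w == 1   # distinguishes (D, 1) from (1, D) layouts
    is_square = ifm_dim_h == ifm_dim_w
    downsample_2D = (not downsample_1D) and is_square and (stride_h == stride_w)
    assert downsample_1D and not is1D_unitx and not downsample_2D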
# remove old nodes graph.node.remove(n) graph_modified = True @@ -870,6 +913,10 @@ class InferVectorVectorActivation(Transformation): a depthwise convolution. Any immediately following MultiThreshold layers will also be absorbed into the VVAU.""" + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + def apply(self, model): graph = model.graph node_ind = 0 @@ -970,6 +1017,7 @@ class InferVectorVectorActivation(Transformation): ActVal=actval, noActivation=0, name="VectorVectorActivation_" + n.name, + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -1671,3 +1719,95 @@ class InferConcatLayer(Transformation): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferStreamingEltwise(Transformation): + """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer + with SubEltwise or AbsDiffEltwise op.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Sub": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) + + # skip conversion for layers with float input + if not (idt0.is_integer() and idt1.is_integer()): + continue + + eltwiseOp = "Sub" + nodes_to_remove = [node] + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + + # create and insert new Eltwise node + new_node = helper.make_node( + "StreamingEltwise", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType0=idt0.name, + inputDataType1=idt1.name, + eltwiseOp=eltwiseOp, + numInputVectors=in0_shape[:-1], + name="StreamingEltwise_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old nodes + for nd in nodes_to_remove: + graph.node.remove(nd) + graph_modified = True + + # if graph_modified: + # model = model.transform(InferShapes()) + # model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 5b0b0cb600ca10564db00bb6d57bcd19a4f49bb6..00e2cc3bb48bcb8b81ba4750382178a4e508bec6 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -291,7 +291,7 @@ class CreateStitchedIP(Transformation):
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name
             )
             self.connect_cmds.append(
-                "set_property name s_axis_info [get_bd_intf_ports s_axi_0]"
+                "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]"
             )
             self.connect_cmds.append("assign_bd_address")
 
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index a589cb039c825ff97c11df7ffa57109df27f3fd0..f48566326e576f4d39d81359fe7f28a12645a635 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -45,7 +45,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.util.basic import make_build_dir, pynq_part_map
+from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map
 
 from . import templates
 
@@ -320,6 +320,7 @@ class ZynqBuild(Transformation):
     ):
         super().__init__()
         self.fpga_part = pynq_part_map[platform]
+        self.axi_port_width = pynq_native_port_width[platform]
         self.period_ns = period_ns
         self.platform = platform
         self.enable_debug = enable_debug
@@ -330,7 +331,7 @@ class ZynqBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            InsertIODMA(64),
+            InsertIODMA(self.axi_port_width),
             InsertDWC(),
             Floorplan(),
             CreateDataflowPartition(partition_model_dir=self.partition_model_dir),
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 23943084ab99d6ab880a69975e0b4a49756905a7..e24e24f1f8ebb2873c81617884cd333311d8aea9 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -109,6 +109,7 @@ class SetFolding(Transformation):
             "FMPadding_Batch",
             "ConvolutionInputGenerator",
             "ConvolutionInputGenerator1D",
+            "ConvolutionInputGenerator_rtl",
         ]
         # these ops are preceded by depthwise SWG and have special behavior,
         # as explained in the SetFolding docstring
@@ -171,10 +172,7 @@ class SetFolding(Transformation):
                     "Expected SWU on DW op input, found " + swu_node.op_type
                 )
             elif op_type in simd_ops:
-                if op_type in [
-                    "ConvolutionInputGenerator",
-                    "ConvolutionInputGenerator1D",
-                ]:
+                if op_type.startswith("ConvolutionInputGenerator"):
                     depthwise = node_inst.get_nodeattr("depthwise")
                     if depthwise == 0:
                         max_simd = node_inst.get_nodeattr("IFMChannels")
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 3e815c1537353cc2be970a2068d4ded30cc48bc8..29eefacc32370598ddcd39283d022f5eb61f3f0c 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -725,6 +725,77 @@ class MakeMaxPoolNHWC(Transformation):
         return (model, graph_modified)
 
 
+class MakeScaleResizeNHWC(Transformation):
+    """
+    Converts the inputs and outputs of all scale-based Resize and Upsample
+    nodes from NCHW to NHWC.
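+
+    The surrounding Transpose nodes are moved past the Resize/Upsample node
+    and the scales initializer is permuted to match, so the node ends up
+    operating directly on NHWC tensors.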
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                if model.get_tensor_layout(n.input[0]) != DataLayout.NCHW:
+                    warnings.warn(
+                        "%s: Input is not NCHW, skipping this node."
+                        % n.name
+                    )
+                    continue
+                consumer = model.find_consumer(n.output[0])
+                producer = model.find_producer(n.input[0])
+                if n.op_type == "Upsample":
+                    scales_ind = 1
+                else:
+                    scales_ind = 2
+                if producer is not None and producer.op_type == "Transpose":
+                    perms = list(get_by_name(producer.attribute, "perm").ints)
+                    if perms == [0, 3, 1, 2]:
+                        old_value = model.get_initializer(n.input[scales_ind])
+                        new_value = np.array(
+                            [old_value[idx] for idx in (0, 2, 3, 1)],
+                            dtype=np.dtype("float32"),
+                        )
+                        model.set_initializer(n.input[scales_ind], new_value)
+                        start_name = producer.input[0]
+                        mid_name = n.input[0]
+                        end_name = n.output[0]
+                        (b, hi, wi, c) = model.get_tensor_shape(start_name)
+                        (b, c, ho, wo) = model.get_tensor_shape(end_name)
+                        producer.input[0] = mid_name
+                        producer.output[0] = end_name
+                        n.input[0] = start_name
+                        n.output[0] = mid_name
+                        model.set_tensor_shape(mid_name, (b, ho, wo, c))
+                        model.set_tensor_shape(end_name, (b, c, ho, wo))
+                        graph.node.remove(producer)
+                        graph.node.insert(node_ind, producer)
+                elif consumer is not None and consumer.op_type == "Transpose":
+                    perms = list(get_by_name(consumer.attribute, "perm").ints)
+                    if perms == [0, 2, 3, 1]:
+                        old_value = model.get_initializer(n.input[scales_ind])
+                        new_value = np.array(
+                            [old_value[idx] for idx in (0, 2, 3, 1)],
+                            dtype=np.dtype("float32"),
+                        )
+                        model.set_initializer(n.input[scales_ind], new_value)
+                        start_name = n.input[0]
+                        mid_name = consumer.input[0]
+                        end_name = consumer.output[0]
+                        (b, c, hi, wi) = model.get_tensor_shape(start_name)
+                        (b, c, ho, wo) = model.get_tensor_shape(mid_name)
+                        consumer.input[0] = start_name
+                        consumer.output[0] = mid_name
+                        n.input[0] = mid_name
+                        n.output[0] = end_name
+                        model.set_tensor_shape(mid_name, (b, hi, wi, c))
+                        model.set_tensor_shape(end_name, (b, ho, wo, c))
+                        graph.node.remove(consumer)
+                        graph.node.insert(node_ind - 1, consumer)
+        return (model, False)
+
+
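To make the scales handling above concrete, a small stand-alone sketch of the
permutation applied to the initializer (the values are illustrative, not taken
from the patch):

    import numpy as np

    # per-axis Resize/Upsample scales; NCHW -> NHWC reorders (N, C, H, W)
    # to (N, H, W, C), i.e. index map (0, 2, 3, 1)
    scales_nchw = np.array([1.0, 1.0, 2.0, 3.0], dtype=np.float32)
    scales_nhwc = scales_nchw[[0, 2, 3, 1]]
    assert (scales_nhwc == np.array([1.0, 2.0, 3.0, 1.0], dtype=np.float32)).all()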
 class MoveOpPastFork(Transformation):
     """Move node operations past graph forks.
     Used when a node before a fork can be merged with nodes in the branches
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index f5d3b1c30b8b7b439eae1c684ad84b33a3401c7c..bfe4aa0bb826c73f6a7c67f025e24764da8c36cc 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -180,6 +180,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=False):
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", child_path)
+    sdp_node.set_nodeattr("return_full_exec_context", 1 if return_full_ctx else 0)
     ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
     if return_full_ctx:
         return ret
diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
index 5bbaefac2d3e5f800fbb9471df6469235271c2f3..7b3e20616410f54e4718290baec9a510a0d49c0d 100644
--- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
@@ -66,11 +66,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
     ],
 )
 @pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_rtl_swg", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
+def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode):
     pad, kernel_size, stride, dilation = conv_config
     np.random.seed(0)
     idt = DataType["UINT4"]
@@ -84,6 +85,9 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     pad_h = pad[0] + pad[2]
     pad_w = pad[1] + pad[3]
 
+    if use_rtl_swg and exec_mode == "cppsim":
+        pytest.skip("cppsim not supported for RTL SWG")
+
     if depthwise is True:
         group = out_chn = in_chn
         conv_param_shape = [out_chn, 1, k_h, k_w]
@@ -139,7 +143,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     model = model.transform(InferDataTypes())
 
     new_model = model.transform(LowerConvsToMatMul())
-    new_model = new_model.transform(to_hls.InferConvInpGen())
+    new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg))
     if depthwise is True:
         new_model = new_model.transform(to_hls.InferVectorVectorActivation())
     else:
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index 55dc77cafb898ead28a7cbb9641e0b40db276919..8c9f110c315089ec03354863bf2213963197217a 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -57,11 +57,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
     "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)]
 )
 @pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_rtl_swg", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
+def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode):
     kernel_size, stride, pad = conv_config
     np.random.seed(0)
     idt = DataType["UINT4"]
@@ -69,6 +70,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     in_feature_dim = 7
     in_chn = 16
 
+    if use_rtl_swg and exec_mode == "cppsim":
+        
pytest.skip("cppsim not supported for RTL SWG") + + if use_rtl_swg and kernel_size == 1: + pytest.skip("1x1 kernel not supported by current RTL SWG") + if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, kernel_size, kernel_size] @@ -122,7 +129,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen()) + new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) if depthwise is True: new_model = new_model.transform(to_hls.InferVectorVectorActivation()) else: @@ -156,6 +163,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): x = gen_finn_dt_tensor(idt, input_shape) inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) + if kernel_size == 1 and stride > 1 and pad == 0: assert new_model.graph.node[1].op_type == "DownSampler" if exec_mode == "rtlsim": diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py new file mode 100755 index 0000000000000000000000000000000000000000..007360a5fd0b74ee49d54c84f332061dd5f3a114 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -0,0 +1,260 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
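For orientation before the new RTL SWG test: the RTL variant is enabled through
InferConvInpGen, exactly as in the updated conv-layer tests above. A minimal
sketch, assuming a lowered-conv ModelWrapper named "model":

    import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
    from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul

    model = model.transform(LowerConvsToMatMul())
    # use_rtl_variant=True infers ConvolutionInputGenerator_rtl where legal;
    # only rtlsim is supported for this variant, hence the cppsim skips above
    model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))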
+ +import pytest + +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + im2col_node = helper.make_node( + "Im2Col", + ["inp"], + ["outp"], + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], + pad_amount=[0, 0, 0, 0], + pad_value=0, + ) + graph = helper.make_graph( + nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="im2col-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def make_single_slidingwindow_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 +): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + SlidingWindow_node = helper.make_node( + "ConvolutionInputGenerator_rtl", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ifm_dim_h, ifm_dim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=simd, + M=m, + parallel_window=parallel_window, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=idt.name, + outputDataType=odt.name, + depthwise=dw, + ) + graph = helper.make_graph( + nodes=[SlidingWindow_node], + name="slidingwindow_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="slidingwindow-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["UINT4"]]) +# kernel size +@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]]) +# input dimension +@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [6]) +# Stride +@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) +# Dilation +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) 
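+# dw=1 exercises the depthwise variant; its output ordering differs from the
+# Im2Col reference and is rearranged at the end of the test before comparison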
+# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1, 2, 3, 6]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_slidingwindow_rtl( + idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip +): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) + if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( + k_w == 1 and (stride_w != 1 or dilation_w != 1) + ): + pytest.skip( + """Illegal convolution configuration: + stride or dilation defined for unitary kernel dim""" + ) + if k_h == 1 and k_w == 1 and simd != ifm_ch: + pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)") + if parallel_window and simd != ifm_ch: + pytest.skip("Parallel window requires SIMD=C") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + model = make_single_slidingwindow_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + simd=simd, + m=m, + parallel_window=parallel_window, + stride=stride, + dilation=dilation, + idt=idt, + dw=dw, + ) + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareRTLSim()) + + # prepare input data + input_dict = prepare_inputs(x) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + golden = make_single_im2col_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + stride=stride, + dilation=dilation, + idt=idt, + ) + y_expected = oxe.execute_onnx(golden, input_dict)["outp"] + + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape( + 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd + ) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) + assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py new file mode 100644 index 0000000000000000000000000000000000000000..64da0a2368a69d6037c681d88391eef2844dae2c --- /dev/null +++ 
b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022, Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import onnx.parser as oprs +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False): + np.random.seed(0) + out_dim = compute_conv_output_dim(in_dim, k, stride, 2 * pad_half) + ifm = 8 + ofm = 16 + if is_1d: + if flip_1d: + shape_in = [1, ifm, 1, in_dim] + shape_out = [1, ofm, 1, out_dim] + shape_k = [1, k] + shape_s = [1, stride] + shape_p = [0, pad_half, 0, pad_half] + else: + shape_in = [1, ifm, in_dim, 1] + shape_out = [1, ofm, out_dim, 1] + shape_k = [k, 1] + shape_s = [stride, 1] + shape_p = [pad_half, 0, pad_half, 0] + else: + shape_in = [1, ifm, in_dim, in_dim] + shape_out = [1, ofm, out_dim, out_dim] + shape_k = [k, k] + shape_s = [stride, stride] + shape_p = [pad_half, pad_half, pad_half, pad_half] + shape_w = [ofm, ifm] + shape_k + + sstr_in = str(shape_in) + 
sstr_out = str(shape_out) + sstr_k = str(shape_k) + sstr_s = str(shape_s) + sstr_p = str(shape_p) + sstr_w = str(shape_w) + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{sstr_in} in0) => (float{sstr_out} out0) + < + float{sstr_w} param_w_conv0 + > + {{ + out0 = Conv<kernel_shape={sstr_k}, group=1, pads={sstr_p}, + strides={sstr_s}>(in0, param_w_conv0) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model.set_tensor_datatype("in0", dt_in) + model.set_tensor_datatype("param_w_conv0", dt_w) + model.set_initializer("param_w_conv0", gen_finn_dt_tensor(dt_w, shape_w)) + model = model.transform(InferShapes()) + model = model.transform(LowerConvsToMatMul()) + model = model.transform(InferShapes()) + return model + + +@pytest.mark.parametrize("is_1d", [True, False]) +@pytest.mark.parametrize("flip_1d", [True, False]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): + if flip_1d and not is_1d: + pytest.skip("flip_1d only applicable for is_1d") + in_dim = 32 + k = 1 + stride = 2 + dt_in = DataType["UINT8"] + dt_w = DataType["INT2"] + model = build_model( + is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d + ) + inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0")) + idict = {"in0": inp} + y_expected = execute_onnx(model, idict)["out0"] + model = model.transform(to_hls.InferConvInpGen()) + assert len(model.get_nodes_by_op_type("DownSampler")) == 1 + if exec_mode == "cppsim": + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all() + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("DownSampler")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + # small adjustment for 2D testcase due to how rtlsim works: + # output is finished before all pixels are read, since last + # row is dropped (rtlsim finishes based on # of expected + # pixels) + if not is_1d: + exp_cycles = exp_cycles - in_dim + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py new file mode 100644 index 0000000000000000000000000000000000000000..6028a9b9f0fb4a04d0f53fd8c4fae3aac3ae686e --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import onnx.parser as oprs +import qonnx.core.data_layout as dl +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def build_model(shp, dt0, dt1, do_abs): + np.random.seed(0) + shp_str = str(shp) + if do_abs: + graph = """ + sub_out = Sub(in0, in1) + out0 = Abs(sub_out) + """ + else: + graph = "out0 = Sub(in0, in1)" + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0, float{shp_str} in1) => (float{shp_str} out0) + {{ + {graph} + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model.set_tensor_datatype("in0", dt0) + model.set_tensor_datatype("in1", dt1) + model.set_tensor_layout("in0", dl.NHWC) + model.set_tensor_layout("in1", dl.NHWC) + model = model.transform(InferShapes()) + return model + + +# input datatype for one operand +@pytest.mark.parametrize("dt0", [DataType["UINT4"], DataType["UINT7"]]) +# channels +@pytest.mark.parametrize("ch", [1, 64]) +# folding +@pytest.mark.parametrize("fold", [-1, 2, 1]) +# include Abs output node or not +@pytest.mark.parametrize("do_abs", [True, False]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): + if fold == -1: + pe = 1 + else: + pe = max(1, ch // fold) + assert ch % pe == 0 + dt1 = DataType["UINT8"] + shp = [1, 4, 2, ch] + model = 
build_model(shp, dt0, dt1, do_abs) + in0 = gen_finn_dt_tensor(dt0, shp) + in1 = gen_finn_dt_tensor(dt1, shp) + idict = {"in0": in0, "in1": in1} + y_expected = execute_onnx(model, idict)["out0"] + model = model.transform(to_hls.InferStreamingEltwise()) + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "StreamingEltwise" + getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all(), exec_mode + " failed" + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("StreamingEltwise")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index c48448787d8a3bb926c1e94850be6e99e8c106d3..03ddb1286320b8178276ea53082095106a43d7a1 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -75,7 +75,19 @@ def _calculate_dot_prod_range(dt_a, dt_b, len): def _make_single_vvau_modelwrapper( - W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None + W, + pe, + k_h, + k_w, + channels, + dim_h, + dim_w, + wdt, + idt, + odt, + T=None, + tdt=None, + mem_mode="const", ): in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] out_shape = [ @@ -113,6 +125,7 @@ def _make_single_vvau_modelwrapper( weightDataType=wdt.name, outputDataType=odt.name, noActivation=no_act, + mem_mode=mem_mode, ) graph = helper.make_graph( @@ -140,7 +153,7 @@ def prepare_inputs(input_tensor): return {"inp": input_tensor} -# mem_mode: const or decoupled +# input datatype @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) @@ -156,13 +169,15 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("k_w", [3, 1]) # Number of input and output channels @pytest.mark.parametrize("channels", [3, 4]) +# memory mode +@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_vvau( - idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode + idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode ): if pe == "channels": pe = channels @@ -198,7 +213,7 @@ def test_fpgadataflow_vvau( tdt = DataType["INT32"] model = _make_single_vvau_modelwrapper( - W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt + W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode ) if exec_mode == "cppsim": diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py b/tests/transformation/streamline/test_scale_resize_nhwc.py new file mode 100644 index 
0000000000000000000000000000000000000000..f10930f4e7d5aeb98a60630e7e4f48adfc371d59 --- /dev/null +++ b/tests/transformation/streamline/test_scale_resize_nhwc.py @@ -0,0 +1,293 @@ +import pytest + +import numpy as np +import onnx +import onnx.helper as oh +import qonnx.core.data_layout as DataLayout +from onnx import TensorProto +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.streamline.reorder import MakeScaleResizeNHWC + + +def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp_up = oh.make_tensor_value_info( + "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] + ) + + resize_node = oh.make_node( + "Resize", + inputs=["inp", "roi", "scales"], + outputs=["outp_up"], + name="Resize1", + mode=mode, + ) + + transpose_node = onnx.helper.make_node( + "Transpose", + inputs=["outp_up"], + outputs=["outp"], + name="Transpose1", + perm=[0, 2, 3, 1], + ) + + graph = oh.make_graph( + nodes=[resize_node, transpose_node], + name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_up, param, roi], + ) + + model = oh.make_model(graph, producer_name="resize_model1") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + + model.set_tensor_layout("inp", DataLayout.NCHW) + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + + return model + + +def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch] + ) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp_tr = oh.make_tensor_value_info( + "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + + transpose_node = onnx.helper.make_node( + "Transpose", + inputs=["inp"], + outputs=["outp_tr"], + name="Transpose1", + perm=[0, 3, 1, 2], + ) + + resize_node = oh.make_node( + "Resize", + inputs=["outp_tr", "roi", "scales"], + outputs=["outp"], + name="Resize1", + mode=mode, + ) + + graph = oh.make_graph( + nodes=[transpose_node, resize_node], + name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_tr, param, roi], + ) + + model = oh.make_model(graph, producer_name="resize_model2") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + model.set_tensor_layout("inp", DataLayout.NHWC) + + model = 
model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]
+    )
+
+    # the scales input is a 1D tensor of four floats, as in the other builders
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp_tr = oh.make_tensor_value_info(
+        "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    outp_up = oh.make_tensor_value_info(
+        "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    transpose_node1 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["inp"],
+        outputs=["outp_tr"],
+        name="Transpose1",
+        perm=[0, 3, 1, 2],
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["outp_tr", "roi", "scales"],
+        outputs=["outp_up"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    transpose_node2 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["outp_up"],
+        outputs=["outp"],
+        name="Transpose2",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[transpose_node1, resize_node, transpose_node2],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_up, outp_tr, param, roi],
+    )
+
+    model = oh.make_model(graph, producer_name="resize_model3")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+    model.set_tensor_layout("inp", DataLayout.NHWC)
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def check_transform(model):
+    graph = model.graph
+    node_ind = 0
+    for n in graph.node:
+        node_ind += 1
+        if n.op_type == "Upsample" or n.op_type == "Resize":
+            if model.get_tensor_layout(n.output[0]) == DataLayout.NHWC:
+                return True
+    return False
+
+
+@pytest.mark.streamline
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[2**i, 2**i] for i in range(3, 6)])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [3])
+# scales
+@pytest.mark.parametrize(
+    "scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)]
+)
+# mode
+@pytest.mark.parametrize("mode", ["nearest"])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt):
+    # create models
+    resize_model1 = create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt)
+    resize_model2 = create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt)
+    resize_model3 = create_transpose_resize_transpose(
+        ifm_dim, ifm_ch, scales, mode, idt
+    )
+
+    # set initializers
+    resize_model1.set_initializer("scales", np.array(scales, dtype=np.float32))
+    resize_model2.set_initializer("scales", np.array(scales, dtype=np.float32))
+    resize_model3.set_initializer("scales", np.array(scales, dtype=np.float32))
+
+    # generate input tensor for testing
+    input_tensor_nchw = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]])
+    input_tensor_nhwc = gen_finn_dt_tensor(idt, [1, ifm_dim[0], ifm_dim[1], ifm_ch])
+    input_dict_nchw = {"inp": input_tensor_nchw}
+    input_dict_nhwc = {"inp": input_tensor_nhwc}
+
+    # execute first model
+    output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw)
+    expected1 = output_dict1["outp"]
+
+    # convert the Resize subgraph to NHWC using MakeScaleResizeNHWC
+    resize_model1 = resize_model1.transform(MakeScaleResizeNHWC())
+    resize_model1 = resize_model1.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name1 = resize_model1.graph.output[0].name
+    output_dict1 = oxe.execute_onnx(
+        resize_model1, input_dict_nchw, return_full_exec_context=False
+    )
+    output1 = output_dict1[output_node_name1]
+
+    # compare outputs
+    assert (expected1 == output1).all()
+    assert check_transform(resize_model1)
+
+    # execute second model
+    output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc)
+    expected2 = output_dict2["outp"]
+
+    # convert the Resize subgraph to NHWC using MakeScaleResizeNHWC
+    resize_model2 = resize_model2.transform(MakeScaleResizeNHWC())
+    resize_model2 = resize_model2.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name2 = resize_model2.graph.output[0].name
+    output_dict2 = oxe.execute_onnx(
+        resize_model2, input_dict_nhwc, return_full_exec_context=False
+    )
+    output2 = output_dict2[output_node_name2]
+
+    # compare outputs
+    assert (expected2 == output2).all()
+    assert check_transform(resize_model2)
+
+    # execute third model
+    output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc)
+    expected3 = output_dict3["outp"]
+
+    # convert the Resize subgraph to NHWC using MakeScaleResizeNHWC
+    resize_model3 = resize_model3.transform(MakeScaleResizeNHWC())
+    resize_model3 = resize_model3.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name3 = resize_model3.graph.output[0].name
+    output_dict3 = oxe.execute_onnx(
+        resize_model3, input_dict_nhwc, return_full_exec_context=False
+    )
+    output3 = output_dict3[output_node_name3]
+
+    # compare outputs
+    assert (expected3 == output3).all()
+    assert check_transform(resize_model3)
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
index cdf69aebddc4d6af2288774acbff5dd8a52512b3..39f0b0dc89e9388c54a013becb53d9afbfb2ce4e 100644
--- a/tests/util/test_build_dataflow.py
+++ b/tests/util/test_build_dataflow.py
@@ -30,6 +30,7 @@ import pkg_resources as pk
 
 import pytest
 
+import numpy as np
 import os
 from shutil import copytree
 
@@ -55,7 +56,6 @@ def test_end2end_build_dataflow_directory():
     assert os.path.isfile(output_dir + "/driver/driver.py")
     assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")
     assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json")
-    assert os.path.isfile(output_dir + "/report/verify_rtlsim.vcd")
     assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd")
     assert os.path.isfile(
         output_dir + "/report/estimate_layer_config_alternatives.json"
@@ -68,8 +68,19 @@ def test_end2end_build_dataflow_directory():
     assert os.path.isfile(output_dir + "/report/post_synth_resources.xml")
     assert os.path.isfile(output_dir + "/report/post_route_timing.rpt")
     # verification outputs
-    verify_out_dir = output_dir + "/verification_output"
-    assert os.path.isfile(verify_out_dir + "/verify_initial_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_streamlined_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_folded_hls_cppsim_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_stitched_ip_rtlsim_SUCCESS.npy")
+    verif_batchsize = np.load(target_dir + "/input.npy").shape[0]
+    for i in range(verif_batchsize):
+        verify_out_dir = output_dir + "/verification_output"
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir +
f"/verify_streamlined_python_{i}_SUCCESS.npy" + ) + assert os.path.isfile( + verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy" + ) + assert os.path.isfile( + verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy" + ) + assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd")