diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 2fbb9265beb49644f08a2c6e916ab9c23d4bd339..20f5b48f7acc65ab18702ef2509e9791f919b825 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -13,10 +13,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 - name: Run Lint - uses: pre-commit/action@v2.0.0 + uses: pre-commit/action@v3.0.0 diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index a3f40d52ef6c8a5b79f46c1bb70f83fb61218fc9..9c18c03d7bdb8406d43aa8fc4efdb8a206b1217e 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -46,7 +46,6 @@ RUN apt-get update && \ libsm6 \ libxext6 \ libxrender-dev \ - verilator \ nano \ zsh \ rsync \ @@ -62,6 +61,16 @@ RUN apt-get update && \ RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config RUN locale-gen "en_US.UTF-8" +# install Verilator from source to get the right version +RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev +RUN git clone https://github.com/verilator/verilator +RUN cd verilator && \ + git checkout v4.012 && \ + autoconf && \ + ./configure && \ + make -j4 && \ + make install + # install XRT RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb RUN apt install -y /tmp/$XRT_DEB_VERSION.deb diff --git a/fetch-repos.sh b/fetch-repos.sh index fe585b219099cf201c342256c189a98a2dc680d4..10b6b332550be5d914d80e242f01e77daeaf08a0 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,12 +27,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="398a0ecfcb32407c0a3df39246cf6d2bca02886c" +QONNX_COMMIT="92184fea2dd417bc7a53c82811fef271e4833c4c" FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="8d582d0992b17866447a943fa93301bc15f92104" +HLSLIB_COMMIT="e7f2de91d1a2ddadaaea06b8f4c20e97a575470e" OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv new file mode 100644 index 0000000000000000000000000000000000000000..97517438a0c261e4488b74a677a352f9dc51743b --- /dev/null +++ b/finn-rtllib/swg/swg_template_default.sv @@ -0,0 +1,351 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +module $TOP_MODULE_NAME$_controller #( + int unsigned LOOP_H_ITERATIONS = $LOOP_H_ITERATIONS$, + int unsigned LOOP_W_ITERATIONS = $LOOP_W_ITERATIONS$, + int unsigned LOOP_KH_ITERATIONS = $LOOP_KH_ITERATIONS$, + int unsigned LOOP_KW_ITERATIONS = $LOOP_KW_ITERATIONS$, + int unsigned LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$, + + int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$, + bit [INCR_BITWIDTH-1:0] ADDR_INCREMENT_MAP[6] = $ADDR_INCREMENT_MAP$, + + bit IS_DEPTHWISE = $IS_DEPTHWISE$ +)( + input logic clk, + input logic rst_n, + + input logic advance, + output logic [INCR_BITWIDTH-1:0] addr_incr, + output logic [INCR_BITWIDTH-1:0] tail_incr +); + + // state and counters + typedef enum logic [2:0] { + STATE_START, + STATE_LOOP_SIMD, + STATE_LOOP_KW, + STATE_LOOP_KH, + STATE_LOOP_W, + STATE_LOOP_H + } state_e; + state_e State = $INNERMOST_STATE$; + state_e state_next; + + logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS-1; + logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS-1; + logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS-1; + logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS-1; + logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS-1; + + assign addr_incr = ADDR_INCREMENT_MAP[State]; + + // combinational logic for tail_incr generation + uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; + always_comb begin : blkTail + if (tail_incr_inner_condition) + tail_incr = 1; + else if (Counter_loop_w >= 0) + tail_incr = $TAIL_INCR_W$; + else if (Counter_loop_h >= 0) + tail_incr = $TAIL_INCR_H$; + else + tail_incr = $TAIL_INCR_LAST$; + end + + // combinational next state logic + always_comb begin : blkState + state_next = State; + if(State != $INNERMOST_STATE$) state_next = $INNERMOST_STATE$; + else begin + if(Counter_loop_simd < 0) begin + state_next = + (Counter_loop_kw >= 0)? STATE_LOOP_KW : + (Counter_loop_kh >= 0)? STATE_LOOP_KH : + (Counter_loop_w >= 0)? STATE_LOOP_W : + (Counter_loop_h >= 0)? 
STATE_LOOP_H : + /* else */ STATE_START; + end + end + end : blkState + + // sequential logic + always_ff @ (posedge clk) begin + if(!rst_n) begin + State <= $INNERMOST_STATE$; + Counter_loop_h <= LOOP_H_ITERATIONS-1; + Counter_loop_w <= LOOP_W_ITERATIONS-1; + Counter_loop_kh <= LOOP_KH_ITERATIONS-1; + Counter_loop_kw <= LOOP_KW_ITERATIONS-1; + Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1; + end + else if(advance) begin + State <= state_next; + if (State == $INNERMOST_STATE$) begin + if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1; + else begin + Counter_loop_simd <= LOOP_SIMD_ITERATIONS-1; + if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1; + else begin + Counter_loop_kw <= LOOP_KW_ITERATIONS-1; + if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1; + else begin + Counter_loop_kh <= LOOP_KH_ITERATIONS-1; + if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1; + else begin + Counter_loop_w <= LOOP_W_ITERATIONS-1; + if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1; + else Counter_loop_h <= LOOP_H_ITERATIONS-1; + end + end + end + end + end + end + end + +endmodule : $TOP_MODULE_NAME$_controller + +module $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + int unsigned WIDTH, + int unsigned DEPTH +)( + input logic clk, + input logic rst_n, + + input logic write_enable, + input logic [$clog2(DEPTH)-1:0] write_addr, + input logic [WIDTH-1:0] data_in, + + input logic read_enable, + input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) read address of cyclic buffer + output logic [WIDTH-1:0] data_out +); + + $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH]; + logic [WIDTH-1:0] Out = 'x; + always_ff @(posedge clk) begin + if (read_enable) Out <= Ram[read_addr]; + if (write_enable) Ram[write_addr] <= data_in; + end + assign data_out = Out; + +endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable + +module $TOP_MODULE_NAME$_impl #( + int BIT_WIDTH, + int SIMD, + int MMV_IN, + int MMV_OUT, + int LAST_READ_ELEM = $LAST_READ_ELEM$, + int LAST_WRITE_ELEM = $LAST_WRITE_ELEM$, + int BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$, + int ELEM_PER_WINDOW = $ELEM_PER_WINDOW$, + int INCR_BITWIDTH = $INCR_BITWIDTH$ +)( + input logic ap_clk, + input logic ap_rst_n, + + input logic in0_V_V_TVALID, + output logic in0_V_V_TREADY, + input logic [BIT_WIDTH * SIMD * MMV_IN-1:0] in0_V_V_TDATA, + + output logic out_V_V_TVALID, + input logic out_V_V_TREADY, + output logic [BIT_WIDTH * SIMD * MMV_OUT-1:0] out_V_V_TDATA +); + // derived Constants + localparam int unsigned BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; + localparam int unsigned BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; + localparam int unsigned BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + + // main buffer instantiation + uwire [BUF_IN_WIDTH -1:0] window_buffer_in; + uwire [BUF_OUT_WIDTH-1:0] window_buffer_out; + uwire window_buffer_write_enable; + uwire window_buffer_read_enable; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr; + $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + .WIDTH(BUF_IN_WIDTH), + .DEPTH(BUF_ELEM_TOTAL) + ) window_buffer_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + + .write_enable(window_buffer_write_enable), + .write_addr(window_buffer_write_addr), + .data_in(window_buffer_in), + + .read_enable(window_buffer_read_enable), + .read_addr(window_buffer_read_addr), + .data_out(window_buffer_out) + ); + + //controller instantiation + uwire advance_controller; + uwire signed [INCR_BITWIDTH-1:0] addr_incr; + uwire [INCR_BITWIDTH-1:0] 
tail_incr;
+    $TOP_MODULE_NAME$_controller controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr)
+    );
+
+    // Counters/address registers
+    // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg,
+    // so we can use automatic sign extension and simplify calculations w/ signed increment.
+    // Alternatively, we could manually sign-extend and shave off a bit here or there.
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0]  Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  Current_elem = 0;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  First_elem_next_window = 0;
+    logic        [$clog2(ELEM_PER_WINDOW)   -1:0]  Position_in_window = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)+1  -1:0]  Window_buffer_read_addr_reg = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)-1:0]      Window_buffer_write_addr_reg = 0;
+
+    // Control signals/registers
+    uwire read_cmd =
+        !reading_done && ( // if there is still an input element left to read
+            Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride)
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) &&
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem)
+            ) // (over-)write to buffer if oldest buffered element will no longer be needed
+        );
+    uwire read_ok      = read_cmd && in0_V_V_TVALID;
+    uwire reading_done = Newest_buffered_elem == LAST_READ_ELEM;
+
+    uwire fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done;
+    logic Fetching_done = 0;
+
+    logic Write_cmd    = 0;
+    logic Writing_done = 0;
+    uwire write_ok      = Write_cmd &&  out_V_V_TREADY;
+    uwire write_blocked = Write_cmd && !out_V_V_TREADY;
+
+    // assign buffer control
+    assign window_buffer_write_addr = Window_buffer_write_addr_reg;
+    assign window_buffer_read_addr = Window_buffer_read_addr_reg;
+    assign window_buffer_write_enable = read_ok;
+    assign window_buffer_read_enable = fetch_cmd;
+    assign advance_controller = fetch_cmd;
+
+    // assign I/O ports
+    assign window_buffer_in = in0_V_V_TDATA;
+    assign out_V_V_TDATA = window_buffer_out;
+    assign in0_V_V_TREADY = ap_rst_n && read_ok; // only asserted if data is available and we can store it (allowed)
+    assign out_V_V_TVALID = ap_rst_n && Write_cmd; // only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    // main process for advancing counters
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Newest_buffered_elem <= -1;
+            Current_elem <= 0;
+            First_elem_next_window <= 0;
+            Position_in_window <= 0;
+            Window_buffer_read_addr_reg <= 0;
+            Window_buffer_write_addr_reg <= 0;
+            Fetching_done <= 0;
+            Write_cmd <= 0;
+            Writing_done <= 0;
+        end
+        else begin
+            if (read_ok) begin
+                Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)?
0 : Window_buffer_write_addr_reg + 1; + Newest_buffered_elem <= Newest_buffered_elem+1; + + if (Newest_buffered_elem == LAST_READ_ELEM-1) begin + Window_buffer_write_addr_reg <= 0; + end + //check if this is the last read cycle (reading_done will be true afterwards) + if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin + //start processing of next FM if writing is done already (possible due to unused input elements at the tail end) + //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM) + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Writing_done <= 0; + Fetching_done <= 0; + end + end + + if (fetch_cmd) begin + //count up to track which element index is about to be read from the buffer, and where it is located within the buffer + //use increment value calculated by controller + + // absolute buffer address wrap-around + automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0] ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr); + automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0] ra_correct = + (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL : + (ra < 0)? BUF_ELEM_TOTAL : 0; + Window_buffer_read_addr_reg <= ra + ra_correct; + + //keep track where we are within a window + Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0; + + //update first element of next window to allow buffer overwrite up until that point + if (Position_in_window == 0) + First_elem_next_window <= First_elem_next_window + tail_incr; + + //check if this is the last write cycle (Writing_done will be true afterwards) + if (Current_elem == LAST_WRITE_ELEM) + Fetching_done <= 1; + else + Current_elem <= $signed(Current_elem) + addr_incr; + + // determine if prefetched data will be outstanding in the next cycle + // if we fetch in this cycle -> yes + // if we do not fetch nor write -> do not change + // if we do not fetch but write successfully-> clear outstanding data + Write_cmd <= fetch_cmd; + end + + if (write_ok) + Write_cmd <= fetch_cmd; + + if (write_ok && Fetching_done) begin + //check if this is the last write cycle (Writing_done will be true afterwards) + if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin + //start processing of next FM if reading is done already, or completes in the same cycle + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Fetching_done <= 0; + end else + Writing_done <= 1; + end + end + end + +endmodule : $TOP_MODULE_NAME$_impl diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v new file mode 100644 index 0000000000000000000000000000000000000000..0cc3579a255fddaf1a470d440b9e8ac245abe486 --- /dev/null +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -0,0 +1,75 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +`timescale 1 ns / 1 ps + +module $TOP_MODULE_NAME$ ( +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_rst_n, +input [BUF_IN_WIDTH-1:0] in0_V_TDATA, +input in0_V_TVALID, +output in0_V_TREADY, +output [BUF_OUT_WIDTH-1:0] out_V_TDATA, +output out_V_TVALID, +input out_V_TREADY +); + +// top-level parameters (set via code-generation) +parameter BIT_WIDTH = $BIT_WIDTH$; +parameter SIMD = $SIMD$; +parameter MMV_IN = $MMV_IN$; +parameter MMV_OUT = $MMV_OUT$; + +// derived constants +parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; +parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + +$TOP_MODULE_NAME$_impl +#( + .BIT_WIDTH(BIT_WIDTH), + .SIMD(SIMD), + .MMV_IN(MMV_IN), + .MMV_OUT(MMV_OUT) +) +impl +( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .in0_V_V_TDATA(in0_V_TDATA), + .in0_V_V_TVALID(in0_V_TVALID), + .in0_V_V_TREADY(in0_V_TREADY), + .out_V_V_TDATA(out_V_TDATA), + .out_V_V_TVALID(out_V_TVALID), + .out_V_V_TREADY(out_V_TREADY) +); + +endmodule //TOP_MODULE_NAME diff --git a/requirements.txt b/requirements.txt index 970acc342bb7984e69929d1ef5eaa027b765ced0..9038a5e8170301421529e0b570482316e4fff20a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ onnx==1.11.0 onnxoptimizer onnxruntime==1.11.1 pre-commit==2.9.2 -protobuf==3.20.1 +protobuf==3.20.2 pyscaffold==3.2.1 scipy==1.5.2 setupext-janitor>=1.1.2 diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 238083f653d410772a81115ff12dd987835d1f32..d6864994a70a0ea4c24567155ff7c0599bc0fb6f 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -155,12 +155,14 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): % (step_name, step_num, len(build_dataflow_steps)) ) # redirect output to logfile - sys.stdout = stdout_logger - sys.stderr = stderr_logger - print( - "Running step: %s [%d/%d]" - % (step_name, step_num, len(build_dataflow_steps)) - ) + if not cfg.verbose: + sys.stdout = stdout_logger + sys.stderr = stderr_logger + # also log current step name to logfile + print( + "Running step: %s [%d/%d]" + % (step_name, step_num, len(build_dataflow_steps)) + ) # run 
the step step_start = time.time() model = transform_step(model, cfg) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 09e9ec3a564dc2b459cd1ea3205e541f922b1af0..e16711f63b954707bc7ad9050dd7627ca1ce99c1 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -258,6 +258,10 @@ class DataflowBuildConfig: #: Which memory mode will be used for compute layers default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED + #: Force inference of RTL ConvolutionInputGenerator over HLS implementation + #: If set to False, falls back to the default behavior of InferConvInpGen() + force_rtl_conv_inp_gen: Optional[bool] = False + #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` #: e.g. "xilinx_u250_xdma_201830_2" @@ -285,6 +289,10 @@ class DataflowBuildConfig: #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = True + #: When True, all warnings and compiler output will be printed in stdout. + #: Otherwise, these will be suppressed and only appear in the build log. + verbose: Optional[bool] = False + #: If given, only run the steps in the list. If not, run default steps. #: See `default_build_dataflow_steps` for the default list of steps. #: When specified: diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 59f77650da5c3c3f9db0ea65e2288544b376bec3..e77f17d7c27f4be08aa6725e5803a1ea566c9443 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -302,7 +302,10 @@ def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): # needed for convolutions -- TODO always exec? 
need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0 if need_conv: - model = model.transform(to_hls.InferConvInpGen()) + if cfg.force_rtl_conv_inp_gen: + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + else: + model = model.transform(to_hls.InferConvInpGen()) model = model.transform(to_hls.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 2c7c86c64ea1279cb18cf8342aa20fb2792bdaf5..e5eb483a00f6890f5eeb16c5cec533a4533c9f15 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -36,8 +36,12 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( ConvolutionInputGenerator1D, ) +from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import ( + ConvolutionInputGenerator_rtl, +) from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch +from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch from finn.custom_op.fpgadataflow.iodma import IODMA @@ -67,6 +71,7 @@ custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D +custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["TLastMarker"] = TLastMarker custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO @@ -85,3 +90,4 @@ custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch custom_op["Lookup"] = Lookup custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum +custom_op["StreamingEltwise"] = StreamingEltwise diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py new file mode 100755 index 0000000000000000000000000000000000000000..399b36e15021af6f449df3e9ba2acdc699a27647 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -0,0 +1,834 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +from math import copysign +from qonnx.core.datatype import DataType +from qonnx.custom_op.general import im2col +from qonnx.custom_op.general.im2col import compute_conv_output_dim + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +# RTL Convolution Input Generator / Sliding Window Generator (SWG) +# Matches and extends the functionality of all ConvolutionInputGenerator_* functions +# in finn-hlslib by generating HDL code for two different implementation styles: +# - Addressable cyclic buffer: to be used when out_width <= in_width +# - Parallel registers + line buffers: to be used when out_width > in_width +# Supports non-square, 1D, strided, dilated, and depthwise convolutions. +# Note: the actual data layout produced is different for depthwise and non-depthwise: +# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) + +# NOTE: "Parallel" implementation style not yet implemented in this version! + + +class ConvolutionInputGenerator_rtl(HLSCustomOp): + """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. 
Generates an RTL ConvolutionInputGenerator
+    implementation based on (System-)Verilog templates, defined in finn-rtllib/swg."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ConvKernelDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "IFMChannels": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "SIMD": ("i", True, 0),
+            # additional parallelization parameter - not yet implemented
+            "M": ("i", False, 1),
+            # alternative implementation style - not yet implemented
+            "parallel_window": ("i", False, 0, {0}),
+            "Stride": ("ints", True, []),  # [H, W] = [Y, X]
+            "Dilation": ("ints", True, []),  # [H, W] = [Y, X]
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0, {0, 1}),
+            # FPGA resource type for ConvolutionInputGenerator input buffer
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        wf = int(ifm_ch / simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
+        return folded_ishape
+
+    def get_normal_output_shape(self):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
+        return oshape
+
+    def get_folded_output_shape(self):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        simd = self.get_nodeattr("SIMD")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        if self.get_nodeattr("parallel_window"):
+            wf = int((ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        return folded_oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for ConvInpGen."
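+        # the output shape is fully determined by the node attributes, so a
+        # constant-shaped stand-in op (provided by the op base class) is all
+        # that ONNX shape inference needs here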
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + in_width = simd * ibits + return in_width + + def get_outstream_width(self): + if self.get_nodeattr("parallel_window"): + # feed all window pixels in parallel + k_h, k_w = self.get_nodeattr("ConvKernelDim") + return self.get_instream_width() * k_h * k_w + else: + # if parallel variant not in use: same width for output and input stream + return self.get_instream_width() + + def get_number_input_values(self): + folded_ishape = self.get_folded_input_shape() + num_input_elems = np.prod(folded_ishape[:-1]) + return num_input_elems + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + num_output_elems = np.prod(folded_oshape[:-1]) + return num_output_elems + + def get_1d_conv_attrs_normalized(self): + # normalize FM dimensions so that: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension. + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + + def get_buffer_depth(self): + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + impl_style = self.select_impl_style() + if impl_style == "default": + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + # add additional buffer space in case of stride > 1 + # this minimizes cycle count as it allows an earlier pre-load of inputs + buffer_depth = ( + buffer_min_size + + max( + 0, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + + max( + 0, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + ) + else: + buffer_depth = 0 + raise Exception("Requested impl. 
style not implemented") + return buffer_depth + + def get_exp_cycles(self): + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h, ofm_dim_w = ofm_dim + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + channel_factor = int(ifm_ch / simd) + + if ifm_dim_h == 1 or ifm_dim_w == 1: + # 1D case + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + if depthwise: + exp_cycles = ( + +ofm_dim_w * k_w * channel_factor + + channel_factor * (k_w - 1) * (stride_w - 1) + - (k_w - 1) + + 2 + ) + else: + exp_cycles = ofm_dim_w * k_w * channel_factor + 2 + else: + # 2D case + buffer_min_size = ( + (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor + cycles_read_block = stride_w * ifm_dim_w * channel_factor + max_cycles = max(cycles_write_block, cycles_read_block) + if depthwise: + max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1) + exp_cycles = buffer_min_size + ofm_dim_h * max_cycles # initial buffering + if depthwise: + exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor + + return int(exp_cycles) + + def bram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + + # NOTE: Actual BRAM usage might be lower in some cases. + # This does not account for the exact Vivado behavior yet. 
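+        # the estimate below assumes RAMB18 aspect ratios (depth x width):
+        # 512x36, 1Kx18, 2Kx9, 4Kx4, 8Kx2 and 16Kx1, cascaded in depth
+        # beyond 16K entries.
+        # rough worked example: SIMD=3 at INT8 gives buffer_width = 24; a
+        # buffer_depth of 1440 selects ram_width = 9, so ceil(24/9) = 3
+        # BRAMs are estimated (cascade depth 1, no cascade savings)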
+ buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + if ram_style == "block" or ram_style == "auto": + if buffer_depth <= 512: + ram_width = 36 + elif buffer_depth <= 1024: + ram_width = 18 + elif buffer_depth <= 2048: + ram_width = 9 + elif buffer_depth <= 4096: + ram_width = 4 + elif buffer_depth <= 8192: + ram_width = 2 + else: + ram_width = 1 + + ram_cascade_depth = math.ceil(buffer_depth / 16384) + ram_cascade_width = math.ceil(buffer_width / ram_width) + cascade_savings = 0 + if buffer_depth > 16384: + remainder_depth = buffer_depth % 16384 + if remainder_depth <= 512: + remainder_width = 36 + elif remainder_depth <= 1024: + remainder_width = 18 + elif remainder_depth <= 2048: + remainder_width = 9 + elif remainder_depth <= 4096: + remainder_width = 4 + elif remainder_depth <= 8192: + remainder_width = 2 + else: + remainder_width = 1 + + remainder_cascade_width = math.ceil(buffer_width / remainder_width) + cascade_savings = ram_cascade_width - remainder_cascade_width + + return int(ram_cascade_depth * ram_cascade_width - cascade_savings) + else: + return 0 + + def lut_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + if ram_style == "distributed": + ram_luts = int(buffer_width * math.ceil(buffer_depth / 38)) + else: + ram_luts = 0 + return 300 + ram_luts + + def uram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + + if ram_style == "ultra": + ram_depth = 4096 + ram_width = 72 + ram_cascade_depth = math.ceil(buffer_depth / ram_depth) + ram_cascade_width = math.ceil(buffer_width / ram_width) + return int(ram_cascade_depth * ram_cascade_width) + else: + return 0 + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + raise Exception( + "cppsim not possible for RTL SWG, please set exec_mode to rtlsim" + ) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" + + def prepare_codegen_default(self): + # Default implementation style for MMV_out = 1: addressable cyclic buffer + # Computing incremental addressing scheme directly.. + template_path = ( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_default.sv" + ) + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. 
dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, wrap logic doesn't account for this" + assert not ( + abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, wrap logic doesn't account for this" + + # set certain threshold indices to detect when reading/writing finishes + code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] + code_gen_dict["$LAST_WRITE_ELEM$"] = [ + str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1) + ] + + # default controller loop structure: # iterations (counters) map directly + loop_h_iterations = out_dim_h + loop_w_iterations = out_dim_w + loop_kh_iterations = k_h + loop_kw_iterations = k_w + loop_simd_iterations = channel_factor + + if depthwise and channel_factor > 1: + # re-arrange existing controller loop structure for depthwise convolutions + loop_kh_iterations = channel_factor + loop_kw_iterations = k_h + loop_simd_iterations = k_w + addr_incr_end_simd_ = addr_incr_end_simd + addr_incr_end_simd = addr_incr_end_window_elem + addr_incr_end_window_elem = addr_incr_end_window_row + addr_incr_end_window_row = addr_incr_end_simd_ + elem_per_window = k_h * k_w + + tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor + tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["1"] + else: + # depthwise output format is equivalent to non-depthwise if SIMD=C + elem_per_window = k_h * k_w * channel_factor + + tail_incr_w = addr_incr_end_window + buffer_min_size - 1 + tail_incr_h = addr_incr_end_row + buffer_min_size - 1 + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["0"] + + code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)] + code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)] + code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)] + + # support SIMD = IFMChannels and k_w = 1 cases + # for k = [k_h, k_w] = [1, k_w], no adjustment is needed + # for k = [k_h, k_w] = [1, 1], do not use this impl. 
style (mmv_out=K=1) + # innermost loop is executed at least once -> adjust if needed + if loop_simd_iterations == 1: + # skip innermost SIMD loop completely + if loop_kw_iterations == 1: + # skip innermost KW loop completely + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"] + loop_kh_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"] + loop_kw_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"] + loop_simd_iterations -= 1 # -1 because state is initial state + + code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 1)] + code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 1)] + code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 1)] + code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 1)] + code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 1)] + + incr_bitwidth = 1 + math.ceil( + math.log2( + max( + abs(addr_incr_end_simd) + 1, + abs(addr_incr_end_window_elem) + 1, + abs(addr_incr_end_window_row) + 1, + abs(addr_incr_end_window) + 1, + abs(addr_incr_end_row) + 1, + abs(tail_incr_w) + 1, + abs(tail_incr_h) + 1, + abs(tail_incr_last_window) + 1, + ) + ) + ) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$ADDR_INCREMENT_MAP$"] = [ + "'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format( + incr_bitwidth, + int(copysign(incr_bitwidth, addr_incr_end_simd)), + abs(addr_incr_end_simd), + int(copysign(incr_bitwidth, addr_incr_end_window_elem)), + abs(addr_incr_end_window_elem), + int(copysign(incr_bitwidth, addr_incr_end_window_row)), + abs(addr_incr_end_window_row), + int(copysign(incr_bitwidth, addr_incr_end_window)), + abs(addr_incr_end_window), + int(copysign(incr_bitwidth, addr_incr_end_row)), + abs(addr_incr_end_row), + ) + ] + + code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)] + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] + + return template_path, code_gen_dict + + def select_impl_style(self): + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + k_h, k_w = k + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation
+
+        # check for valid configuration
+        assert (
+            kernel_height <= ifm_dim_h
+            and kernel_width <= ifm_dim_w
+            and stride_h <= ifm_dim_h
+            and stride_w <= ifm_dim_w
+        ), "Illegal conv configuration: kernel or stride > FM dimension"
+
+        # init folding config
+        if self.get_nodeattr("parallel_window"):
+            # mmv_in = M * 1
+            mmv_out = M * k_h * k_w
+            assert (
+                ifm_ch == simd
+            ), "Constraint violated: SIMD must be equal to IFMChannels"
+        else:
+            # mmv_in = 1
+            mmv_out = 1
+            assert (
+                ifm_ch % simd == 0
+            ), "Constraint violated: SIMD must divide IFMChannels"
+
+        # choose implementation style
+        if mmv_out > 1 or (k_h == 1 and k_w == 1):
+            impl_style = "parallel"
+            assert (
+                ifm_ch == simd
+            ), "Constraint violated: SIMD must be equal to IFMChannels"
+        else:
+            impl_style = "default"
+
+        assert (
+            impl_style == "default"
+        ), "ERROR: Parallel window mode not yet implemented"
+        return impl_style
+
+    def generate_hdl(self):
+        impl_style = self.select_impl_style()
+
+        # prepare code generation by filling out dictionaries
+        if impl_style == "default":
+            template_path, code_gen_dict = self.prepare_codegen_default()
+        else:
+            raise Exception("Requested impl. style not implemented")
+
+        # add general parameters to dictionary
+        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+        code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())]
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "auto":
+            code_gen_dict["$RAM_STYLE$"] = [""]
+        else:
+            code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)]
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        with open(
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_wrapper.v", "r"
+        ) as f:
+            template_wrapper = f.read()
+        for key in code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+            template_wrapper = template_wrapper.replace(key, code_gen_line)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+            ),
+            "w",
+        ) as f:
+            f.write(template)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+            ),
+            "w",
+        ) as f:
+            f.write(template_wrapper)
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stitch_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+        # Modified to use generated (System-)Verilog instead of HLS output products
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            self.get_nodeattr("gen_top_module") + "_wrapper.v",
+            self.get_nodeattr("gen_top_module") + "_impl.sv",
+        ]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_"
+ self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + cmd = [ + "add_files -norecurse %s" + % ( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ) + ), + "add_files -norecurse %s" + % ( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ) + ), + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name), + ] + + return cmd + + def code_generation_ipgen(self, model, fpgapart, clk): + """Normally: Generates C++ code and tcl script for IP generation. + Here: Generates (System-)Verilog code for IP generation.""" + self.generate_hdl() + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py new file mode 100644 index 0000000000000000000000000000000000000000..a29e871fabbc01f0accd6858d69c0a96a5a8c495 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/eltwise.py @@ -0,0 +1,462 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
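+#
+# StreamingEltwise: applies a two-input elementwise operation (Add, Sub or
+# AbsDiff) to a pair of equally shaped streams, folded by the PE attribute
+# along the channel dimension.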
+ +import numpy as np +import os +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingEltwise(HLSCustomOp): + """Class that corresponds to finn-hlslib StreamingEltwise function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_eltwise_op_lambda(self): + eltwise_op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + tin0 = idt0.get_hls_datatype_str() + tin1 = idt1.get_hls_datatype_str() + tout = odt.get_hls_datatype_str() + eltwise_ops = { + # "Add": "[](auto a, auto b) { return a + b; }", + # "Sub": "[](auto a, auto b) { return a - b; }", + # "AbsDiff": "[](auto a, auto b) { return a>b? a-b : b-a; }", + "Add": f"add<{tin0}, {tin1}, {tout}>()", + "Sub": f"sub<{tin0}, {tin1}, {tout}>()", + "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", + } + return eltwise_ops[eltwise_op] + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self): + return self.get_normal_input_shape() + + def get_folded_output_shape(self): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." 
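+        # elementwise operation: both inputs (and the output) must share the
+        # same normal shape, which the two asserts above enforce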
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt0 = model.get_tensor_datatype(node.input[0]) + if idt0 != self.get_input_datatype(0): + warn_str = "inputDataType0 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(0)), + str(idt0), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType0", idt0.name) + idt1 = model.get_tensor_datatype(node.input[1]) + if idt1 != self.get_input_datatype(1): + warn_str = "inputDataType1 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(1)), + str(idt1), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType1", idt1.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType0") + self.get_nodeattr("inputDataType1") + self.get_nodeattr("eltwiseOp") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required StreamingEltwise attributes do not exist.""" + ) + + return info_messages + + def get_input_datatype(self, id=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType" + str(id))] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + assert idt0.signed() == idt1.signed(), ( + "%s: Inputs must have same signedness" % self.onnx_node.name + ) + idt0_min, idt0_max = idt0.min(), idt0.max() + idt1_min, idt1_max = idt1.min(), idt1.max() + cands = [ + idt0_min - idt1_min, + idt0_min - idt1_max, + idt0_max - idt1_min, + idt0_max - idt1_max, + ] + largest_magnitude = max(map(abs, cands)) + if op == "Add": + if idt0.signed(): + return DataType.get_smallest_possible(idt0.min() + idt1.min()) + else: + return DataType.get_smallest_possible(idt0.max() + idt1.max()) + elif op == "Sub": + return DataType.get_smallest_possible(-largest_magnitude) + elif op == "AbsDiff": + return DataType.get_smallest_possible(largest_magnitude) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype(ind).bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + 
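+        # illustrative folding (hypothetical values): NumChannels=64, PE=8,
+        # numInputVectors=[1, 4, 4] gives a normal shape of (1, 4, 4, 64)
+        # and a folded shape of (1, 4, 4, 8, 8)
+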
exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! It is currently set to: {},
+            but has to be one of the following values: ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input0 shape doesn't match expected shape."""
+        export_idt0 = self.get_input_datatype(0)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        # exact same thing for input1
+        inp = context[node.input[1]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input1 shape doesn't match expected shape."""
+        export_idt1 = self.get_input_datatype(1)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits0 = self.get_instream_width(0)
+            nbits1 = self.get_instream_width(1)
+            rtlsim_inp0 = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0
+            )
+            rtlsim_inp1 = npy_to_rtlsim_input(
+                "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! It is currently set to: {},
+            but has to be one of the following values: ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include "eltwise.hpp"',
+            '#include "interpret.hpp"',
+        ]
+
+        self.code_gen_dict["$GLOBALS$"].extend(
+            [
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct absdiff {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a>b? 
a-b : b-a;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct sub {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a-b;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct add {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a+b;",
+                "}",
+                "};",
+            ]
+        )
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        elem_bits_0 = idt0.bitwidth()
+        elem_bits_1 = idt1.bitwidth()
+        packed_bits_0 = self.get_instream_width(0)
+        packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0
+        packed_bits_1 = self.get_instream_width(1)
+        packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type_0, elem_hls_type_0, elem_bits_0, npy_type, npy_in)
+        )
+        npy_in = "%s/input_1.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in1);'
+            % (packed_hls_type_1, elem_hls_type_1, elem_bits_1, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width(0))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width(1))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        odt = self.get_output_datatype()
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        out_hls_type = odt.get_hls_datatype_str()
+        slice_in0 = "Slice<%s>" % elem_hls_type_0
+        slice_in1 = "Slice<%s>" % elem_hls_type_1
+        slice_out = "Slice<%s>" % out_hls_type
+        eltwise_op_str = self.get_eltwise_op_lambda()
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<{}, {}, {}, {}, {}, {}>(in0, in1, out, {});""".format(
+                "StreamingEltwise",
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PE"),
+                self.get_number_output_values(),
+                slice_in0,
+                slice_in1,
+                slice_out,
+                eltwise_op_str,
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
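+        # nothing to do here: the cppsim output is written to output.npy by
+        # the $DATAOUTSTREAM$ code generated in dataoutstrm above
+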
self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1, + hls::stream<ap_uint<{}>> &out)""".format( + self.onnx_node.name, + self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(), + self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(), + self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 27b23dd32835c265759a8cabfd2a3412844077ca..b0c05d1ad6c74ceaaaa2c932f4add3f0076bda51 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -29,6 +29,7 @@ import math import numpy as np import os +import textwrap import warnings from qonnx.core.datatype import DataType from qonnx.util.basic import ( @@ -41,6 +42,7 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, + pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) @@ -67,6 +69,36 @@ class VectorVectorActivation(HLSCustomOp): "accDataType": ("s", False, "INT32"), # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), + # memory mode for the layer weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. 
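+            # Hypothetical flush sequence from the host side (driver names are
+            # illustrative, not part of this codebase):
+            #   accel.write_weights(new_weights)  # AXI-lite weight update
+            #   accel.execute(dummy_input)        # flush stale weight FIFOs
+            #   accel.execute(real_input)         # first valid result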
+ "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # use xnor-popcount for binary weights/inputs, thus treating them + # as bipolar + "binaryXnorMode": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -198,14 +230,23 @@ class VectorVectorActivation(HLSCustomOp): out_width = o_bits * self.get_nodeattr("PE") return out_width - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") sf = k_h * k_w dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe - folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple([1, sf * nf, pe]) + else: + raise Exception("Undefined input shape for requested input") + return folded_input_shape def get_folded_output_shape(self): @@ -251,13 +292,31 @@ class VectorVectorActivation(HLSCustomOp): ret = dict() inp_hls_str = self.get_input_datatype().get_hls_datatype_str() out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # fill in TSrcI and TWeightI - # TODO handle bipolar inputs - if inp_is_bipolar or wt_is_bipolar: - raise Exception("VVAU node doesn't support bipolar values yet.") - else: + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast<XnorMul>" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast<Binary>" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast<Binary>" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): ret["TSrcI"] = "Slice<%s>" % inp_hls_str ret["TWeightI"] = "Identity" @@ -286,6 +345,13 @@ class VectorVectorActivation(HLSCustomOp): return ret def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for bipolar weights&inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape 
into (PE, TMEM, n_thres_steps) and return + """ ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") tmem = self.calc_tmem() @@ -295,14 +361,33 @@ class VectorVectorActivation(HLSCustomOp): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + if inp_is_bipolar and wt_is_bipolar: + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix # workaround for vivado_hls threshold bug - if ret[0][0] == 0: + if ret[0][0] == 0 and n_thres_steps == 1: ret = np.copy(ret) ret[0][0] = 1 warnings.warn( "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" ) + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (ch, 1)) + assert ( + ret.shape[0] == ch + ), "Channels of threshold matrix are not as expected (ch)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -319,43 +404,173 @@ class VectorVectorActivation(HLSCustomOp): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def generate_params(self, model, path): - # weights - weights = model.get_initializer(self.onnx_node.input[1]) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights in appropriate format for this + layer. This file can be used for either synthesis or run-time reconfig + of weights. 
+ + Arguments: + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) - wdt = self.get_weight_datatype() - code_gen_dir = path - - """Saves weights into params.h""" - weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True) - # write weights into params.h - f_weights = open("{}/params.h".format(code_gen_dir), "w") - - if wdt.bitwidth() != 1: - f_weights.write( - "const FixedPointWeights<1,{},{},{}> weights = ".format( - wdt.get_hls_datatype_str(), - self.get_nodeattr("PE"), - self.calc_wmem(), + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + if weight_file_mode == "hls_header": + weight_hls_code = numpy_to_hls_code( + weight_tensor, export_wdt, "weights", True, True + ) + # write weights into C++ header file as dictated by finn-hlslib + f_weights = open(weight_file_name, "w") + if export_wdt.bitwidth() != 1: + f_weights.write( + "const FixedPointWeights<1,{},{},{}> weights = ".format( + export_wdt.get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) ) + else: + f_weights.write( + "const BinaryWeights<1,{},{}> weights = ".format( + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + f_weights.write(weight_hls_code) + f_weights.close() + elif "decoupled" in weight_file_mode: + # create a weight stream for various flavors of decoupled mode: + # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) + weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape + pe = self.get_nodeattr("PE") + simd = 1 + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() + # flipped + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( + 1, -1, pe * simd ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + 
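+                # worked example (illustrative): weight_width = 96 bits
+                # -> words_per_memwidth = 2**ceil(log2(96/32)) = 4
+                # -> weight_width_padded = 4 * 32 = 128 bits per memory line
+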
words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Unknown weight_file_mode") + else: - f_weights.write( - "const BinaryWeights<1,{},{}> weights = ".format( - self.get_nodeattr("PE"), self.calc_wmem() + raise Exception("Unknown weight_file_mode") + + def generate_params(self, model, path): + mem_mode = self.get_nodeattr("mem_mode") + code_gen_dir = path + # weights, if not external + weights = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "const": + # save hlslib-compatible weights in params.h + weight_filename = "{}/params.h".format(code_gen_dir) + self.make_weight_file(weights, "hls_header", weight_filename) + elif mem_mode == "decoupled" or mem_mode == "external": + weight_filename_sim = "{}/weights.npy".format(code_gen_dir) + # save decoupled weights for cppsim + self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # note that we provide two different .dat files, one for synth + # and one for synthesis. this is because URAM-based weights always + # need zero weights for synthesis, otherwise they get inferred + # as BRAM + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( + code_gen_dir + ) + weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) + # sim weights are always the true weights + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl_sim ) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + # UltraRAM must have no memory initializer, or only zeroes + # otherwise BRAM will be inferred instead of URAM + # as a workaround we provide a zero-weight init here + synth_weights = np.zeros_like(weights, dtype=np.float32) + else: + synth_weights = weights + self.make_weight_file( + synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - f_weights.write(weight_hls_code) - f_weights.close() # save thresholds in thresh.h if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # use UINT32 threshold export for bipolar times bipolar + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # get computed threshold 
datatype from attribute tdt = DataType[self.get_nodeattr("accDataType")] + assert np.vectorize(tdt.allowed)( threshold_tensor ).all(), "Thresholds in %s can't be expressed with type %s" % ( @@ -368,8 +583,11 @@ class VectorVectorActivation(HLSCustomOp): # write thresholds into thresh.h f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") tdt_hls = tdt.get_hls_datatype_str() - odt = self.get_output_datatype() - odt_hls = odt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() f_thresh.write( "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ = ".format( @@ -387,6 +605,7 @@ class VectorVectorActivation(HLSCustomOp): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node # TODO ensure codegen dir exists @@ -440,7 +659,28 @@ class VectorVectorActivation(HLSCustomOp): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -466,6 +706,12 @@ class VectorVectorActivation(HLSCustomOp): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) if self.calc_tmem() != 0: self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] @@ -474,6 +720,8 @@ class VectorVectorActivation(HLSCustomOp): numReps = 1 * dim_h * dim_w k_h, k_w = self.get_nodeattr("Kernel") innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$DEFINES$"] = [ """#define Channels1 {}\n #define InnerProdDim {}\n #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( @@ -483,6 +731,11 @@ class VectorVectorActivation(HLSCustomOp): numReps, ) ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append( + "#define WP1 {}\n".format(wdt.bitwidth()) + ) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -500,7 +753,23 @@ class VectorVectorActivation(HLSCustomOp): % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + 
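+            # the weight stream is replayed numReps times so the same weights
+            # are available for each image position (cf. num_w_reps in
+            # execute_node)
+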
elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) @@ -508,8 +777,15 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> weights ("weights");'.format( + self.get_weightstream_width() + ) + ) def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") map_to_hls_mult_style = { "auto": "ap_resource_dflt()", "lut": "ap_resource_lut()", @@ -521,16 +797,42 @@ class VectorVectorActivation(HLSCustomOp): threshs = "PassThroughActivation<%s>()" % odtype_hls_str else: threshs = "threshs" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -561,17 +863,38 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<{}>> &in0, - hls::stream<ap_uint<{}>> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + 
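+            # decoupled/external variants expose an extra weight stream input,
+            # sized to the full weight stream width (PE * weight bits)
+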
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &weights, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_weightstream_width(), + self.get_outstream_width(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" ) - ] def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$PRAGMAS$"] = [ "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() ] @@ -593,12 +916,30 @@ class VectorVectorActivation(HLSCustomOp): "#pragma HLS INTERFACE ap_ctrl_none port=return" ) - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint<ch*prec> [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint<ch*prec> [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=weights.m_weights " + "complete dim=1" + ) + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights name=weights_" + + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS stream depth=8 variable=weights" + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds self.code_gen_dict["$PRAGMAS$"].append( @@ -614,6 +955,157 @@ class VectorVectorActivation(HLSCustomOp): ) ) + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append( + ("weights_" + sname, self.get_weightstream_width_padded()) + ) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin 
-mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.NSTREAMS {1} " + "CONFIG.MEM_DEPTH {%d} " + "CONFIG.MEM_WIDTH {%d} " + "CONFIG.MEM_INIT {%s} " + "CONFIG.RAM_STYLE {%s} " + "CONFIG.STRM0_DEPTH {%d} " + "CONFIG.STRM0_WIDTH {%d} " + "CONFIG.STRM0_OFFSET {0} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("ram_style"), + self.calc_wmem(), + self.get_weightstream_width_padded(), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const" or mem_mode == "external": + # base class impl sufficient for const/external modes + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for VectorVectorActivation") + return cmd + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = 1 + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + def bram_estimation(self): 
"""Calculates resource estimation for BRAM""" # TODO add in/out FIFO contributions @@ -624,7 +1116,13 @@ class VectorVectorActivation(HLSCustomOp): # assuming SDP mode RAMB18s (see UG573 Table 1-10) # since this is HLS memory, not using the full width of a BRAM # assuming memories up to 128 deep get implemented in LUTs - if self.calc_wmem() <= 128: + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): return 0 if W == 1: @@ -671,8 +1169,12 @@ class VectorVectorActivation(HLSCustomOp): c0 = 300 c1 = 1.1 c2 = 0 - if self.calc_wmem() <= 128: - c2 = P * W * math.ceil(self.calc_wmem() / 64) + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * W) * math.ceil(self.calc_wmem() / 64) # multiplication res_type = self.get_nodeattr("resType") @@ -710,6 +1212,25 @@ class VectorVectorActivation(HLSCustomOp): mult_dsp = 0 return int(mult_dsp) + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * wp + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels") diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 9059e023a03ac06a5f24fc09d52bd7aca6451107..b7db49eb22e0ccb6e3ffbf8ccad44d4274cb2154 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -48,6 +48,10 @@ from finn.transformation.fpgadataflow.minimize_accumulator_width import ( class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" + def __init__(self, use_rtl_variant=False): + super().__init__() + self.use_rtl_variant = use_rtl_variant + def apply(self, model): graph = model.graph node_ind = 0 @@ -128,108 +132,144 @@ class InferConvInpGen(Transformation): ) graph.node.insert(node_ind, padding_node) - # Ensure that only supported HLS nodes are inserted + is_kernel_pointwise = k_h == 1 and k_w == 1 is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w is_square_kernel = k_h == k_w - is_kernel_pointwise = k_h == 1 and k_w == 1 is_equal_stride = stride_h == stride_w is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( k_h > 1 and k_w == 1 and ifm_dim_w == 1 ) - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) - is1D_unitx = ifm_dim_w == 1 - downsample_2D = ( - (not downsample_1D) and is_square_image and is_equal_stride - ) - if not (downsample_1D or downsample_2D): + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + if is_kernel_pointwise: + 
is_rtl_variant_compatible = False + if self.use_rtl_variant: warnings.warn( - f"Couldn't infer Downsample from {n.name}, check config." + """%s : RTL ConvInpGen requested for unsupported + configuration. Falling back to HLS implementation.""" + % n.name ) - continue - ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) - stride = max(stride_h, stride_w) - # create DownSampler node + + if self.use_rtl_variant and is_rtl_variant_compatible: + ConvInpGen_node = helper.make_node( - "DownSampler", + "ConvolutionInputGenerator_rtl", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ImgDim=ConvInpGen_idim, - NumChannels=ifm_ch, + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], SIMD=ifm_ch, - Stride=stride, + M=1, + parallel_window=0, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], inputDataType=dt.name, - name="DownSampler_" + n.name, - is1D=downsample_1D, - is1D_unitx=is1D_unitx, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_rtl_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: - # create equivalent ConvolutionInputGenerator node - if ( - is_square_image and is_square_kernel - ): # square images and square kernels - assert is_equal_stride, ( - """%s: Non-equal strides along different axes is not supported - for (non-)square convolutions""" - % n.name - ) - assert dilation_h == 1 and dilation_w == 1, ( - """%s: Dilation value != 1 is not supported - for square convolutions""" - % n.name + # Ensure that only supported HLS nodes are inserted + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + is1D_unitx = ifm_dim_w == 1 + downsample_2D = ( + (not downsample_1D) and is_square_image and is_equal_stride ) + if not (downsample_1D or downsample_2D): + warnings.warn( + f"Couldn't infer Downsample from {n.name},check config." 
+ ) + continue + ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) + stride = max(stride_h, stride_w) + # create DownSampler node ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", + "DownSampler", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], + Stride=stride, inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_" + n.name, + name="DownSampler_" + n.name, + is1D=downsample_1D, + is1D_unitx=is1D_unitx, ) - else: # 1D images and/or kernels - assert is_1d_convolution, ( - "%s: ConvolutionInputGenerator1D works only for 1D convs" - % n.name - ) - if dilation_h > 1 or dilation_w > 1: - assert depthwise == 1, ( - """%s: Dilation value > 1 is only supported for - 1D depthwise separable convolutions""" + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + else: + # create equivalent ConvolutionInputGenerator node + if ( + is_square_image and is_square_kernel + ): # square images and square kernels + assert is_equal_stride, ( + """%s: Non-equal strides along different axes is not supported + for (non-)square convolutions""" % n.name ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator1D", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator1D_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + assert dilation_h == 1 and dilation_w == 1, ( + """%s: Dilation value != 1 is not supported + for square convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_" + n.name, + ) + else: # 1D images and/or kernels + assert is_1d_convolution, ( + """%s: ConvolutionInputGenerator1D works only + for 1D convs""" + % n.name + ) + if dilation_h > 1 or dilation_w > 1: + assert depthwise == 1, ( + """%s: Dilation value > 1 is only supported for + 1D depthwise separable convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator1D", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator1D_" + n.name, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) 
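+                # exactly one generator variant (RTL SWG, DownSampler, 2D or
+                # 1D HLS SWG) has been inserted in place of the Im2Col node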
# remove old nodes graph.node.remove(n) graph_modified = True @@ -873,6 +913,10 @@ class InferVectorVectorActivation(Transformation): a depthwise convolution. Any immediately following MultiThreshold layers will also be absorbed into the VVAU.""" + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + def apply(self, model): graph = model.graph node_ind = 0 @@ -973,6 +1017,7 @@ class InferVectorVectorActivation(Transformation): ActVal=actval, noActivation=0, name="VectorVectorActivation_" + n.name, + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -1674,3 +1719,95 @@ class InferConcatLayer(Transformation): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferStreamingEltwise(Transformation): + """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer + with SubEltwise or AbsDiffEltwise op.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Sub": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) + + # skip conversion for layers with float input + if not (idt0.is_integer() and idt1.is_integer()): + continue + + eltwiseOp = "Sub" + nodes_to_remove = [node] + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + + # create and insert new Eltwise node + new_node = helper.make_node( + "StreamingEltwise", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType0=idt0.name, + inputDataType1=idt1.name, + eltwiseOp=eltwiseOp, + numInputVectors=in0_shape[:-1], + name="StreamingEltwise_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old nodes + for nd in nodes_to_remove: + graph.node.remove(nd) + graph_modified = True + + # if graph_modified: + # model = model.transform(InferShapes()) + # model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 
892ab09fdf41947f86e2bf122e057e94585dfa8c..831929a3ff7a63b40435d5b11a9bc268c097df38 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -275,7 +275,7 @@ class CreateStitchedIP(Transformation): "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name ) self.connect_cmds.append( - "set_property name s_axis_info [get_bd_intf_ports s_axi_0]" + "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]" ) self.connect_cmds.append("assign_bd_address") diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index a589cb039c825ff97c11df7ffa57109df27f3fd0..f48566326e576f4d39d81359fe7f28a12645a635 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -45,7 +45,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.util.basic import make_build_dir, pynq_part_map +from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map from . import templates @@ -320,6 +320,7 @@ class ZynqBuild(Transformation): ): super().__init__() self.fpga_part = pynq_part_map[platform] + self.axi_port_width = pynq_native_port_width[platform] self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug @@ -330,7 +331,7 @@ class ZynqBuild(Transformation): model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels prep_transforms = [ - InsertIODMA(64), + InsertIODMA(self.axi_port_width), InsertDWC(), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 23943084ab99d6ab880a69975e0b4a49756905a7..e24e24f1f8ebb2873c81617884cd333311d8aea9 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -109,6 +109,7 @@ class SetFolding(Transformation): "FMPadding_Batch", "ConvolutionInputGenerator", "ConvolutionInputGenerator1D", + "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring @@ -171,10 +172,7 @@ class SetFolding(Transformation): "Expected SWU on DW op input, found " + swu_node.op_type ) elif op_type in simd_ops: - if op_type in [ - "ConvolutionInputGenerator", - "ConvolutionInputGenerator1D", - ]: + if op_type.startswith("ConvolutionInputGenerator"): depthwise = node_inst.get_nodeattr("depthwise") if depthwise == 0: max_simd = node_inst.get_nodeattr("IFMChannels") diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py index 80b6042d03ea11a45493011288133ed3a6f57c8d..e8339ae24472fa238e5c5da176b1316611218a54 100644 --- a/src/finn/transformation/qonnx/fold_quant_weights.py +++ b/src/finn/transformation/qonnx/fold_quant_weights.py @@ -126,10 +126,20 @@ class FoldQuantWeights(Transformation): model.set_tensor_datatype(node_out, new_dtype) # Reshape scale for Conv if required + target_output_shape = model.get_tensor_shape( + target_node.output[0] + ) if target_node.op_type == "Conv" and len(scale.shape) > 
0: - bias_shape = [1] * len(scale.shape) - bias_shape[1] = -1 - scale = scale.reshape(bias_shape) + conv_out_shape = [1] * len(target_output_shape) + # only support per-output channel scaling + # (i.e. all scale shape elems besides 0th must be 1s) + if len(scale.shape) > 1: + assert ( + np.prod(scale.shape[1:]) == 1 + ), "Can't fold scale beyond per-out-channel granularity" + # collect all scaling in channels dim (since we constrain) + conv_out_shape[1] = -1 + scale = scale.reshape(conv_out_shape) if scale.shape == (1,): scale = scale[0] diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py index c52d69b0f09d306c5b076bb6ef1775f38977241a..77025ecdf57d5a422992d4163d05c740454986bb 100644 --- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py +++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py @@ -110,11 +110,6 @@ class ConvertQuantActToMultiThreshold(Transformation): predecessor_op_type = predecessor[0].op_type else: predecessor_op_type = predecessor - if model.is_fork_node(n): - raise ValueError( - "Forking Quant/BipolarQuant nodes are currently " - "not supported by FINN." - ) if n.op_type == "Quant" and not model.get_initializer(n.input[2]) == 0: raise ValueError( "Only Quant nodes with zero-point == 0 are currently supported." diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 0299c4f4d89d1fdd94434db77c77a0e529c86d26..a983e67750a0a860eeeb4b429f7d6b181fc84fe3 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -473,7 +473,7 @@ class AbsorbConsecutiveTransposes(Transformation): """Remove (Transpose -> Transpose) patterns when the input and output of the pattern have the same layout.""" - def Are_opposite_permutations(self, perms1, perms2): + def are_opposite_permutations(self, perms1, perms2): if len(perms1) != len(perms2): return False assert 0 <= max(perms2) < len(perms2), "invalid permutation" @@ -488,72 +488,40 @@ class AbsorbConsecutiveTransposes(Transformation): def apply(self, model): graph = model.graph graph_modified = False - for n in graph.node: - if n.op_type == "Transpose": - if model.is_fork_node(n): - next_nodes = model.find_direct_successors(n) - perms1 = list(get_by_name(n.attribute, "perm").ints) - - # check if all nodes after fork are opposite transposes - all_opposite_transposes = True - for next_node in next_nodes: - if next_node is not None and next_node.op_type == "Transpose": - perms2 = list(get_by_name(next_node.attribute, "perm").ints) - if not self.Are_opposite_permutations(perms1, perms2): - all_opposite_transposes = False - break - else: - all_opposite_transposes = False - break - - if not all_opposite_transposes: - continue - - prod = model.find_producer(n.input[0]) - for next_node in next_nodes: - # connect next_node's consumer input to n's producer output - # TODO implement this to allow for forks as producers and - # joins as consumers - cons = model.find_consumer(next_node.output[0]) - cons.input[0] = prod.output[0] - - # remove consumer transpose - graph.node.remove(next_node) - - # remove producer transpose - graph.node.remove(n) - graph_modified = True - - else: - next_node = model.find_consumer(n.output[0]) + for node in graph.node: + if node.op_type == "Transpose": + next_nodes = model.find_consumers(node.output[0]) + perms1 = list(get_by_name(node.attribute, "perm").ints) + # check if all nodes after fork are opposite 
transposes
+                all_opposite_transposes = True
+                for next_node in next_nodes:
                     if next_node is not None and next_node.op_type == "Transpose":
-                        perms1 = list(get_by_name(n.attribute, "perm").ints)
                         perms2 = list(get_by_name(next_node.attribute, "perm").ints)
-                        if self.Are_opposite_permutations(perms1, perms2):
-
-                            # connect next_node's consumer input to n's producer output
-                            # TODO implement this to allow for forks as producers
-                            consumers = model.find_direct_successors(next_node)
-                            prod = model.find_producer(n.input[0])
-                            if prod is not None:
-                                for cons in consumers:
-                                    for cons_in in cons.input:
-                                        if cons_in == next_node.output[0]:
-                                            prod.output[0] = cons_in
-                                            break
-                            else:
-                                # n.input[0] is top-level graph input
-                                # wire consumers directly to that
-                                for cons in consumers:
-                                    for i, iname in enumerate(cons.input):
-                                        if iname == next_node.output[0]:
-                                            cons.input[i] = n.input[0]
-
-                            # remove both transposes
-                            graph.node.remove(n)
-                            graph.node.remove(next_node)
+                        if not self.are_opposite_permutations(perms1, perms2):
+                            all_opposite_transposes = False
+                            break
+                    else:
+                        all_opposite_transposes = False
+                        break
+                if not all_opposite_transposes:
+                    continue
+                source_tensor = node.input[0]
+                for next_node in next_nodes:
+                    # connect next_node's consumers' appropriate input to node's input
+                    # TODO how to handle top-level outputs if any?
+                    nextnode_out = next_node.output[0]
+                    assert nextnode_out not in [x.name for x in model.graph.output]
+                    consumers = model.find_consumers(nextnode_out)
+                    for cons in consumers:
+                        for i, iname in enumerate(cons.input):
+                            if iname == nextnode_out:
+                                cons.input[i] = source_tensor
+                    # remove consumer transpose
+                    graph.node.remove(next_node)
+                # remove producer transpose
+                graph.node.remove(node)
+                graph_modified = True
-                            graph_modified = True
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 9ff8a2173ce81e2a19c56bbd20a326759c3b9df2..29eefacc32370598ddcd39283d022f5eb61f3f0c 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -553,6 +553,8 @@ class MoveLinearPastEltwiseAdd(Transformation):
             # Other transform should handle that
             if prod0 is None or prod1 is None or (prod0 == prod1):
                 continue
+            if len(prod0.input) < 2 or len(prod1.input) < 2:
+                continue
             init0 = model.get_initializer(prod0.input[1])
             init1 = model.get_initializer(prod1.input[1])
             # if either initializer is None, skip
@@ -723,14 +725,86 @@ class MakeMaxPoolNHWC(Transformation):
         return (model, graph_modified)
 
+class MakeScaleResizeNHWC(Transformation):
+    """
+    Converts the inputs and outputs of all scales-based Resize and Upsample
+    nodes from NCHW to NHWC.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                if model.get_tensor_layout(n.input[0]) != DataLayout.NCHW:
+                    warnings.warn(
+                        "%s: Input not NCHW. Can't apply transformation to node."
+ % n.name + ) + continue + consumer = model.find_consumer(n.output[0]) + producer = model.find_producer(n.input[0]) + if n.op_type == "Upsample": + scales_ind = 1 + else: + scales_ind = 2 + if producer is not None and producer.op_type == "Transpose": + perms = list(get_by_name(producer.attribute, "perm").ints) + if perms == [0, 3, 1, 2]: + old_value = model.get_initializer(n.input[scales_ind]) + new_value = np.array( + [old_value[idx] for idx in (0, 2, 3, 1)], + dtype=np.dtype("float32"), + ) + model.set_initializer(n.input[scales_ind], new_value) + start_name = producer.input[0] + mid_name = n.input[0] + end_name = n.output[0] + (b, hi, wi, c) = model.get_tensor_shape(start_name) + (b, c, ho, wo) = model.get_tensor_shape(end_name) + producer.input[0] = mid_name + producer.output[0] = end_name + n.input[0] = start_name + n.output[0] = mid_name + model.set_tensor_shape(mid_name, (b, ho, wo, c)) + model.set_tensor_shape(end_name, (b, c, ho, wo)) + graph.node.remove(producer) + graph.node.insert(node_ind, producer) + elif consumer is not None and consumer.op_type == "Transpose": + perms = list(get_by_name(consumer.attribute, "perm").ints) + if perms == [0, 2, 3, 1]: + old_value = model.get_initializer(n.input[scales_ind]) + new_value = np.array( + [old_value[idx] for idx in (0, 2, 3, 1)], + dtype=np.dtype("float32"), + ) + model.set_initializer(n.input[scales_ind], new_value) + start_name = n.input[0] + mid_name = consumer.input[0] + end_name = consumer.output[0] + (b, c, hi, wi) = model.get_tensor_shape(start_name) + (b, c, ho, wo) = model.get_tensor_shape(mid_name) + consumer.input[0] = start_name + consumer.output[0] = mid_name + n.input[0] = mid_name + n.output[0] = end_name + model.set_tensor_shape(mid_name, (b, hi, wi, c)) + model.set_tensor_shape(end_name, (b, ho, wo, c)) + graph.node.remove(consumer) + graph.node.insert(node_ind - 1, consumer) + return (model, False) + + class MoveOpPastFork(Transformation): """Move node operations past graph forks. Used when a node before a fork can be merged with nodes in the branches """ - def __init__(self, op_name_list): + def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): super().__init__() self.ops_to_move = op_name_list + self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -747,9 +821,10 @@ class MoveOpPastFork(Transformation): # Restrict this transform to operations with constant parameters # Assuming parameters is in input 1 - op_init_param = model.get_initializer(n.input[1]) - if op_init_param is None: - continue + if len(n.input) > 1: + op_init_param = model.get_initializer(n.input[1]) + else: + op_init_param = None # Check case when branches are empty and go # to the same node @@ -766,16 +841,20 @@ class MoveOpPastFork(Transformation): for consumer_node in consumers[1:]: # create new node - new_param_name = model.make_new_valueinfo_name() new_output_tensor_name = model.make_new_valueinfo_name() + if op_init_param is None: + new_inp_list = [n.input[0]] + else: + new_param_name = model.make_new_valueinfo_name() + new_inp_list = [n.input[0], new_param_name] + model.set_initializer(new_param_name, op_init_param) + attrs = self.get_attrs_fxn(n) + # TODO use copy of original node instead to get attrs? 
new_node = oh.make_node( - n.op_type, - [n.input[0], new_param_name], - [new_output_tensor_name], + n.op_type, new_inp_list, [new_output_tensor_name], **attrs ) graph.node.insert(node_ind, new_node) node_ind += 1 - model.set_initializer(new_param_name, op_init_param) # change consumer input tensor graph.node.remove(consumer_node) @@ -811,6 +890,13 @@ class MoveLinearPastFork(MoveOpPastFork): super().__init__(["Add", "Mul"]) +class MoveTransposePastFork(MoveOpPastFork): + def __init__(self): + super().__init__( + ["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints} + ) + + class MoveMaxPoolPastMultiThreshold(Transformation): """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph.""" diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py index b0c3d6088c27291f1f49dd2f1ee746b65ca0a737..3dc46ec31e49d7115b19b3373d54be6ddc29bb80 100644 --- a/tests/brevitas/test_brevitas_relu_act_export.py +++ b/tests/brevitas/test_brevitas_relu_act_export.py @@ -41,6 +41,7 @@ from brevitas.nn import QuantReLU from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup as qonnx_cleanup +from torch import nn import finn.core.onnx_exec as oxe from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN @@ -179,3 +180,83 @@ scaling_impl.learned_value": rand_tensor.type( assert np.isclose(produced, expected, atol=1e-3).all() os.remove(export_onnx_path) + + +class PyTorchTestModel(nn.Module): + def __init__(self, abits): + super(PyTorchTestModel, self).__init__() + out_channels = 32 + self.b_act = QuantReLU( + bit_width=abits, + quant_type=QuantType.INT, + scaling_impl_type=ScalingImplType.PARAMETER, + scaling_per_channel=True, + restrict_scaling_type=RestrictValueType.LOG_FP, + scaling_min_val=2e-16, + max_val=6.0, + return_quant_tensor=False, + per_channel_broadcastable_shape=(1, out_channels, 1, 1), + ) + + def forward(self, x): + act_out = self.b_act(x) + y0 = act_out * 2.0 + y1 = act_out * -1.0 + y = y0 + y1 + return y + + +@pytest.mark.brevitas_export +@pytest.mark.parametrize("abits", [2, 4, 8]) +@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) +@pytest.mark.parametrize("scaling_per_channel", [True]) +@pytest.mark.parametrize("QONNX_export", [True]) +def test_brevitas_act_export_relu_forking( + abits, max_val, scaling_per_channel, QONNX_export +): + out_channels = 32 + ishape = (1, out_channels, 1, 1) + min_val = -1.0 + model_pyt = PyTorchTestModel(abits) + + rand_tensor = (2) * torch.rand((1, out_channels, 1, 1)) + + checkpoint = { + "b_act.act_quant_proxy.fused_activation_quant_proxy." 
+ "tensor_quant.scaling_impl.learned_value": rand_tensor.type(torch.FloatTensor) + } + model_pyt.load_state_dict(checkpoint) + + if QONNX_export: + m_path = export_onnx_path + BrevitasONNXManager.export(model_pyt, ishape, m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) + model.save(m_path) + + model = ModelWrapper(export_onnx_path) + model = model.transform(InferShapes()) + inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( + np.float32 + ) + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + inp_tensor = torch.from_numpy(inp_tensor).float() + model_pyt.eval() + expected = model_pyt.forward(inp_tensor).detach().numpy() + if not np.isclose(produced, expected, atol=1e-3).all(): + print(abits, max_val) + print("scale: ", model_pyt.quant_act_scale().type(torch.FloatTensor).detach()) + if abits < 5: + print( + "thres:", + ", ".join(["{:8.4f}".format(x) for x in model_pyt.export_thres[0]]), + ) + print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) + print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) + print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) + + assert np.isclose(produced, expected, atol=1e-3).all() + os.remove(export_onnx_path) diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py index 5bbaefac2d3e5f800fbb9471df6469235271c2f3..7b3e20616410f54e4718290baec9a510a0d49c0d 100644 --- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py @@ -66,11 +66,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode ], ) @pytest.mark.parametrize("depthwise", [False, True]) +@pytest.mark.parametrize("use_rtl_swg", [False, True]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): +def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): pad, kernel_size, stride, dilation = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -84,6 +85,9 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] + if use_rtl_swg and exec_mode == "cppsim": + pytest.skip("cppsim not supported for RTL SWG") + if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, k_h, k_w] @@ -139,7 +143,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen()) + new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) if depthwise is True: new_model = new_model.transform(to_hls.InferVectorVectorActivation()) else: diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index 55dc77cafb898ead28a7cbb9641e0b40db276919..8c9f110c315089ec03354863bf2213963197217a 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -57,11 +57,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode "conv_config", [(1, 2, 0), 
(1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)] ) @pytest.mark.parametrize("depthwise", [False, True]) +@pytest.mark.parametrize("use_rtl_swg", [False, True]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): +def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): kernel_size, stride, pad = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -69,6 +70,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): in_feature_dim = 7 in_chn = 16 + if use_rtl_swg and exec_mode == "cppsim": + pytest.skip("cppsim not supported for RTL SWG") + + if use_rtl_swg and kernel_size == 1: + pytest.skip("1x1 kernel not supported by current RTL SWG") + if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, kernel_size, kernel_size] @@ -122,7 +129,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen()) + new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) if depthwise is True: new_model = new_model.transform(to_hls.InferVectorVectorActivation()) else: @@ -156,6 +163,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): x = gen_finn_dt_tensor(idt, input_shape) inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) + if kernel_size == 1 and stride > 1 and pad == 0: assert new_model.graph.node[1].op_type == "DownSampler" if exec_mode == "rtlsim": diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index dddc470ec2ed88faf078f19bd0d2a7a4a6b5b6cd..8488a34dff52d39c28fbea25275c9a4b59c37f80 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -144,6 +144,5 @@ def test_fpgadataflow_concat_stitchedip(): ) model.set_metadata_prop("exec_mode", "rtlsim") model.set_metadata_prop("rtlsim_trace", "trace.vcd") - model.save("dbg.onnx") ret_sim = execute_onnx(model, inp_dict) assert (exp_out == ret_sim[oname]).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py new file mode 100755 index 0000000000000000000000000000000000000000..007360a5fd0b74ee49d54c84f332061dd5f3a114 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -0,0 +1,260 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + im2col_node = helper.make_node( + "Im2Col", + ["inp"], + ["outp"], + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], + pad_amount=[0, 0, 0, 0], + pad_value=0, + ) + graph = helper.make_graph( + nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="im2col-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def make_single_slidingwindow_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 +): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + SlidingWindow_node = helper.make_node( + "ConvolutionInputGenerator_rtl", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ifm_dim_h, ifm_dim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=simd, + M=m, + parallel_window=parallel_window, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=idt.name, + outputDataType=odt.name, + depthwise=dw, + ) + graph = helper.make_graph( + nodes=[SlidingWindow_node], + name="slidingwindow_graph", + 
inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="slidingwindow-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["UINT4"]]) +# kernel size +@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]]) +# input dimension +@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [6]) +# Stride +@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) +# Dilation +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1, 2, 3, 6]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_slidingwindow_rtl( + idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip +): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) + if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( + k_w == 1 and (stride_w != 1 or dilation_w != 1) + ): + pytest.skip( + """Illegal convolution configuration: + stride or dilation defined for unitary kernel dim""" + ) + if k_h == 1 and k_w == 1 and simd != ifm_ch: + pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)") + if parallel_window and simd != ifm_ch: + pytest.skip("Parallel window requires SIMD=C") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + model = make_single_slidingwindow_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + simd=simd, + m=m, + parallel_window=parallel_window, + stride=stride, + dilation=dilation, + idt=idt, + dw=dw, + ) + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareRTLSim()) + + # prepare input data + input_dict = prepare_inputs(x) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + golden = make_single_im2col_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + stride=stride, + dilation=dilation, + idt=idt, + ) + y_expected = oxe.execute_onnx(golden, input_dict)["outp"] + + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape( + 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd + ) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) + assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py new file mode 100644 index 0000000000000000000000000000000000000000..6028a9b9f0fb4a04d0f53fd8c4fae3aac3ae686e --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import onnx.parser as oprs +import qonnx.core.data_layout as dl +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def build_model(shp, dt0, dt1, do_abs): + np.random.seed(0) + shp_str = str(shp) + if do_abs: + graph = """ + sub_out = Sub(in0, in1) + out0 = Abs(sub_out) + """ + else: + graph = "out0 = Sub(in0, in1)" + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0, float{shp_str} in1) => (float{shp_str} out0) + {{ + {graph} + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model.set_tensor_datatype("in0", dt0) + model.set_tensor_datatype("in1", dt1) + model.set_tensor_layout("in0", dl.NHWC) + model.set_tensor_layout("in1", dl.NHWC) + model = model.transform(InferShapes()) + return model + + +# input datatype for one operand +@pytest.mark.parametrize("dt0", [DataType["UINT4"], DataType["UINT7"]]) +# channels +@pytest.mark.parametrize("ch", [1, 64]) +# folding +@pytest.mark.parametrize("fold", [-1, 2, 1]) +# include Abs output node or not +@pytest.mark.parametrize("do_abs", [True, False]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): + if fold == -1: + pe = 1 + else: + pe = max(1, ch // fold) + assert ch % pe == 0 + dt1 = DataType["UINT8"] + shp = [1, 4, 2, ch] + model = build_model(shp, dt0, dt1, do_abs) + in0 = gen_finn_dt_tensor(dt0, shp) + in1 = gen_finn_dt_tensor(dt1, shp) + idict = {"in0": in0, "in1": in1} + y_expected = execute_onnx(model, idict)["out0"] + model = model.transform(to_hls.InferStreamingEltwise()) + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "StreamingEltwise" + getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) + if 
exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all(), exec_mode + " failed" + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("StreamingEltwise")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index c48448787d8a3bb926c1e94850be6e99e8c106d3..03ddb1286320b8178276ea53082095106a43d7a1 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -75,7 +75,19 @@ def _calculate_dot_prod_range(dt_a, dt_b, len): def _make_single_vvau_modelwrapper( - W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None + W, + pe, + k_h, + k_w, + channels, + dim_h, + dim_w, + wdt, + idt, + odt, + T=None, + tdt=None, + mem_mode="const", ): in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] out_shape = [ @@ -113,6 +125,7 @@ def _make_single_vvau_modelwrapper( weightDataType=wdt.name, outputDataType=odt.name, noActivation=no_act, + mem_mode=mem_mode, ) graph = helper.make_graph( @@ -140,7 +153,7 @@ def prepare_inputs(input_tensor): return {"inp": input_tensor} -# mem_mode: const or decoupled +# input datatype @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) @@ -156,13 +169,15 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("k_w", [3, 1]) # Number of input and output channels @pytest.mark.parametrize("channels", [3, 4]) +# memory mode +@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_vvau( - idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode + idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode ): if pe == "channels": pe = channels @@ -198,7 +213,7 @@ def test_fpgadataflow_vvau( tdt = DataType["INT32"] model = _make_single_vvau_modelwrapper( - W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt + W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode ) if exec_mode == "cppsim": diff --git a/tests/transformation/streamline/test_absorb_opposite_transposes.py b/tests/transformation/streamline/test_absorb_opposite_transposes.py index 51ea5edfc420bf935de3e196df1b150934782a91..6d8d2b9f0cd4ad28c3ea0922d69b9b963a0deb08 100644 --- a/tests/transformation/streamline/test_absorb_opposite_transposes.py +++ b/tests/transformation/streamline/test_absorb_opposite_transposes.py @@ -29,8 +29,7 @@ import pytest import numpy as np -import onnx.helper as oh -from onnx import TensorProto +import onnx.parser as oprs from qonnx.core.modelwrapper import ModelWrapper from 
qonnx.transformation.infer_shapes import InferShapes
@@ -41,39 +40,42 @@ from finn.transformation.streamline.absorb import AbsorbConsecutiveTransposes
 
 @pytest.mark.streamline
 def test_absorb_opposite_transposes():
     np.random.seed(0)
-    input_shape = [1, 3, 4, 2]
-    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
-    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
-    value_info = [oh.make_tensor_value_info("add_param_0", TensorProto.FLOAT, [1])]
-    value_info += [oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [1])]
-    value_info += [oh.make_tensor_value_info("mul_param_0", TensorProto.FLOAT, [1])]
-    modelproto = oh.make_model(
-        oh.make_graph(
-            name="test",
-            inputs=[top_in],
-            outputs=[top_out],
-            value_info=value_info,
-            nodes=[
-                oh.make_node("Add", ["top_in", "add_param_0"], ["t0"]),
-                oh.make_node("Transpose", ["t0"], ["t1"], perm=[0, 2, 3, 1]),
-                oh.make_node("Transpose", ["t1"], ["t2"], perm=[0, 3, 1, 2]),
-                oh.make_node("Add", ["t2", "add_param_1"], ["t3"]),
-                oh.make_node("Transpose", ["t3"], ["t4"], perm=[0, 2, 3, 1]),
-                oh.make_node("Transpose", ["t4"], ["t5"], perm=[0, 3, 1, 2]),
-                oh.make_node("Add", ["t5", "t2"], ["t6"]),
-                oh.make_node("Mul", ["t6", "mul_param_0"], ["top_out"]),
-            ],
-        )
-    )
-    model = ModelWrapper(modelproto)
+    shp = [1, 3, 4, 2]
+    shp_str = str(shp)
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0) => (float{shp_str} out0)
+    <
+        float[1] add0_param = {{1.0}},
+        float[1] add1_param = {{3.0}},
+        float[1] mul0_param = {{2.0}}
+    >
+    {{
+        add0_out = Add(in0, add0_param)
+        t0_out = Transpose<perm=[0,2,3,1]>(add0_out)
+        t1_out = Transpose<perm=[0,3,1,2]>(t0_out)
+        add1_out = Add(t1_out, add1_param)
+        t2_out = Transpose<perm=[0,2,3,1]>(add1_out)
+        t3_out = Transpose<perm=[0,3,1,2]>(t2_out)
+        add2_out = Add(t1_out, t3_out)
+        t4_out = Transpose<perm=[0,2,3,1]>(add2_out)
+        t5_out = Transpose<perm=[0,3,1,2]>(t4_out)
+        t6_out = Transpose<perm=[0,3,1,2]>(t4_out)
+        m0_out = Mul(t5_out, mul0_param)
+        m1_out = Mul(t6_out, mul0_param)
+        out0 = Mul(m0_out, m1_out)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
     model = model.transform(InferShapes())
-    model.set_initializer("add_param_0", np.asarray([1], dtype=np.float32))
-    model.set_initializer("add_param_1", np.asarray([3], dtype=np.float32))
-    model.set_initializer("mul_param_0", np.asarray([2], dtype=np.float32))
     new_model = model.transform(AbsorbConsecutiveTransposes())
     new_model = new_model.transform(InferShapes())
-    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+    inp_dict = {"in0": np.random.rand(*shp).astype(np.float32)}
     assert ox.compare_execution(model, model, inp_dict)
-    assert len(new_model.graph.node) == 4
+    assert len(new_model.graph.node) == 6
     for n in new_model.graph.node:
         assert new_model.graph.node[0].op_type != "Transpose"
diff --git a/tests/transformation/streamline/test_move_past_fork.py b/tests/transformation/streamline/test_move_past_fork.py
index 5064fa3fca869a245c87cf0c1680d1357e5de60b..7e77d7f9b3502429f08c40558e330b6261d0dbad 100644
--- a/tests/transformation/streamline/test_move_past_fork.py
+++ b/tests/transformation/streamline/test_move_past_fork.py
@@ -28,80 +28,113 @@ import pytest
 
 import numpy as np
-from onnx import TensorProto, helper
+import onnx.parser as oprs
 from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
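Both the rewritten test above and the fork tests below build their graphs from ONNX's textual syntax via onnx.parser instead of chained onnx.helper calls. As a minimal, self-contained sketch of that pattern (shapes and tensor names here are illustrative, not taken from the tests):

    import onnx.parser as oprs
    from onnx import checker

    graph_txt = """
    <
        ir_version: 7,
        opset_import: ["" : 9]
    >
    agraph (float[1,3,4,2] in0) => (float[1,3,4,2] out0)
    {
        t0_out = Transpose<perm=[0,2,3,1]>(in0)
        out0 = Transpose<perm=[0,3,1,2]>(t0_out)
    }
    """
    # parse_model turns the textual form into a ModelProto; the two opposite
    # permutations compose to the identity, which is exactly the pattern
    # AbsorbConsecutiveTransposes removes
    model = oprs.parse_model(graph_txt)
    checker.check_model(model)

The textual form keeps the graph topology legible in the test source and avoids the make_tensor_value_info/make_node boilerplate the old versions of these tests needed.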
+from qonnx.util.basic import get_by_name import finn.core.onnx_exec as oxe -from finn.transformation.streamline.reorder import MoveLinearPastFork +from finn.transformation.streamline.reorder import ( + MoveLinearPastFork, + MoveTransposePastFork, +) + + +@pytest.mark.streamline +def test_move_past_fork_transpose(): + shp = [1, 3, 32, 32] + shp_str = str(shp) + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0) => (float{shp_str} out0) + {{ + t0_out = Transpose<perm=[0,2,3,1]>(in0) + t1_out = Transpose<perm=[0,3,1,2]>(t0_out) + t2_out = Transpose<perm=[0,3,1,2]>(t0_out) + out0 = Add(t1_out, t2_out) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model = model.transform(InferShapes()) + new_model = model.transform(MoveTransposePastFork()) + new_model = new_model.transform(GiveUniqueNodeNames()) + nodes = new_model.graph.node + assert oxe.compare_execution( + model, new_model, {"in0": np.random.rand(*shp).astype(np.float32)} + ) + assert len(nodes) == 5 + assert not new_model.is_fork_node(get_by_name(nodes, "Transpose_0")) @pytest.mark.streamline @pytest.mark.parametrize("ch", [64, 1]) # ifmdim @pytest.mark.parametrize("ifmdim", [-1, 7]) -def test_move_past_fork(ch, ifmdim): - # generate test vectors of correct shape +def test_move_past_fork_linear(ch, ifmdim): if ifmdim == -1: - input_shape = (1, ch) + shp = [1, ch] else: - input_shape = (1, ch, ifmdim, ifmdim) + shp = [1, ch, ifmdim, ifmdim] + shp_str = str(shp) + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0) => (float{shp_str} out0) + < + float{shp_str} add0_param, + float{shp_str} mul_shared_param, + float{shp_str} add2_param, + float{shp_str} mul2_param, + float{shp_str} add3_param, + float{shp_str} add4_param, + float{shp_str} mul3_param, + float{shp_str} add6_param + > + {{ - top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) - top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape) - - num_of_params = 8 - value_info = [] - for i in range(num_of_params): - value_info += [ - helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape) - ] - - add_1_to_move = helper.make_node("Add", ["top_in", "p0"], ["fork1"]) - mul_1_to_move = helper.make_node("Mul", ["t5", "p4"], ["fork2"]) - add_2_to_move = helper.make_node("Add", ["fork2", "p5"], ["t6"]) - mul_1_not_to_move = helper.make_node("Mul", ["t8", "p7"], ["fork3"]) - modelproto = helper.make_model( - helper.make_graph( - name="test", - inputs=[top_in], - outputs=[top_out], - value_info=value_info, - nodes=[ - # fork1 - add_1_to_move, - helper.make_node("Mul", ["fork1", "p1"], ["t2"]), - helper.make_node("Mul", ["fork1", "p2"], ["t3"]), - helper.make_node("Add", ["t2", "t3"], ["t4"]), - helper.make_node("Add", ["t4", "p3"], ["t5"]), - # fork2 - mul_1_to_move, - add_2_to_move, - helper.make_node("Add", ["fork2", "p6"], ["t7"]), - helper.make_node("Add", ["t6", "t7"], ["t8"]), - # empty branches: do nothing - mul_1_not_to_move, - helper.make_node("Add", ["fork3", "fork3"], ["top_out"]), - ], - ) - ) - model = ModelWrapper(modelproto) + add0_out = Add(in0, add0_param) + mul0_out = Mul(add0_out, mul_shared_param) + mul1_out = Mul(add0_out, mul_shared_param) + add1_out = Add(mul0_out, mul1_out) + add2_out = Add(add1_out, add2_param) + mul2_out = Mul(add2_out, mul2_param) + add3_out = Add(mul2_out, add3_param) + add4_out = Add(mul2_out, add4_param) + add5_out = Add(add3_out, add4_out) + mul3_out = 
Mul(add5_out, mul3_param)
+        out0 = Add(mul3_out, add6_param)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
     model = model.transform(InferShapes())
     np.random.seed(0)
-    for i in range(num_of_params):
-        model.set_initializer(
-            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
-        )
-
+    for tensor_name in model.get_all_tensor_names():
+        if tensor_name.endswith("_param"):
+            pshape = model.get_tensor_shape(tensor_name)
+            model.set_initializer(
+                tensor_name, np.random.rand(*pshape).astype(np.float32)
+            )
+    model = model.transform(GiveUniqueNodeNames())
     # Transform
     new_model = model.transform(MoveLinearPastFork())
-    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
-
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    inp_dict = {"in0": np.random.rand(*shp).astype(np.float32)}
     # Test
     assert oxe.compare_execution(model, new_model, inp_dict)
-    assert not new_model.is_fork_node(add_1_to_move)
-    assert not new_model.is_fork_node(mul_1_to_move)
-    assert not new_model.is_fork_node(add_2_to_move)
-    assert new_model.is_fork_node(mul_1_not_to_move)
+    nodes = new_model.graph.node
+    assert len(new_model.get_nodes_by_op_type("Add")) == 9
+    assert len(new_model.get_nodes_by_op_type("Mul")) == 5
+    assert not new_model.is_fork_node(get_by_name(nodes, "Add_0"))
+    assert new_model.is_join_node(get_by_name(nodes, "Add_2"))
+    assert not new_model.is_fork_node(get_by_name(nodes, "Mul_2"))
+    assert not new_model.is_join_node(get_by_name(nodes, "Add_5"))
     assert len(new_model.graph.node) == 14
diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py b/tests/transformation/streamline/test_scale_resize_nhwc.py
new file mode 100644
index 0000000000000000000000000000000000000000..f10930f4e7d5aeb98a60630e7e4f48adfc371d59
--- /dev/null
+++ b/tests/transformation/streamline/test_scale_resize_nhwc.py
@@ -0,0 +1,293 @@
+import pytest
+
+import numpy as np
+import onnx
+import onnx.helper as oh
+import qonnx.core.data_layout as DataLayout
+from onnx import TensorProto
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.streamline.reorder import MakeScaleResizeNHWC
+
+
+def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp_up = oh.make_tensor_value_info(
+        "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["inp", "roi", "scales"],
+        outputs=["outp_up"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    transpose_node = onnx.helper.make_node(
+        "Transpose",
+        inputs=["outp_up"],
+        outputs=["outp"],
+        name="Transpose1",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[resize_node, transpose_node],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_up, param, roi],
+    )
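For intuition on the transformation exercised by this new test file: MakeScaleResizeNHWC reindexes the four-element Resize/Upsample scales input from NCHW to NHWC order with the index tuple (0, 2, 3, 1). A numpy-only sketch with illustrative values:

    import numpy as np

    scales_nchw = np.array([1.0, 1.0, 2.0, 3.0], dtype=np.float32)  # (N, C, H, W)
    scales_nhwc = scales_nchw[[0, 2, 3, 1]]                         # (N, H, W, C)
    assert (scales_nhwc == np.array([1.0, 2.0, 3.0, 1.0], dtype=np.float32)).all()

The same reindexing appears twice in the transformation itself, once for a producer-side Transpose and once for a consumer-side one.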
+
+    model = oh.make_model(graph, producer_name="resize_model1")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+
+    model.set_tensor_layout("inp", DataLayout.NCHW)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt):
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp_tr = oh.make_tensor_value_info(
+        "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    transpose_node = onnx.helper.make_node(
+        "Transpose",
+        inputs=["inp"],
+        outputs=["outp_tr"],
+        name="Transpose1",
+        perm=[0, 3, 1, 2],
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["outp_tr", "roi", "scales"],
+        outputs=["outp"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    graph = oh.make_graph(
+        nodes=[transpose_node, resize_node],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_tr, param, roi],
+    )
+
+    model = oh.make_model(graph, producer_name="resize_model2")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+    model.set_tensor_layout("inp", DataLayout.NHWC)
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp_tr = oh.make_tensor_value_info(
+        "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    outp_up = oh.make_tensor_value_info(
+        "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    transpose_node1 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["inp"],
+        outputs=["outp_tr"],
+        name="Transpose1",
+        perm=[0, 3, 1, 2],
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["outp_tr", "roi", "scales"],
+        outputs=["outp_up"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    transpose_node2 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["outp_up"],
+        outputs=["outp"],
+        name="Transpose2",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[transpose_node1, resize_node, transpose_node2],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_up, outp_tr, param, roi],
+    )
+
+    model = oh.make_model(graph, producer_name="resize_model3")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+    model.set_tensor_layout("inp", DataLayout.NHWC)
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return 
model + + +def check_transform(model): + graph = model.graph + node_ind = 0 + for n in graph.node: + node_ind += 1 + if n.op_type == "Upsample" or n.op_type == "Resize": + if model.get_tensor_layout(n.output[0]) == DataLayout.NHWC: + return True + return False + + +@pytest.mark.streamline +# input dimension +@pytest.mark.parametrize("ifm_dim", [[2**i, 2**i] for i in range(3, 6)]) +# input channels +@pytest.mark.parametrize("ifm_ch", [3]) +# scales +@pytest.mark.parametrize( + "scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)] +) +# mode +@pytest.mark.parametrize("mode", ["nearest"]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt): + # create models + resize_model1 = create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt) + resize_model2 = create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt) + resize_model3 = create_transpose_resize_transpose( + ifm_dim, ifm_ch, scales, mode, idt + ) + + # set initializers + resize_model1.set_initializer("scales", np.array(scales, dtype=np.float32)) + resize_model2.set_initializer("scales", np.array(scales, dtype=np.float32)) + resize_model3.set_initializer("scales", np.array(scales, dtype=np.float32)) + + # generate input tensor for testing + input_tensor_nchw = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) + input_tensor_nhwc = gen_finn_dt_tensor(idt, [1, ifm_dim[0], ifm_dim[1], ifm_ch]) + input_dict_nchw = {"inp": input_tensor_nchw} + input_dict_nhwc = {"inp": input_tensor_nhwc} + + # execute first model + output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw) + expected1 = output_dict1["outp"] + + # transform Resize into ResizeNHWC + resize_model1 = resize_model1.transform(MakeScaleResizeNHWC()) + resize_model1 = resize_model1.transform(InferDataLayouts()) + + # execute transformed model + output_node_name1 = resize_model1.graph.output[0].name + output_dict1 = oxe.execute_onnx( + resize_model1, input_dict_nchw, return_full_exec_context=False + ) + output1 = output_dict1[output_node_name1] + + # compare outputs + assert (expected1 == output1).all() + assert check_transform(resize_model1) + + # execute second model + output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc) + expected2 = output_dict2["outp"] + + # transform Resize into ResizeNHWC + resize_model2 = resize_model2.transform(MakeScaleResizeNHWC()) + resize_model2 = resize_model2.transform(InferDataLayouts()) + + # execute transformed model + output_node_name2 = resize_model2.graph.output[0].name + output_dict2 = oxe.execute_onnx( + resize_model2, input_dict_nhwc, return_full_exec_context=False + ) + output2 = output_dict2[output_node_name2] + + # compare outputs + assert (expected2 == output2).all() + assert check_transform(resize_model2) + + # execute third model + output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc) + expected3 = output_dict3["outp"] + + # transform Resize into ResizeNHWC + resize_model3 = resize_model3.transform(MakeScaleResizeNHWC()) + resize_model3 = resize_model3.transform(InferDataLayouts()) + + # execute transformed model + output_node_name3 = resize_model3.graph.output[0].name + output_dict3 = oxe.execute_onnx( + resize_model3, input_dict_nhwc, return_full_exec_context=False + ) + output3 = output_dict3[output_node_name3] + + # compare outputs + assert (expected3 == output3).all() + assert check_transform(resize_model3)
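For reference, the depthwise output reordering asserted at the end of test_fpgadataflow_slidingwindow_rtl can be reproduced standalone. A sketch with toy sizes (not the test's parametrization):

    import numpy as np

    # toy configuration: 2x2 kernel, 4 channels, SIMD=2, 3x3 output
    k_h = k_w = 2
    ifm_ch, simd = 4, 2
    ofm_dim_h = ofm_dim_w = 3

    # im2col reference output: kernel-position major, channels innermost
    y = np.arange(ofm_dim_h * ofm_dim_w * k_h * k_w * ifm_ch).reshape(
        1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch
    )

    # regroup into SIMD-wide channel groups per kernel position, matching the
    # stream order the depthwise RTL SWG produces
    y_dw = y.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd)
    y_dw = y_dw.transpose(0, 1, 2, 4, 3, 5)
    y_dw = y_dw.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)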