// ==============================================================
// RTL generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and OpenCL
// Version: 2020.1
// Copyright (C) 1986-2020 Xilinx, Inc. All Rights Reserved.
//
// ===========================================================

// NOTE: this file is a template. Every $NAME$ token is a placeholder that
// must be substituted by the code generator before the file is valid Verilog.

`timescale 1 ns / 1 ps

// Shift-register based window buffer.
// While shift_enable is high, data_in is captured into reg_input on every
// rising CLK edge and the generated shift logic advances the FIFOs.
// data_out is driven combinationally by the generated output mapping.
module window_buffer
#(
    parameter IN_WIDTH = 1, //c*bit-width
    parameter OUT_WIDTH = 1, //c*bit-width*MMV_out
    parameter BUFFER_ELEM_TOTAL = 1
)
(
    CLK,
    data_in,
    shift_enable,
    data_out
);

input CLK;
input [IN_WIDTH-1:0] data_in;
input shift_enable;
output [OUT_WIDTH-1:0] data_out;

//Input REG to enable simultaneous R/W
reg [IN_WIDTH-1:0] reg_input;

//REG FIFOs
$GENERATE_REG_FIFOS$

//BRAM FIFOs
//todo: generate real BRAM shift buffers if these get too large
$GENERATE_BRAM_FIFOS$

//Fixed REG FIFO <-> output mapping
$GENERATE_OUTPUT_MAPPING$

//main process
integer i;
always @ (posedge CLK) begin
    if (shift_enable) begin
        //shift logic
        $GENERATE_SHIFT_LOGIC$

        //shift in new data
        reg_input <= data_in;
    end
end

endmodule //window_buffer

// Top-level sliding window generator.
// Wraps one window_buffer instance and sequences it with a small FSM and a
// per-image cycle counter. read_state / write_state are declared here and are
// expected to be driven by the generated schedule sections below.
module $TOP_MODULE_NAME$ (
    ap_clk,
    ap_rst_n,
    in0_V_V_TDATA,
    in0_V_V_TVALID,
    in0_V_V_TREADY,
    out_V_V_TDATA,
    out_V_V_TVALID,
    out_V_V_TREADY
);

//parameters
parameter BIT_WIDTH = $BIT_WIDTH$;
parameter SIMD = $SIMD$; //assuming SIMD=C for now
parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1 for now
parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K for now
parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //c*bit-width
parameter BUF_OUT_WIDTH = BUF_IN_WIDTH * MMV_OUT; //c*bit-width*MMV_out

parameter CYCLES_TOTAL = $CYCLES_TOTAL$;
parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$;

//IO ports
input ap_clk;
input ap_rst_n; //active-low synchronous reset
input [BUF_IN_WIDTH-1:0] in0_V_V_TDATA;
input in0_V_V_TVALID;
output in0_V_V_TREADY;
output [BUF_OUT_WIDTH-1:0] out_V_V_TDATA;
output out_V_V_TVALID;
input out_V_V_TREADY;

//main buffer instantiation
wire [BUF_IN_WIDTH-1:0] window_buffer_in;
wire [BUF_OUT_WIDTH-1:0] window_buffer_out;
wire window_buffer_shift_enable;
window_buffer
#(
    .IN_WIDTH(BUF_IN_WIDTH),
    .OUT_WIDTH(BUF_OUT_WIDTH),
    .BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL)
)
window_buffer_inst
(
    .CLK(ap_clk),
    .data_in(window_buffer_in),
    .shift_enable(window_buffer_shift_enable),
    .data_out(window_buffer_out)
);

//FSM state
reg [1:0] state;
//S2 is currently not used by either process below
parameter STATE_RESET = 0, STATE_OPERATE = 1, S2 = 2;

//main cycle counter (where either read/write/both happen, resets for each image)
integer cycle;

//read/write loop state
wire read_state;
wire write_state;

//output registers
reg out_V_V_TVALID_reg;

//assign buffer control
//todo: if mmv_out < k: might not shift and/or write for multiple read_state cycles
assign window_buffer_shift_enable = (read_state && in0_V_V_TVALID) || write_state;

//assign I/O ports
assign window_buffer_in = in0_V_V_TDATA;
assign in0_V_V_TREADY = read_state; //accept data whenever read loop wants to read
assign out_V_V_TDATA = window_buffer_out; //out_V_V_TDATA_reg;
assign out_V_V_TVALID = out_V_V_TVALID_reg;

//read schedule
//todo: generate differently
$GENERATE_READ_SCHEDULE$

//write schedule
//todo: generate differently
$GENERATE_WRITE_SCHEDULE$

//read process (writing to buffer)
always @ (posedge ap_clk) begin
    if (ap_rst_n == 1'b0) begin
        state <= STATE_RESET;
    end else begin
        case (state)
            STATE_RESET: begin
                state <= STATE_OPERATE;
                cycle <= 0;
            end
            STATE_OPERATE: begin
                if (read_state && in0_V_V_TVALID) begin
                    //read into buffer
                    //done in concurrent assignment
                    //count cycle (R)
                    cycle <= cycle+1;
                    if (cycle == CYCLES_TOTAL-1)
                        state <= STATE_RESET;
                end else if (write_state && out_V_V_TREADY) begin
                    cycle <= cycle+1; //count cycle (or W)
                    if (cycle == CYCLES_TOTAL-1)
                        state <= STATE_RESET;
                end
            end
            //unused encodings: recover by restarting the image
            default: begin
                state <= STATE_RESET;
            end
        endcase
    end
end

//write process (reading from buffer)
always @ (posedge ap_clk) begin
    if (ap_rst_n == 1'b0) begin
        //give VALID a defined value after reset instead of leaving it at X
        out_V_V_TVALID_reg <= 1'b0;
    end else begin
        case (state)
            STATE_RESET: begin
                //Clear VALID during the one-cycle restart between images.
                //Previously the register simply held its value here, which
                //kept VALID asserted one cycle longer than the last write.
                out_V_V_TVALID_reg <= 1'b0;
            end
            STATE_OPERATE: begin
                if (write_state && out_V_V_TREADY) begin
                    //write from buffer
                    out_V_V_TVALID_reg <= 1'b1;
                end else begin
                    out_V_V_TVALID_reg <= 1'b0;
                end
            end
            default: begin
                out_V_V_TVALID_reg <= 1'b0;
            end
        endcase
    end
end

endmodule //$TOP_MODULE_NAME$
custom_op["StreamingFIFO"] = StreamingFIFO diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py new file mode 100755 index 0000000000000000000000000000000000000000..9908bbb30d2dd6669ecdc44d3568dcadc3ac17ad --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -0,0 +1,1016 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import math +import numpy as np +import os + +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.general.im2col import compute_conv_output_dim +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.general import im2col + +from finn.util.basic import ( + get_rtlsim_trace_depth, + make_build_dir, +) + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +# This operation should only be used for 1D convolutions. Either the +# IFMDim_H or IFMDim_W should be '1', which represents the so-called +# dummy-dimension + +# ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D: +# input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels) +# output 0 is the output tensor, shape NHWC: +# = (1, OFMDim_H, OFMDim_W, (ConvKernelDim_H*ConvKernelDim_W)*IFMChannels) + +# note: the actual data layout produced by the hlslib kernels is different +# for depthwise and non-depthwise ops. +# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) +# see test_fpgadataflow_slidingwindow.py for an example of how to transform +# between the two layouts + + +class ConvolutionInputGenerator_rtl(HLSCustomOp): + """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. Depending on the combination of + attributes (e.g. 
def get_folded_input_shape(self):
    """Return the SIMD-folded input shape.

    The result is ``(1, IFMDim_H, IFMDim_W, IFMChannels // SIMD, SIMD)``.

    Raises:
        AssertionError: if ``SIMD`` does not evenly divide ``IFMChannels``.
    """
    ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
    ifm_ch = self.get_nodeattr("IFMChannels")
    simd = self.get_nodeattr("SIMD")
    assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
    # Integer floor division keeps the fold factor exact and matches how
    # get_folded_output_shape computes it (no round trip through float).
    wf = ifm_ch // simd
    folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
    return folded_ishape
def get_folded_output_shape(self):
    """Return the folded output shape of the sliding window generator.

    Output spatial dimensions are computed with zero padding. When the
    parallel-window variant is selected, the whole kernel window is packed
    into the innermost dimension (``k_h * k_w * SIMD``); otherwise the
    innermost dimension is ``SIMD`` and the window is spread over the fold
    dimension.

    Raises:
        AssertionError: if ``SIMD`` does not evenly divide ``IFMChannels``.
    """
    kernel_h, kernel_w = self.get_nodeattr("ConvKernelDim")
    in_h, in_w = self.get_nodeattr("IFMDim")
    channels = self.get_nodeattr("IFMChannels")
    step_h, step_w = self.get_nodeattr("Stride")
    dil_h, dil_w = self.get_nodeattr("Dilation")
    simd = self.get_nodeattr("SIMD")
    no_pad = 0
    out_h = compute_conv_output_dim(in_h, kernel_h, step_h, no_pad, dil_h)
    out_w = compute_conv_output_dim(in_w, kernel_w, step_w, no_pad, dil_w)
    assert channels % simd == 0, "SIMD must divide IFMChannels"
    if self.use_parallel_window_output():
        # one output word carries all window pixels
        fold = int((channels) // simd)
        return (1, out_h, out_w, fold, kernel_h * kernel_w * simd)
    # one output word carries SIMD channels of a single window pixel
    fold = int((kernel_h * kernel_w * channels) // simd)
    return (1, out_h, out_w, fold, simd)
def get_outstream_width(self):
    """Return the width of the output stream in bits.

    Without the parallel-window variant the output stream is as wide as the
    input stream. With it, all ``k_h * k_w`` window pixels are emitted in
    parallel, so the width is the input width times the window size.
    """
    if not self.use_parallel_window_output():
        # serial variant: same width for output and input stream
        return self.get_instream_width()
    kernel_h, kernel_w = self.get_nodeattr("ConvKernelDim")
    return self.get_instream_width() * kernel_h * kernel_w
def get_exp_cycles(self):
    """Return the expected number of cycles for one input image.

    Uses the normalized 1D attributes from
    ``get_1d_conv_attrs_normalized()``. For the parallel-window variant the
    estimate is ``ifm_dim_w + 1``; otherwise it follows the read/write block
    model referenced below. The result is truncated to ``int``.
    """
    simd = self.get_nodeattr("SIMD")
    (
        ifm_ch,
        ifm_dim,
        ofm_dim,
        k,
        stride,
        dilation,
    ) = self.get_1d_conv_attrs_normalized()
    # Only the components that enter the formulas are kept.
    ifm_dim_w = ifm_dim[1]
    ofm_dim_h, ofm_dim_w = ofm_dim
    k_h, k_w = k
    stride_w = stride[1]
    dilation_h = dilation[0]

    # since mmv != 1 is not supported yet, we set mmv for now to 1
    mmv = 1
    # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
    if self.use_parallel_window_output():
        exp_cycles = ifm_dim_w + 1
    else:
        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
        max_cycles = max(cycles_write_block, cycles_read_block)
        exp_cycles = (
            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
        )

    return int(exp_cycles)
def lut_estimation(self):
    """Return a rough LUT count estimate for this node.

    NOTE: not tested for correctness.

    A fixed base of 300 LUTs is always reported. Only when ``ram_style`` is
    ``"distributed"`` is a buffer term added on top of it.
    """
    simd = self.get_nodeattr("SIMD")
    channels = self.get_nodeattr("IFMChannels")
    pixels = np.prod(self.get_nodeattr("IFMDim"))
    kernel_size = np.prod(self.get_nodeattr("ConvKernelDim"))
    stride_size = np.prod(self.get_nodeattr("Stride"))
    base_luts = 300
    if self.get_nodeattr("ram_style") != "distributed":
        return base_luts + 0
    word_bits = simd * self.get_input_datatype().bitwidth()
    # presumably 64 is the depth of one LUTRAM primitive -- TODO confirm
    depth_blocks = math.ceil(pixels * channels / simd / 64)
    buffer_luts = int((kernel_size + stride_size) * (word_bits * depth_blocks))
    return base_luts + buffer_luts
self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + folded_oshape = self.get_folded_output_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim \ + did not produce expected ofolded utput shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = 
def defines(self, var):
    """Fill ``code_gen_dict["$DEFINES$"]`` with the preprocessor defines.

    The values come from the normalized 1D attributes returned by
    ``get_1d_conv_attrs_normalized()``. ``Dilation1_x`` / ``Dilation1_y`` are
    only emitted when the dilation along x is greater than 1.

    Args:
        var: accepted for interface compatibility; not used by this method.

    Raises:
        AssertionError: if dilation along x is greater than 1 while dilation
            along y is not 1.
    """
    numReps = 1
    (
        ifm_ch,
        ifm_dim,
        ofm_dim,
        k,
        stride,
        dilation,
    ) = self.get_1d_conv_attrs_normalized()
    simd = self.get_nodeattr("SIMD")
    ifm_precision = self.get_input_datatype().bitwidth()
    ifm_dim_y, ifm_dim_x = ifm_dim
    ofm_dim_y, ofm_dim_x = ofm_dim
    k_y, k_x = k
    dilation_y, dilation_x = dilation
    # For a 1d convolution with stride=[S,1] or [1,S], the finn-hlslib function
    # of ConvInpGen must be created with [stride_y, stride_x] = [S, S].
    # TODO: changes in finn-hlslib (slidingwindow.h)
    stride_y = np.prod(stride)
    stride_x = np.prod(stride)

    # (macro name, value) pairs in emission order
    macros = [
        ("ConvKernelDim1_x", k_x),
        ("ConvKernelDim1_y", k_y),
        ("IFMChannels1", ifm_ch),
        ("Input_precision1", ifm_precision),
        ("IFMDim1_x", ifm_dim_x),
        ("IFMDim1_y", ifm_dim_y),
        ("OFMDim1_x", ofm_dim_x),
        ("OFMDim1_y", ofm_dim_y),
        ("SIMD1", simd),
        ("Stride1_x", stride_x),
        ("Stride1_y", stride_y),
    ]
    if dilation_x > 1:
        assert (
            dilation_y == 1
        ), "Dilation value greater than 1 along y-axis is not yet supported"
        macros.append(("Dilation1_x", dilation_x))
        macros.append(("Dilation1_y", dilation_y))
    macros.append(("numReps", numReps))

    # A single string element, one "#define" per line.
    self.code_gen_dict["$DEFINES$"] = [
        "\n".join("#define {} {}".format(name, value) for name, value in macros)
    ]
def strm_decl(self):
    """Fill ``code_gen_dict["$STREAMDECLARATIONS$"]`` with the stream declarations.

    Two entries are stored, in this order: the input stream ``in0`` sized by
    ``get_instream_width()`` and the output stream ``out`` sized by
    ``get_outstream_width()``.
    """
    # Register the (initially empty) list first, then fill it in place.
    decls = self.code_gen_dict["$STREAMDECLARATIONS$"] = []
    in_bits = self.get_instream_width()
    decls.append('hls::stream<ap_uint<{}>> in0 ("in0");'.format(in_bits))
    out_bits = self.get_outstream_width()
    decls.append('hls::stream<ap_uint<{}>> out ("out");'.format(out_bits))
def dataoutstrm(self):
    """Fill ``code_gen_dict["$DATAOUTSTREAM$"]`` with the output dump call.

    The generated ``apintstream2npy`` call writes the ``out`` stream to
    ``output.npy`` inside the cppsim code generation directory, using the
    folded output shape. BIPOLAR outputs are stored as BINARY.
    """
    sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
    out_dtype = self.get_output_datatype()
    if out_dtype == DataType["BIPOLAR"]:
        # use binary for bipolar storage
        out_dtype = DataType["BINARY"]
    bits_per_elem = out_dtype.bitwidth()
    stream_bits = self.get_outstream_width()
    stream_hls_type = "ap_uint<%d>" % stream_bits
    elem_hls_type = out_dtype.get_hls_datatype_str()
    npy_elem_type = "float"
    out_path = "%s/output.npy" % sim_dir
    folded_shape = self.get_folded_output_shape()
    # render the Python tuple as a C++ brace initializer
    shape_literal = str(folded_shape).replace("(", "{").replace(")", "}")

    # Number of pixels packed into one output word: apintstream2npy needs it
    # to unpack the output and reverse only the inner SIMD dimension.
    pixels_per_word = 1
    if self.use_parallel_window_output():
        kernel_h, kernel_w = self.get_nodeattr("ConvKernelDim")
        pixels_per_word = kernel_h * kernel_w

    call_args = (
        stream_hls_type,
        elem_hls_type,
        bits_per_elem,
        npy_elem_type,
        shape_literal,
        out_path,
        pixels_per_word,
    )
    self.code_gen_dict["$DATAOUTSTREAM$"] = [
        'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", true, 1, %d);' % call_args
    ]
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def generate_hdl(self): + #todo: generate into some code gen dict + f_debug = open(os.path.join("/workspace/finn/finn-rtllib/swg/", "swg_hdl_debuginfo.log"), "w") + code_gen_dict = {} + + #-------------------- + # init hyperparameters + # for 1D case: it does not matter if dummy dim is x or y + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + n = 1 + h, w = ifm_dim + c = 1#ifm_ch not considered atm (always parallelize across c) + k_h, k_w = k + pad = [0,0,0,0] + pad_val = 0 + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + conv_c = 99 + + # init folding config + simd = self.get_nodeattr("SIMD") + mmv_in = 1 + mmv_out = k_h*k_w + + assert simd==ifm_ch, "Constraint violated: SIMD = C" + assert mmv_in==1, "Constraint violated: MMV_IN = 1" + assert mmv_out==k_h*k_w, "Constraint violated: mmv_out = K" + + # how many "unused" registers are allowed between buffer positions that will be accessed in parallel + # example: + # 0: only consecutive access patterns will be implemented in regs, rest in BRAM line buffers + # 2: [0, 3, 6] access pattern is still allowed and will be implemented with 1 7-position shift reg + REG_BRAM_THRESHOLD = 1 + #-------------------- + + in_shape = (n,c,h,w) #NCHW + + in_image = np.empty(in_shape, dtype=int) + + for index, x in np.ndenumerate(in_image): + # "HWC" dummy values + val = int((index[2]+1)*100+(index[3]+1)*10+(index[1]+1)*1) + in_image[index] = val + + in_image_padded = np.pad( + in_image, + ((0, 0), (0, 0), (pad[0], pad[2]), (pad[1], pad[3])), + mode="constant", + constant_values=pad_val, + ) + in_shape_padded = in_image_padded.shape + h_padded 
= in_shape_padded[2] + w_padded = in_shape_padded[3] + + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + + f_debug.write("\n"+"in shape " + str(in_shape)) + f_debug.write("\n"+"in shape padded " + str(in_shape_padded)) + f_debug.write("\n"+"conv out shape " + str((n,conv_c,out_dim_h,out_dim_w))) + f_debug.write("\n"+"im2col out shape " + str((n,out_dim_h,out_dim_w,k_h*k_w*c))) + + idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw( + in_shape, + k_h, + k_w, + pad, + stride_h, + stride_w, + dilation_h, + dilation_w + ) + + f_debug.write("\n"+"c indices") + f_debug.write("\n"+str(idx_c)) + f_debug.write("\n"+"h indices") + f_debug.write("\n"+str(idx_h)) + f_debug.write("\n"+"w indices") + f_debug.write("\n"+str(idx_w)) + + cols = in_image_padded[:, idx_c, idx_h, idx_w] + cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1) + + f_debug.write("\n"+"cols (shape %s)" % str(cols.shape)) + f_debug.write("\n"+str(cols)) + + # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW + out_image = cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w) + # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1) + out_image = out_image.transpose(0, 4, 5, 2, 3, 1) + out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c) + + f_debug.write("\n"+"output (shape %s)" % str(out_image.shape)) + f_debug.write("\n"+str(out_image)) + + f_debug.write("\n"+"h indices") + f_debug.write("\n"+str(idx_h)) + f_debug.write("\n"+"w indices") + f_debug.write("\n"+str(idx_w)) + + idx_px = idx_h*w+idx_w + f_debug.write("\n"+"sequential pixel indices") + f_debug.write("\n"+str(idx_px)) + + buffer = [] + buffer_max_size = 0 + # buffer schedule (write from input, read to output) + schedule_write = [] + schedule_read = [] + next_in_px = 0 + + idx_px_relative = idx_px.copy() + + # compute schedule and buffer 
read pattern + Y, X = idx_px_relative.shape + for x in range(X): + # load missing inputs into buffer + for y in range(Y): + while int(idx_px_relative[y,x]) not in buffer: + buffer.append(next_in_px) + next_in_px += 1 + schedule_write.append(1) + schedule_read.append(0) + + # discard unused buffer elements (assumes in-order access) + oldest_px = min(idx_px_relative[:,x]) + while buffer[0] < oldest_px: + buffer.pop(0) + + # adjust relative buffer index + for y in range(Y): + idx_px_relative[y,x] -= oldest_px + + # record max needed buffer depth + if len(buffer) > buffer_max_size: + buffer_max_size = len(buffer) + + # read from buffer + schedule_read.append(1) + + # simultaneously load next pixel(s) into buffer if there are any left + if next_in_px > (h_padded*w_padded-1): + schedule_write.append(0) + else: + buffer.append(next_in_px) + next_in_px += 1 + schedule_write.append(1) + + + # find buffer access patterns + buffer_access_patterns = [] + for x in range(X): + if idx_px_relative[:,x].tolist() not in buffer_access_patterns: + buffer_access_patterns.append(idx_px_relative[:,x].tolist()) + + + f_debug.write("\n"+"max buffer size observed: %d" %(buffer_max_size)) + f_debug.write("\n"+"output vector elements: relative buffer indices") + f_debug.write("\n"+str(idx_px_relative)) + f_debug.write("\n"+"found %d buffer access patterns:" % len(buffer_access_patterns)) + f_debug.write("\n"+str(buffer_access_patterns)) + f_debug.write("\n"+"required parallel-access registers for mmv_out=k: %d" % len(sum(buffer_access_patterns,[]))) + f_debug.write("\n"+"buffer write schedule (%d cycles)" % len(schedule_write)) + f_debug.write("\n"+str(schedule_write)) + f_debug.write("\n"+"writing buffer in %d cycles" % schedule_write.count(1)) + f_debug.write("\n"+"buffer read schedule (%d cycles)" % len(schedule_read)) + f_debug.write("\n"+str(schedule_read)) + f_debug.write("\n"+"reading buffer in %d cycles" % schedule_read.count(1)) + + assert len(schedule_write) == len(schedule_read), 
"ERROR: Schedules have different lenghts" + cycles_total = len(schedule_write) + + assert schedule_read.count(1) == self.get_number_output_values(), "ERROR: Reading buffer in fewer cycles than expected" + + code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] + code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)] + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)] + + # determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers) + assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static" + buf_static_access_pattern = buffer_access_patterns[0] + reg_fifos = [] + bram_fifos = [] + current = [] + for i in range(len(buf_static_access_pattern)): + access_idx = buf_static_access_pattern[i] + if len(current) == 0: + current.append(access_idx) + else: + # assume non-decreasing index order in access pattern + distance = access_idx - max(current) + if not (distance-1 > REG_BRAM_THRESHOLD): + for i in range(distance-1): + # insert dummy into REG FIFO (not read as part of window) + current.append(-1) + # assign this access to same REG FIFO as previous one + current.append(access_idx) + else: + # assign skipped accesses to new BRAM FIFO + bram_fifos.append([-1]*(distance-1)) + # start with new REG FIFO + reg_fifos.append(current) + current = [] + current.append(access_idx) + reg_fifos.append(current) + + f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD) + f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos)) + f_debug.write("\n"+str(reg_fifos)) + f_debug.write("\n"+"%d BRAM FIFOs (line buffers):" % len(bram_fifos)) + f_debug.write("\n"+str(bram_fifos)) + + code_gen_dict["$GENERATE_REG_FIFOS$"] = [] + for i in 
range(len(reg_fifos)): + code_gen_dict["$GENERATE_REG_FIFOS$"].append( + """parameter reg_fifo_{id}_len = {len}; + reg [IN_WIDTH-1:0] reg_fifo_{id} [reg_fifo_{id}_len-1:0]; + """.format(id=i, len=len(reg_fifos[i]))) + + #todo: generate actual bram shift buffers instead of regs + code_gen_dict["$GENERATE_BRAM_FIFOS$"] = [] + for i in range(len(bram_fifos)): + code_gen_dict["$GENERATE_BRAM_FIFOS$"].append( + """parameter bram_fifo_{id}_len = {len}; + reg [IN_WIDTH-1:0] bram_fifo_{id} [bram_fifo_{id}_len-1:0]; + """.format(id=i, len=len(bram_fifos[i]))) + + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = [] + out_idx = mmv_out-1 + for fifo_id, reg_fifo in enumerate(reg_fifos): + for fifo_idx, access_idx in enumerate(reg_fifo): + if(access_idx != -1): + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( + "assign data_out[IN_WIDTH*{out_idx}+:IN_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format( + out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx + ) + ) + # reversal: out_idx=0 -> oldest buffer element -> highest access_idx + out_idx = out_idx-1 + assert out_idx==-1, "ERROR: Not all output vector elements connected" + + code_gen_dict["$GENERATE_SHIFT_LOGIC$"] = [] + for i in range(len(reg_fifos)): + if i == 0: + # first FIFO containing newest elements -> input comes from input reg + code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append( + """for (i=reg_fifo_{fifo_id}_len-1; i>0; i=i-1) + reg_fifo_{fifo_id}[i] <= reg_fifo_{fifo_id}[i-1]; + reg_fifo_{fifo_id}[0] <= reg_input;""".format( + fifo_id=i, + ) + ) + else: + # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer) + input_fifo_id = i-1 + code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append( + """for (i=reg_fifo_{fifo_id}_len-1; i>0; i=i-1) + reg_fifo_{fifo_id}[i] <= reg_fifo_{fifo_id}[i-1]; + reg_fifo_{fifo_id}[0] <= bram_fifo_{input_fifo_id} [bram_fifo_{input_fifo_id}_len-1];""".format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + ) + for i in range(len(bram_fifos)): + 
input_fifo_id = i + code_gen_dict["$GENERATE_SHIFT_LOGIC$"].append( + """for (i=bram_fifo_{fifo_id}_len-1; i>0; i=i-1) + bram_fifo_{fifo_id}[i] <= bram_fifo_{fifo_id}[i-1]; + bram_fifo_{fifo_id}[0] <= reg_fifo_{input_fifo_id} [reg_fifo_{input_fifo_id}_len-1];""".format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + ) + + # Generate read schedule (when data is read from input, written to buffer) + code_gen_dict["$GENERATE_READ_SCHEDULE$"] = [] + schedule_as_string = "" + #todo: change naming to swap write/read + for i in schedule_write: + if i == 1: + schedule_as_string += "1'b1," + else: + schedule_as_string += "1'b0," + schedule_as_string = schedule_as_string[:-1] # remove trailing ',' + code_gen_dict["$GENERATE_READ_SCHEDULE$"].append( + "localparam [0:{len}-1] READ_SCHEDULE = {{{str}}};".format(len=cycles_total, str=schedule_as_string) + ) + code_gen_dict["$GENERATE_READ_SCHEDULE$"].append( + "assign read_state = READ_SCHEDULE[cycle];" + ) + + # Generate write schedule (when data is written to output, read from buffer) + code_gen_dict["$GENERATE_WRITE_SCHEDULE$"] = [] + schedule_as_string = "" + #todo: change naming to swap write/read + for i in schedule_read: + if i == 1: + schedule_as_string += "1'b1," + else: + schedule_as_string += "1'b0," + schedule_as_string = schedule_as_string[:-1] # remove trailing ',' + code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append( + "localparam [0:{len}-1] WRITE_SCHEDULE = {{{str}}};".format(len=cycles_total, str=schedule_as_string) + ) + code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append( + "assign write_state = WRITE_SCHEDULE[cycle];" + ) + + with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template.v", "r") as f: + template = f.read() + + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template = template.replace(key, code_gen_line) + f = open(os.path.join("/workspace/finn/finn-rtllib/swg/", "swg_hdl_generated.v"), "w") + f.write(template) + 
def prepare_rtlsim(self):
    """Build the Verilator emulation library for this node's generated RTL.

    Generates the Verilog for this node via ``generate_hdl()`` (instead of
    relying on HLS output products), compiles it with PyVerilator, stores
    the filename of the resulting library in the ``rtlsim_so`` node
    attribute and returns the PyVerilator wrapper around it.

    Returns:
        The PyVerilator simulation object created by ``PyVerilator.build``.

    Raises:
        ImportError: if PyVerilator is not available.
    """
    # Fail fast: verify the simulator dependency *before* generate_hdl()
    # writes any files, so a missing PyVerilator has no side effects.
    if PyVerilator is None:
        raise ImportError("Installation of PyVerilator is required.")

    self.generate_hdl()

    # generate_hdl() writes swg_hdl_generated.v into this directory
    verilog_paths = ["/workspace/finn/finn-rtllib/swg/"]
    verilog_files = ["swg_hdl_generated.v"]

    # build the Verilator emu library
    sim = PyVerilator.build(
        verilog_files,
        build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
        verilog_path=verilog_paths,
        trace_depth=get_rtlsim_trace_depth(),
        top_module_name=self.get_verilog_top_module_name(),
    )

    # save generated lib filename in attribute
    self.set_nodeattr("rtlsim_so", sim.lib._name)
    return sim
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import pytest

import numpy as np
from onnx import TensorProto, helper

import finn.core.onnx_exec as oxe
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.core.datatype import DataType
from finn.core.modelwrapper import ModelWrapper
from finn.custom_op.general.im2col import compute_conv_output_dim
from finn.custom_op.registry import getCustomOp
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.general import GiveUniqueNodeNames
from finn.util.basic import gen_finn_dt_tensor


def _wrap_single_node(node, graph_name, producer_name, in_shape, out_shape, idt):
    """Wrap one node into a ModelWrapper with tensors "inp" and "outp".

    Both tensors are declared as FLOAT value infos with the given shapes
    and are annotated with the same FINN datatype ``idt``.
    """
    in_info = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, list(in_shape)
    )
    out_info = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, list(out_shape)
    )
    graph = helper.make_graph(
        nodes=[node], name=graph_name, inputs=[in_info], outputs=[out_info]
    )
    wrapped = ModelWrapper(helper.make_model(graph, producer_name=producer_name))

    # output datatype always equals the input datatype for these models
    for tensor_name in ("inp", "outp"):
        wrapped.set_tensor_datatype(tensor_name, idt)
    return wrapped


def make_single_im2col_modelwrapper(
    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
):
    """Create a single-node Im2Col model (used below as golden reference).

    ``k``, ``ifm_dim``, ``ofm_dim``, ``stride`` and ``dilation`` are
    (height, width) pairs. ``simd`` is accepted so the signature mirrors
    the sliding-window builder, but it is not used here. No padding is
    applied (``pad_amount`` is all zeros).
    """
    kernel_h, kernel_w = k
    in_h, in_w = ifm_dim
    s_h, s_w = stride
    d_h, d_w = dilation
    out_h, out_w = ofm_dim

    in_shape = (1, in_h, in_w, ifm_ch)
    out_shape = (1, out_h, out_w, kernel_h * kernel_w * ifm_ch)

    node = helper.make_node(
        "Im2Col",
        ["inp"],
        ["outp"],
        domain="finn.custom_op.general",
        stride=[s_h, s_w],
        kernel_size=[kernel_h, kernel_w],
        input_shape=str(in_shape),
        dilations=[d_h, d_w],
        pad_amount=[0, 0, 0, 0],
        pad_value=0,
    )
    return _wrap_single_node(
        node, "im2col_graph", "im2col-model", in_shape, out_shape, idt
    )


def make_single_slidingwindow_modelwrapper(
    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
):
    """Create a single-node ConvolutionInputGenerator_rtl model.

    ``k``, ``ifm_dim``, ``ofm_dim``, ``stride`` and ``dilation`` are
    (height, width) pairs; ``dw`` is forwarded as the ``depthwise``
    attribute. The node is additionally configured to dump an rtlsim
    trace to a fixed path (debug aid).
    """
    kernel_h, kernel_w = k
    in_h, in_w = ifm_dim
    s_h, s_w = stride
    d_h, d_w = dilation
    out_h, out_w = ofm_dim

    in_shape = (1, in_h, in_w, ifm_ch)
    out_shape = (1, out_h, out_w, kernel_h * kernel_w * ifm_ch)

    node = helper.make_node(
        "ConvolutionInputGenerator_rtl",
        ["inp"],
        ["outp"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        ConvKernelDim=[kernel_h, kernel_w],
        IFMChannels=ifm_ch,
        IFMDim=[in_h, in_w],
        OFMDim=[out_h, out_w],
        SIMD=simd,
        Stride=[s_h, s_w],
        Dilation=[d_h, d_w],
        inputDataType=idt.name,
        outputDataType=idt.name,
        depthwise=dw,
    )
    model = _wrap_single_node(
        node, "slidingwindow_graph", "slidingwindow-model", in_shape, out_shape, idt
    )

    # DEBUG: always write a waveform trace for the generated RTL
    swg_inst = getCustomOp(
        model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
    )
    swg_inst.set_nodeattr(
        "rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd"
    )

    return model


def prepare_inputs(input_tensor):
    """Return the execution input dict mapping "inp" to ``input_tensor``."""
    return {"inp": input_tensor}


# input datatype
@pytest.mark.parametrize("idt", [DataType["INT4"]])
# kernel size
@pytest.mark.parametrize("k", [[3, 3]])
# input dimension
@pytest.mark.parametrize("ifm_dim", [[6, 11]])
# input channels
@pytest.mark.parametrize("ifm_ch", [2])
# Stride
@pytest.mark.parametrize("stride", [[1, 2]])
# Dilation
@pytest.mark.parametrize("dilation", [[1, 2]])
# execution mode
@pytest.mark.parametrize("exec_mode", ["rtlsim"])
# input channel parallelism ("SIMD")
@pytest.mark.parametrize("simd", [2])
# depthwise
@pytest.mark.parametrize("dw", [0])
# Flip dimensions
@pytest.mark.parametrize("flip", [False])
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_slidingwindow_rtl(
    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip
):
    """Compare ConvolutionInputGenerator_rtl output against Im2Col."""
    if flip:
        # swap the height/width entry of every 2D parameter
        k, ifm_dim, stride, dilation = (
            pair[::-1] for pair in (k, ifm_dim, stride, dilation)
        )

    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation

    # NOTE: a skip for the combination (dilation > 1 and stride > 1) is
    # currently disabled in this test:
    # if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1):
    #     pytest.skip(
    #         """Dilation value greater than 1 and stride greater than 1
    #         currently not supported for 1D convolutions"""
    #     )
    if simd > ifm_ch:
        pytest.skip("SIMD cannot be larger than number of input channels")

    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
    ofm_dim = [ofm_dim_h, ofm_dim_w]

    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))

    # arguments shared by the model under test and the golden model
    shared_args = dict(
        k=k,
        ifm_ch=ifm_ch,
        ifm_dim=ifm_dim,
        ofm_dim=ofm_dim,
        simd=simd,
        stride=stride,
        dilation=dilation,
        idt=idt,
    )
    model = make_single_slidingwindow_modelwrapper(dw=dw, **shared_args)

    if exec_mode == "cppsim":
        steps = [SetExecMode("cppsim"), PrepareCppSim(), CompileCppSim()]
    elif exec_mode == "rtlsim":
        steps = [
            SetExecMode("rtlsim"),
            GiveUniqueNodeNames(),
            PrepareIP("xc7z020clg400-1", 5),
            HLSSynthIP(),
            PrepareRTLSim(),
        ]
    else:
        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")
    for step in steps:
        model = model.transform(step)

    # prepare input data
    input_dict = prepare_inputs(x)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    golden = make_single_im2col_modelwrapper(**shared_args)
    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]

    # DEBUG
    print("-------expected:")
    print(y_expected)
    print("--------produced:")
    print(y_produced)

    if dw != 0:
        # depthwise: swap the kernel-position and channel-fold axes of the
        # golden output before comparing
        y_expected = y_expected.reshape(
            1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
        )
        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
        y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
    assert (y_produced == y_expected).all()

    # NOTE: the rtlsim cycle-count check is currently disabled:
    # if exec_mode == "rtlsim":
    #     node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
    #     inst = getCustomOp(node)
    #     cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
    #     exp_cycles_dict = model.analysis(exp_cycles_per_layer)
    #     exp_cycles = exp_cycles_dict[node.name]
    #     assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
    #     assert exp_cycles != 0