From 34f8211247e391cf1b7e3d6b384b6be2308d8d4b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch <felix.jentzsch@upb.de> Date: Mon, 13 Jun 2022 21:40:13 +0200 Subject: [PATCH] Cleanup and fixes --- ...mplate_mmv_1.v => swg_template_default.sv} | 230 ++--- ...dl_template.v => swg_template_parallel.sv} | 16 +- ...plate_wrapper.v => swg_template_wrapper.v} | 0 .../convolutioninputgenerator_rtl.py | 902 ++++++------------ ...est_fpgadataflow_convinputgenerator_rtl.py | 42 +- 5 files changed, 436 insertions(+), 754 deletions(-) rename finn-rtllib/swg/{swg_hdl_template_mmv_1.v => swg_template_default.sv} (57%) rename finn-rtllib/swg/{swg_hdl_template.v => swg_template_parallel.sv} (96%) rename finn-rtllib/swg/{swg_hdl_template_wrapper.v => swg_template_wrapper.v} (100%) diff --git a/finn-rtllib/swg/swg_hdl_template_mmv_1.v b/finn-rtllib/swg/swg_template_default.sv similarity index 57% rename from finn-rtllib/swg/swg_hdl_template_mmv_1.v rename to finn-rtllib/swg/swg_template_default.sv index 670598d9a..12cc65692 100644 --- a/finn-rtllib/swg/swg_hdl_template_mmv_1.v +++ b/finn-rtllib/swg/swg_template_default.sv @@ -9,38 +9,39 @@ module $TOP_MODULE_NAME$_controller tail_incr ); -input CLK; -input RST; -input advance; -output [31:0] addr_incr; //todo: minimize width -output [31:0] tail_incr; //todo: minimize width - -////code generation part: localparam LOOP_H_ITERATIONS = $LOOP_H_ITERATIONS$; localparam LOOP_W_ITERATIONS = $LOOP_W_ITERATIONS$; localparam LOOP_KH_ITERATIONS = $LOOP_KH_ITERATIONS$; localparam LOOP_KW_ITERATIONS = $LOOP_KW_ITERATIONS$; localparam LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$; -localparam [31:0] ADDR_INCREMENT_MAP [0:5] = $ADDR_INCREMENT_MAP$; //todo: minimize width -//// +localparam INCR_BITWIDTH = $INCR_BITWIDTH$; +localparam [INCR_BITWIDTH-1:0] ADDR_INCREMENT_MAP [0:5] = $ADDR_INCREMENT_MAP$; + +input CLK; +input RST; +input advance; +output [INCR_BITWIDTH-1:0] addr_incr; +output [INCR_BITWIDTH-1:0] tail_incr; //state and counters reg [2:0] state, state_next; parameter STATE_START = 0, STATE_LOOP_SIMD = 1, STATE_LOOP_KW = 2, STATE_LOOP_KH = 3, STATE_LOOP_W = 4, STATE_LOOP_H = 5; -integer counter_loop_h; //todo: minimize width -integer counter_loop_w; -integer counter_loop_kh; -integer counter_loop_kw; -integer counter_loop_simd; +reg [$clog2(LOOP_H_ITERATIONS+2)-1:0] counter_loop_h; //could add check if ITERATIONS > 0, then replace +2 with +1 +reg [$clog2(LOOP_W_ITERATIONS+2)-1:0] counter_loop_w; +reg [$clog2(LOOP_KH_ITERATIONS+2)-1:0] counter_loop_kh; +reg [$clog2(LOOP_KW_ITERATIONS+2)-1:0] counter_loop_kw; +reg [$clog2(LOOP_SIMD_ITERATIONS+2)-1:0] counter_loop_simd; +reg [INCR_BITWIDTH-1:0] tail_incr_reg; assign addr_incr = ADDR_INCREMENT_MAP[state]; +assign tail_incr = tail_incr_reg; //combinational logic for tail_incr generation $TAIL_INCR_GENERATION$ //combinational next state logic always @ (state, counter_loop_simd, counter_loop_kw, counter_loop_kh, counter_loop_w, counter_loop_h) begin - state_next = state; //default + state_next = state; if (state == $INNERMOST_STATE$) begin if (counter_loop_simd == 0) if (counter_loop_kw != 0) @@ -68,11 +69,10 @@ always @ (posedge CLK) begin counter_loop_kh <= LOOP_KH_ITERATIONS; counter_loop_kw <= LOOP_KW_ITERATIONS; counter_loop_simd <= LOOP_SIMD_ITERATIONS; - state <= $INNERMOST_STATE$; //STATE_START; //debug: omit start state to fix timing, maybe omit during FM transition as well + state <= $INNERMOST_STATE$; end else begin if (advance) begin state <= state_next; - if (state == $INNERMOST_STATE$) begin if 
(counter_loop_simd == 0) begin counter_loop_simd <= LOOP_SIMD_ITERATIONS; @@ -120,7 +120,7 @@ input [$clog2(DEPTH)-1:0] read_addr; // absolute (!) read address of cyclic buff input [WIDTH-1:0] data_in; output [WIDTH-1:0] data_out; -integer addr_w; //todo: minimize width (as reg) +reg [$clog2(DEPTH)-1:0] write_addr; $RAM_STYLE$ reg [WIDTH-1:0] ram [DEPTH-1:0]; @@ -129,18 +129,18 @@ assign data_out = out_reg; always @(posedge CLK) begin if (RST == 1'b0) begin - addr_w <= 0; + write_addr <= 0; end else begin if (read_enable) out_reg <= ram[read_addr]; if (write_enable) begin - ram[addr_w] <= data_in; + ram[write_addr] <= data_in; - if (addr_w == DEPTH-1) - addr_w <= 0; + if (write_addr == DEPTH-1) + write_addr <= 0; else - addr_w <= addr_w + 1; + write_addr <= write_addr + 1; end end end @@ -156,27 +156,29 @@ module $TOP_MODULE_NAME$_impl ( out_V_V_TVALID, out_V_V_TREADY ); - -parameter BIT_WIDTH = $BIT_WIDTH$; -parameter SIMD = $SIMD$; -parameter MMV_IN = $MMV_IN$; -parameter MMV_OUT = $MMV_OUT$; -parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; -parameter BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; -parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; -parameter LAST_READ_ELEM = $LAST_READ_ELEM$; -parameter LAST_WRITE_ELEM = $LAST_WRITE_ELEM$; -parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$; -parameter ELEM_PER_WINDOW = $ELEM_PER_WINDOW$; +//generated constants +localparam BIT_WIDTH = $BIT_WIDTH$; +localparam SIMD = $SIMD$; +localparam MMV_IN = $MMV_IN$; +localparam MMV_OUT = $MMV_OUT$; +localparam BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; +localparam BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; +localparam BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; +localparam LAST_READ_ELEM = $LAST_READ_ELEM$; +localparam LAST_WRITE_ELEM = $LAST_WRITE_ELEM$; +//localparam [$clog2($BUF_ELEM_TOTAL$+1)-1:0] BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$; +localparam BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$; +localparam ELEM_PER_WINDOW = $ELEM_PER_WINDOW$; +localparam INCR_BITWIDTH = $INCR_BITWIDTH$; //IO ports input ap_clk; input ap_rst_n; -input [BUF_IN_WIDTH-1:0] in0_V_V_TDATA; +input [BUF_IN_WIDTH-1:0] in0_V_V_TDATA; input in0_V_V_TVALID; -output in0_V_V_TREADY; +output in0_V_V_TREADY; output [BUF_OUT_WIDTH-1:0] out_V_V_TDATA; -output out_V_V_TVALID; +output out_V_V_TVALID; input out_V_V_TREADY; //main buffer instantiation @@ -201,22 +203,10 @@ window_buffer_inst .data_out(window_buffer_out) ); -//counters to keep track when to read/write -integer newest_buffered_elem; //todo: minimize width -integer newest_buffered_elem_available; //todo: minimize width -integer current_elem; -integer current_elem_available; -integer first_elem_next_window; -integer k; - -reg [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr_reg; -assign window_buffer_read_addr = window_buffer_read_addr_reg; - -//reg write_done; //keep track if W of current cycle was already completed, but we still wait on a R in the same cycle - +//controller instantiation wire advance_controller; -wire [31:0] addr_incr; -wire [31:0] tail_incr; +wire signed [INCR_BITWIDTH-1:0] addr_incr; +wire [INCR_BITWIDTH-1:0] tail_incr; $TOP_MODULE_NAME$_controller controller_inst @@ -228,88 +218,67 @@ controller_inst .tail_incr(tail_incr) ); +// Counters/address registers +// Add a sign bit even to (most) unsigned counters and window_buffer_read_addr_reg, +// so we can use automatic sign extension and simplify calculations w/ signed increment. +// Alternatively, we could manually sign-extend and shave off a bit here or there. 
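A minimal Python sketch of the sign-bit trick behind the counter declarations that follow (helper names here are illustrative, not part of the template): widening a counter by one bit lets a negative two's-complement increment wrap to the correct value with no explicit sign handling.

    def clog2(x):
        # Python equivalent of Verilog's $clog2 for x >= 1
        return (x - 1).bit_length()

    def add_modulo(counter, incr, width):
        # two's-complement addition truncated to 'width' bits,
        # mimicking what the RTL adder does in hardware
        return (counter + incr) & ((1 << width) - 1)

    LAST_READ_ELEM = 100                   # example value
    width = clog2(LAST_READ_ELEM + 1) + 1  # +1 sign bit, as in the template
    assert add_modulo(5, -3, width) == 2                 # negative increment works
    assert add_modulo(0, -1, width) == (1 << width) - 1  # reads as -1 when signed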
+reg signed [$clog2(LAST_READ_ELEM+1)+1-1:0] newest_buffered_elem; +reg [$clog2(LAST_READ_ELEM+1)+1-1:0] current_elem; +reg [$clog2(LAST_READ_ELEM+1)+1-1:0] first_elem_next_window; +reg [$clog2(ELEM_PER_WINDOW)-1:0] k; +reg [$clog2(BUF_ELEM_TOTAL)+1-1:0] window_buffer_read_addr_reg; + +// Control signals/registers +wire read_cmd; +wire read_ok; wire reading_done; -assign reading_done = newest_buffered_elem == LAST_READ_ELEM; +wire fetch_cmd; reg fetching_done; -reg writing_done; //instead of a separate write cycle/element counter, trigger this flag once current_element reaches LAST_WRITE_ELEM -//assign writing_done = current_elem == LAST_WRITE_ELEM; - +reg write_cmd; +wire write_ok; wire write_blocked; +reg writing_done; -//reg write_prefetch_available; // stores if the write of prefetched data is still outstanding - -wire fetch_cmd; -assign fetch_cmd = !(current_elem > newest_buffered_elem) && !write_blocked && !fetching_done; - - -//determine whether to read/write in this cycle -//wire write_cmd; -//assign write_cmd = write_prefetch_available && !writing_done; -reg write_cmd; - - - -wire read_cmd; assign read_cmd = - ( - ( - (newest_buffered_elem - BUF_ELEM_TOTAL+1) < first_elem_next_window - &&(newest_buffered_elem - BUF_ELEM_TOTAL+1) < current_elem - ) // (over-)write to buffer if oldest buffered element is no longer needed - || fetching_done - ) //or if fetching is done (e.g. for skipped rows at FM end due to stride) - && !reading_done; //and if there is still an input element left to read - -//todo: optmize (e.g. is < or != more efficient?) -// ToDo: ideally this should point to the oldest elem of the next window, -// to allow reading while still writing the remainder of the current window - - - -assign write_blocked = write_cmd && !out_V_V_TREADY; //&& !write_done; + (( + $signed(((newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(first_elem_next_window) + && $signed(((newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(current_elem) + ) // (over-)write to buffer if oldest buffered element will no longer be needed + || fetching_done //or if fetching is done (e.g. 
for skipped rows at FM end due to stride) + ) + && !reading_done; //and if there is still an input element left to read +assign read_ok = read_cmd && in0_V_V_TVALID; +assign reading_done = newest_buffered_elem == LAST_READ_ELEM; -wire read_ok; -// with transition to next cycle: -// want to read can read source is ready (waiting on VALID allowed) -assign read_ok = read_cmd && !write_blocked && in0_V_V_TVALID; +assign fetch_cmd = !($signed(current_elem) > newest_buffered_elem) && !write_blocked && !fetching_done; -wire write_ok; -// with transition to next cycle: -// output is VALID sink is ready sink has already read (we are waiting on source) -//assign write_ok = write_cmd && (out_V_V_TREADY || write_done); assign write_ok = write_cmd && out_V_V_TREADY; - -//wire advance; -// includes waiting on W if W-only cycle: wait only on W no R/W to wait for -//assign advance = read_ok || (!read_cmd && write_ok) || (!read_cmd && !write_cmd); -//todo: optimize/simplify advance logic for write_done generation +assign write_blocked = write_cmd && !out_V_V_TREADY; //assign buffer control +assign window_buffer_read_addr = window_buffer_read_addr_reg; assign window_buffer_write_enable = read_ok; assign window_buffer_read_enable = fetch_cmd; -assign advance_controller = fetch_cmd; //write_ok +assign advance_controller = fetch_cmd; //assign I/O ports assign window_buffer_in = in0_V_V_TDATA; assign out_V_V_TDATA = window_buffer_out; assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed) -assign out_V_V_TVALID = ap_rst_n && write_cmd; //&& !write_done; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink) +assign out_V_V_TVALID = ap_rst_n && write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink) //main process for advancing counters always @ (posedge ap_clk) begin if (ap_rst_n == 1'b0) begin newest_buffered_elem <= -1; - //newest_buffered_elem_available <= -1; current_elem <= 0; - //current_elem_available <= 0; first_elem_next_window <= 0; k <= 0; window_buffer_read_addr_reg <= 0; fetching_done <= 0; writing_done <= 0; - //write_prefetch_available <= 0; write_cmd <= 0; end else begin if (read_ok) begin @@ -332,54 +301,42 @@ always @ (posedge ap_clk) begin //use increment value calculated by controller //keep track where we are within a window - if (k == ELEM_PER_WINDOW-1) + if (k == ELEM_PER_WINDOW - 1) k <= 0; else k <= k+1; - //absolute buffer address always wraps around (in both directions for depthwise support) - if ($signed(window_buffer_read_addr_reg + addr_incr) > BUF_ELEM_TOTAL-1) - window_buffer_read_addr_reg <= window_buffer_read_addr_reg + addr_incr - BUF_ELEM_TOTAL; - else if ($signed(window_buffer_read_addr_reg + addr_incr) < 0) - window_buffer_read_addr_reg <= window_buffer_read_addr_reg + addr_incr + BUF_ELEM_TOTAL; + //update first element of next window to allow buffer overwrite up until that point + if (k == 0) + first_elem_next_window <= first_elem_next_window + tail_incr; + + //absolute buffer address wrap-around + if ($signed(window_buffer_read_addr_reg) + addr_incr > BUF_ELEM_TOTAL - 1) + window_buffer_read_addr_reg <= $signed(window_buffer_read_addr_reg) + addr_incr - BUF_ELEM_TOTAL; + else if ($signed(window_buffer_read_addr_reg) + addr_incr < 0) + window_buffer_read_addr_reg <= $signed(window_buffer_read_addr_reg) + addr_incr + BUF_ELEM_TOTAL; else - window_buffer_read_addr_reg <= window_buffer_read_addr_reg + addr_incr; 
+ window_buffer_read_addr_reg <= $signed(window_buffer_read_addr_reg) + addr_incr; //check if this is the last write cycle (writing_done will be true afterwards) - if (current_elem == LAST_WRITE_ELEM) begin + if (current_elem == LAST_WRITE_ELEM) fetching_done <= 1; - end else begin - //current element index wraps around only at window boundary - //if (((current_elem + addr_incr) > BUF_ELEM_TOTAL-1) && (k == ELEM_PER_WINDOW-1)) - - //if (k == ELEM_PER_WINDOW-1) - // current_elem <= current_elem + addr_incr - BUF_ELEM_TOTAL; - //else - current_elem <= current_elem + addr_incr; - end - - if (k == 0) - first_elem_next_window <= first_elem_next_window + tail_incr; + else + current_elem <= $signed(current_elem) + addr_incr; // determine if prefetched data will be outstanding in the next cycle // if we fetch in this cycle -> yes - // if we do not fetch nor write successfully -> do not change - // if we do not fetch but write -> clear outstanding data - //write_prefetch_available <= fetch_cmd; + // if we do not fetch nor write -> do not change + // if we do not fetch but write successfully-> clear outstanding data write_cmd <= fetch_cmd; end if (write_ok) - // determine if prefetched data will be outstanding in the next cycle - // if we fetch in this cycle -> yes - // if we do not fetch nor write successfully -> do not change - // if we do not fetch but write -> clear outstanding data - //write_prefetch_available <= fetch_cmd; write_cmd <= fetch_cmd; if (write_ok && fetching_done) begin //check if this is the last write cycle (writing_done will be true afterwards) - if (reading_done || (read_ok && (newest_buffered_elem == LAST_READ_ELEM-1))) begin + if (reading_done || (read_ok && (newest_buffered_elem == LAST_READ_ELEM - 1))) begin //start processing of next FM if reading is done already, or completes in the same cycle newest_buffered_elem <= -1; current_elem <= 0; @@ -388,11 +345,6 @@ always @ (posedge ap_clk) begin end else writing_done <= 1; end - - //if (advance) - // write_done <= 1'b0; //reset flag - //else if (write_ok) // successful W in this cycle, but R still outstanding - // write_done <= 1'b1; //write can happen even if read is blocked, but only for the current cycle! 
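The bidirectional wrap-around of window_buffer_read_addr_reg above exists because depthwise access patterns step backwards through the buffer. A small Python model of that update (illustrative only; like the RTL, it assumes a single wrap step suffices, i.e. |incr| < buf_elem_total):

    def next_read_addr(addr, incr, buf_elem_total):
        # absolute buffer address always wraps around,
        # in both directions (negative incr for depthwise support)
        addr = addr + incr
        if addr > buf_elem_total - 1:
            addr -= buf_elem_total
        elif addr < 0:
            addr += buf_elem_total
        return addr

    assert next_read_addr(7, 2, 8) == 1   # forward wrap
    assert next_read_addr(1, -3, 8) == 6  # backward wrap (depthwise)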
end end diff --git a/finn-rtllib/swg/swg_hdl_template.v b/finn-rtllib/swg/swg_template_parallel.sv similarity index 96% rename from finn-rtllib/swg/swg_hdl_template.v rename to finn-rtllib/swg/swg_template_parallel.sv index 89ebb8da5..7c1e04222 100755 --- a/finn-rtllib/swg/swg_hdl_template.v +++ b/finn-rtllib/swg/swg_template_parallel.sv @@ -9,7 +9,7 @@ module $TOP_MODULE_NAME$_controller ); input CLK; -input [31:0] cycle; //todo: minimize width or switch to single bit flag/advance wire +input [31:0] cycle; //todo: minimize width or switch to single bit flag output cmd_read; output cmd_write; @@ -159,6 +159,8 @@ input [WIDTH-1:0] shift_in; output [WIDTH-1:0] shift_out; output [WIDTH*DEPTH-1:0] data_out; +// ToDo: experiment with SRL instead of FF-based shift register +// by force or by achieving automatic SRL inference //UG901 template for SRL inference: // 32-bit Shift Register // Rising edge clock @@ -303,12 +305,12 @@ module $TOP_MODULE_NAME$_impl ( ); parameter BIT_WIDTH = $BIT_WIDTH$; -parameter SIMD = $SIMD$; //assuming SIMD=C for now -parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1*M for now -parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K*M for now -parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //bit-width*C*MMV_in -parameter BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; //bit-width*C -parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; //bit-width*C*MMV_out +parameter SIMD = $SIMD$; //assuming SIMD = C for this implementation style +parameter MMV_IN = $MMV_IN$; +parameter MMV_OUT = $MMV_OUT$; +parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; +parameter BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; +parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; parameter CYCLES_TOTAL = $CYCLES_TOTAL$; parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$; diff --git a/finn-rtllib/swg/swg_hdl_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v similarity index 100% rename from finn-rtllib/swg/swg_hdl_template_wrapper.v rename to finn-rtllib/swg/swg_template_wrapper.v diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py index 4b31b7c97..1aeeb9a1e 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -47,12 +47,15 @@ try: except ModuleNotFoundError: PyVerilator = None -# note: the actual data layout produced by the hlslib kernels is different -# for depthwise and non-depthwise ops. +# RTL Convolution Input Generator / Sliding Window Generator (SWG) +# Matches and extends the functionality of all ConvolutionInputGenerator_* functions +# in finn-hlslib by generating HDL code for two different implementation styles: +# - Addressable cyclic buffer: to be used when out_width <= in_width +# - Parallel registers + line buffers: to be used when out_width > in_width +# Supports non-square, 1D, strided, dilated, and depthwise convolutions. 
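In code-generation terms, the style selection described above reduces to a one-line rule; a sketch using the names from generate_hdl below (with MMV_in = 1, mmv_out > 1 is exactly the out_width > in_width case):

    def choose_impl_style(mmv_out, k_h, k_w):
        # mmv_out = M * k_h * k_w when parallel_window is set, else 1;
        # 1x1 kernels are also handled by the parallel style
        if mmv_out > 1 or (k_h == 1 and k_w == 1):
            return "parallel"
        return "default"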
+# Note: the actual data layout produced is different for depthwise and non-depthwise ops: # * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) # * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) -# see test_fpgadataflow_slidingwindow.py for an example of how to transform -# between the two layouts class ConvolutionInputGenerator_rtl(HLSCustomOp): """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator @@ -201,54 +204,22 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): num_output_elems = np.prod(folded_oshape[:-1]) return num_output_elems - def get_1d_conv_attrs_normalized(self): - # support both (1, D) and (D, 1) cases transparently: - # For the kernel, presenting the input data of size D as - # [H, W] = [Y, X] = [1, D] or [D, 1] - # effectively gives the same result. Because the - # ConvolutionInputGenerator_NonSquare_Dilated(_dws) kernel currently only - # supports dilation>1 along the X-axis and the - # ConvolutionInputGenerator_NonSquare only works for stride>1 along the - # X-axis, we are working with the following assumption: - # the dummy ('1') dimension is the Y-dimension, i.e. - # images and kernels (and their attributes) of dimension - # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] + def get_exp_cycles(self): + # TODO: update + simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") k = self.get_nodeattr("ConvKernelDim") ifm_dim = self.get_nodeattr("IFMDim") ofm_dim = self.get_nodeattr("OFMDim") stride = self.get_nodeattr("Stride") dilation = self.get_nodeattr("Dilation") - - # see defines() for an explanation - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - ofm_dim = ofm_dim[::-1] - k = k[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) - - def get_exp_cycles(self): - simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - ifm_dim, - ofm_dim, - k, - stride, - dilation, - ) = self.get_1d_conv_attrs_normalized() ifm_dim_h, ifm_dim_w = ifm_dim ofm_dim_h, ofm_dim_w = ofm_dim k_h, k_w = k stride_h, stride_w = stride dilation_h, dilation_w = dilation - - # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h + if (self.get_nodeattr("parallel_window")): exp_cycles = ifm_dim_w + 1 else: @@ -262,7 +233,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return int(exp_cycles) def bram_estimation(self): - # NOTE: not tested for correctness + # TODO: update simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") ifm_dim = np.prod(self.get_nodeattr("IFMDim")) @@ -294,6 +265,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return 0 def lut_estimation(self): + # TODO: update # NOTE: not tested for correctness simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -315,6 +287,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return 300 + ram_luts def uram_estimation(self): + # TODO: update # NOTE: not tested for correctness simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -375,7 +348,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): # pad test input stream to work when IFMdim % M != 0 # during normal operation, the AXI Stream should not care, in the last cycle garbage elements are read but not used - # ToDo: only works for 1D case + # TODO: only works for 1D case mmv_stream_padding_px = int((np.prod(folded_ishape) - 
np.prod(inp.shape)) / exp_ishape[-1]) if exp_ishape [2] == 1: inp = np.pad(inp, ((0,0),(0,mmv_stream_padding_px),(0,0),(0,0)), 'constant') @@ -447,12 +420,10 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def generate_hdl(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - f_debug = open(os.path.join(code_gen_dir, "swg_hdl_debuginfo.log"), "w") + #f_debug = open(os.path.join(code_gen_dir, "swg_hdl_debuginfo.log"), "w") code_gen_dict = {} - #-------------------- - # init hyperparameters - # for 1D case: it does not matter if dummy dim is x or y + ##### BEGIN INITIALIZE/CHECK CONFIGURATION ##### ifm_ch = self.get_nodeattr("IFMChannels") k = self.get_nodeattr("ConvKernelDim") ifm_dim = self.get_nodeattr("IFMDim") @@ -463,51 +434,16 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): n = 1 h, w = ifm_dim - c = 1 # ifm_ch not considered atm (always parallelize across c) + c = 1 # assume SIMD=C (parallelize across all channels) k_h, k_w = k pad = [0,0,0,0] # padding happens in separate padding node for now pad_val = 0 stride_h, stride_w = stride dilation_h, dilation_w = dilation - conv_c = 99 - - # init folding config - simd = self.get_nodeattr("SIMD") - M = self.get_nodeattr("M") - if (self.get_nodeattr("parallel_window")): - mmv_in = M*1 - mmv_out = M*k_h*k_w - assert ifm_ch==simd, "Constraint violated: SIMD must be equal to C" - else: - mmv_in = 1 - mmv_out = 1 - assert ifm_ch%simd==0, "Constraint violated: SIMD must divide C" - - # todo: check allowed hyperparams - # ToDo: move/duplicate these checks in corresponding convert_to_hls transformation - - # choose implementation style - if (mmv_out > 1 or (k_h==1 and k_w==1)): - impl_style = "parallel" - else: - impl_style = "default" - - # how many "unused" registers are allowed between buffer positions that will be accessed in parallel - # example: - # 0: only consecutive access patterns will be implemented in regs, rest in BRAM line buffers - # 2: [0, 3, 6] access pattern is still allowed and will be implemented with 1 7-position shift reg - REG_BRAM_THRESHOLD = 8 - #-------------------- in_shape = (n,c,h,w) #NCHW in_image = np.empty(in_shape, dtype=int) - - for index, x in np.ndenumerate(in_image): - # "HWC" dummy values - val = int((index[2]+1)*100+(index[3]+1)*10+(index[1]+1)*1) - in_image[index] = val - in_image_padded = np.pad( in_image, ((0, 0), (0, 0), (pad[0], pad[2]), (pad[1], pad[3])), @@ -523,416 +459,40 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) - f_debug.write("\n"+"in shape " + str(in_shape)) - f_debug.write("\n"+"in shape padded " + str(in_shape_padded)) - f_debug.write("\n"+"conv out shape " + str((n,conv_c,out_dim_h,out_dim_w))) - f_debug.write("\n"+"im2col out shape " + str((n,out_dim_h,out_dim_w,k_h*k_w*c))) - - idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw( - in_shape, - k_h, - k_w, - pad, - stride_h, - stride_w, - dilation_h, - dilation_w - ) - - f_debug.write("\n"+"c indices") - f_debug.write("\n"+str(idx_c)) - f_debug.write("\n"+"h indices") - f_debug.write("\n"+str(idx_h)) - f_debug.write("\n"+"w indices") - f_debug.write("\n"+str(idx_w)) - - cols = in_image_padded[:, idx_c, idx_h, idx_w] - cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1) - - f_debug.write("\n"+"cols (shape %s)" % str(cols.shape)) - f_debug.write("\n"+str(cols)) - - # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW - out_image = 
cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w) - # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1) - out_image = out_image.transpose(0, 4, 5, 2, 3, 1) - out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c) - - f_debug.write("\n"+"output (shape %s)" % str(out_image.shape)) - f_debug.write("\n"+str(out_image)) - - f_debug.write("\n"+"h indices") - f_debug.write("\n"+str(idx_h)) - f_debug.write("\n"+"w indices") - f_debug.write("\n"+str(idx_w)) - - idx_px = idx_h*w+idx_w - f_debug.write("\n"+"sequential pixel indices (shape %s" % str(idx_px.shape)) - f_debug.write("\n"+str(idx_px)) - - k, cycles = idx_px.shape - - output_elements = mmv_out - output_cycles = int(cycles/(mmv_out/k)) - - # ToDo: what happens when output_cycles=OFMdim % M != 0 - # ...try to support IFMdim % M != 0 first, so we can work with the usual k=3 where OFMdim = IFMdim - -2 - # the additional garbage input elements that are read in the last cycle are not read by any window anyway - idx_px = idx_px.transpose() - idx_px = idx_px.reshape(output_cycles, output_elements) - idx_px = idx_px.transpose() - - # result: first dim is number of parallel output elements, second dim is the input element (pixel in case of SIMD=C) index that each output element outputs per cycle - f_debug.write("\n"+"sequential pixel indices, MMV_out grouping (shape %s" % str(idx_px.shape)) - f_debug.write("\n"+str(idx_px)) - #f_debug.close() - - buffer = [] - buffer_max_size = 0 - # buffer schedule (write from input, read to output) - schedule_write = [] - schedule_read = [] - schedule_shift = [] - - - schedule = [] - schedule_prev = '' - - next_in_px = 0 - oldest_px = 0 - buffer_space_freed = False - - idx_px_relative = idx_px.copy() - idx_px_addr = idx_px.copy() - idx_px_addr_incr = idx_px.copy() - idx_px_addr_rel = idx_px.copy() - - # compute schedule and buffer read pattern (output driven) - output_elem, output_cycles = idx_px_relative.shape - - if (impl_style == "parallel"): - for x in range(output_cycles): - # load missing inputs into buffer - for y in range(output_elem): - while int(idx_px_relative[y,x]) not in buffer: - # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later) - for m in range(M): - buffer.append(next_in_px) - next_in_px += 1 - schedule_write.append(1) - schedule_read.append(0) - if schedule_prev == 'w': - count, cmd = schedule[-1] - schedule[-1] = (count+1, cmd) - else: - schedule.append((1, 'w')) - schedule_prev = 'w' - - # discard unused buffer elements - oldest_px = np.min(idx_px_relative[:,x:]) - #check whether M elements can be shifted out, not just the single oldest one - # must this be "while" for MMV to work?!? 
breaks mmvout = 1 case - #while all([buffer[i] < oldest_px for i in range(M)]): - if all([buffer[i] < oldest_px for i in range(M)]): - # M buffer elements are shifted out at once - for m in range(M): - buffer.pop(0) - - # adjust relative buffer index of current x (according to last discarded buffer elements) - for y in range(output_elem): - idx_px_relative[y,x] -= oldest_px - - - # read from buffer - # + simultaneously load next pixel(s) into buffer if there are any left - if (next_in_px > (h_padded*w_padded-1)): - # read only (append above) - schedule_read.append(1) - schedule_write.append(0) - if schedule_prev == 'r': - count, cmd = schedule[-1] - schedule[-1] = (count+1, cmd) - else: - schedule.append((1, 'r')) - schedule_prev = 'r' - else: - # load M inputs at once - for m in range(M): - buffer.append(next_in_px) - next_in_px += 1 - schedule_read.append(1) - schedule_write.append(1) - if schedule_prev == 'wr': - count, cmd = schedule[-1] - schedule[-1] = (count+1, cmd) - else: - schedule.append((1, 'wr')) - schedule_prev = 'wr' - - # record max needed buffer depth - #f_debug.write("\n"+str(buffer)) - if len(buffer) > buffer_max_size: - buffer_max_size = len(buffer) - - # insert dummy write operations for data at the input FM tail-end that is never read (e.g. in case of stride > 1) - while next_in_px <= (h_padded*w_padded-1): - next_in_px += 1 - schedule_write.append(1) - schedule_read.append(0) - if schedule_prev == 'w': - count, cmd = schedule[-1] - schedule[-1] = (count+1, cmd) - else: - schedule.append((1, 'w')) - schedule_prev = 'w' - - # find buffer access patterns - buffer_access_patterns = [] - for x in range(output_cycles): - if idx_px_relative[:,x].tolist() not in buffer_access_patterns: - buffer_access_patterns.append(idx_px_relative[:,x].tolist()) - - + # init folding config + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") + if (self.get_nodeattr("parallel_window")): + mmv_in = M*1 + mmv_out = M*k_h*k_w + assert ifm_ch==simd, "Constraint violated: SIMD must be equal to C" else: + mmv_in = 1 + mmv_out = 1 + assert ifm_ch%simd==0, "Constraint violated: SIMD must divide C" - #simulate cyclic buffer, which is advanced on every write (as opposed to on overy sheduled cycle) - #buffer_tail = 0 - buffer_head = 0 #buffer_tail+1 - # compute minimal buffer length (assuming it holds 1 complete window) - buffer_len = (k_h-1) * dilation_h * w + (k_w-1) * dilation_w + 1 - buffer = [-1] * buffer_len - - # todo: remove this simulation, not needed and doesnt accout for SIMD anyways - for x in range(output_cycles): - - # load missing inputs into buffer - while int(idx_px_relative[0,x]) not in buffer: - # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later) - for m in range(M): - #buffer.append(next_in_px) - buffer[buffer_head] = next_in_px - next_in_px += 1 - schedule_write.append(1) - schedule_read.append(0) - if schedule_prev == 'w': - count, cmd = schedule[-1] - schedule[-1] = (count+1, cmd) - else: - schedule.append((1, 'w')) - schedule_prev = 'w' - - #try to advance/shift the buffer by one, discarding the oldest element - #discard_oldest_elem = buffer[0] < np.min(idx_px_relative[0,x:]) - #if discard_oldest_elem: - # buffer.pop(0) - # schedule_shift.append(1) - #else: - # schedule_shift.append(0) - buffer_head += 1 - if buffer_head > buffer_len-1: - buffer_head = 0 - - ### perform read ### - - #try to advance/shift the buffer by one, discarding the oldest element - #discard_oldest_elem = buffer[0] < np.min(idx_px_relative[0,x:]) - 
#if discard_oldest_elem: - # buffer.pop(0) - # schedule_shift.append(1) - #else: - # schedule_shift.append(0) - - # note current relative addr in buffer - idx_px_addr[0,x] = buffer.index(idx_px_relative[0,x]) - if x > 0: - idx_px_addr_incr[0,x] = idx_px_addr[0,x] - idx_px_addr[0,x-1] - if idx_px_addr_incr[0,x] < 0: - idx_px_addr_incr[0,x] += buffer_len - else: - idx_px_addr_incr[0,x] = idx_px_addr[0,x] - - idx_px_addr_rel [0,x] = buffer.index(idx_px_relative[0,x]) - buffer_head - if idx_px_addr_rel [0,x] < 0: - idx_px_addr_rel [0,x] += buffer_len - - - #try to write a new input into the buffer simultaneously (during this read as opposed to before the next read) - # assume in-order write into the buffer (oldest element is always at head+1) - discard_oldest_elem = np.min(buffer) < np.min(idx_px_relative[0,x:]) - read_only = True - if not (next_in_px > (h_padded*w_padded-1)): - # input data available - #if (x < k_h*k_w) or discard_oldest_elem: - if discard_oldest_elem: - # buffer is not complete, as the first window has not been fully output - # or we can discard one element from the buffer after this read, so there is space for a new one - read_only = False - - - # read from buffer - # + simultaneously load next pixel(s) into buffer if there are any left - # if mmv_out = 1: addressable BRAM implementation style -> do not shift in while outputting K kernel elements to keep addressing consistent - #if (next_in_px > (h_padded*w_padded-1)) or ((x+1) % (k_h*k_w) != 0): - #if (next_in_px > (h_padded*w_padded-1)) or (x > 1 and (not buffer_space_freed)): - if read_only: - # read only - schedule_read.append(1) - schedule_write.append(0) - if schedule_prev == 'r': - count, cmd = schedule[-1] - schedule[-1] = (count+1, cmd) - else: - schedule.append((1, 'r')) - schedule_prev = 'r' - else: - # read + write - #buffer.append(next_in_px) - buffer[buffer_head] = next_in_px - next_in_px += 1 - schedule_read.append(1) - schedule_write.append(1) - if schedule_prev == 'wr': - count, cmd = schedule[-1] - schedule[-1] = (count+1, cmd) - else: - schedule.append((1, 'wr')) - schedule_prev = 'wr' - - # advance buffer - buffer_head += 1 - if buffer_head > buffer_len-1: - buffer_head = 0 - - # record max needed buffer depth - #f_debug.write("\n"+str(buffer)) - if len(buffer) > buffer_max_size: - buffer_max_size = len(buffer) - - # ToDo: maybe replace with directly-computed schedule (similar to addr. buffer impl. 
style) - def compact_schedule(schedule): - - # leave first sequence (pre-load) as is - start_sequence = schedule[0] - - loop_sequence_1_counter = 1 - loop_sequence_1 = schedule[1] - - loop_counter = 0 - loop_sequence_2 = None - end_sequence = None - - i = 2 - if i < len(schedule): - loop_sequence_1 += schedule[i] - i += 1 - - while i+1 < len(schedule): - candidate = schedule[i] + schedule[i+1] - if candidate == loop_sequence_1: - loop_sequence_1_counter += 1 - i += 2 - else: - break - - if i < len(schedule): - loop_sequence_2 = schedule[i] - i += 1 - - if i+1 < len(schedule): - candidate = schedule[i] + schedule[i+1] - if candidate != loop_sequence_1: - loop_sequence_2 += schedule[i] - - i -= 1 - loop_sequence_total_len = (int(len(loop_sequence_2)/2)) + loop_sequence_1_counter*(int(len(loop_sequence_1)/2)) - loop_sequence_total = loop_sequence_2 + loop_sequence_1_counter*loop_sequence_1 - while i+loop_sequence_total_len < len(schedule): - candidate = schedule[i] - for x in range (i+1, i+loop_sequence_total_len): - candidate += schedule[x] - - if candidate == loop_sequence_total: - loop_counter += 1 - i += loop_sequence_total_len - else: - break + # TODO: check allowed hyperparams + # for 1D case: it does not matter if dummy dim is x or y + # TODO: move/duplicate these checks in corresponding convert_to_hls transformation (?) - else: - if i < len(schedule): - end_sequence = loop_sequence_2 + schedule[i] - i += 1 - loop_sequence_2 = None - else: - end_sequence = loop_sequence_2 - loop_sequence_2 = None - - if i < len(schedule): - end_sequence = schedule[i] - i += 1 - - if i < len(schedule): - end_sequence = end_sequence + schedule[i] - i += 1 - - assert len(start_sequence) == 1*2, "ERROR: invalid start sequence" - assert len(loop_sequence_1) == 2*2, "ERROR: invalid loop 1 sequence" - if loop_sequence_2: - assert len(loop_sequence_2) <= 2*2, "ERROR: invalid loop 2 sequence" - if end_sequence: - assert len(end_sequence) <= 2*2, "ERROR: invalid end sequence" - assert i == len(schedule), "ERROR: schedule could not be compacted %d / %d" %(i, len(schedule)) - - return ( - start_sequence, - loop_counter, - loop_sequence_1_counter, - loop_sequence_1, - loop_sequence_2, - end_sequence - ) + # choose implementation style + if (mmv_out > 1 or (k_h==1 and k_w==1)): + impl_style = "parallel" + else: + impl_style = "default" - f_debug.write("\n"+"max buffer size observed: %d" %(buffer_max_size)) - f_debug.write("\n"+"output vector elements: relative buffer indices") - f_debug.write("\n"+str(idx_px_relative)) - f_debug.write("\n"+"output vector elements: absolute buffer address") - f_debug.write("\n"+str(idx_px_addr)) - f_debug.write("\n"+"output vector elements: absolute buffer address increment from last") - f_debug.write("\n"+str(idx_px_addr_incr)) - f_debug.write("\n"+"output vector elements: relative buffer address (from head)") - f_debug.write("\n"+str(idx_px_addr_rel)) - f_debug.write("\n"+"buffer write schedule (%d cycles)" % len(schedule_write)) - f_debug.write("\n"+str(schedule_write)) - f_debug.write("\n"+"writing buffer in %d cycles" % schedule_write.count(1)) - #f_debug.write("\n"+"buffer write schedule COMPRESSED") - #f_debug.write("\n"+str(schedule_write_compressed)) - #f_debug.write("\n"+"buffer write schedule ANALYZED") - #f_debug.write("\n"+str(analyse_schedule(schedule_write))) - f_debug.write("\n"+"buffer read schedule (%d cycles)" % len(schedule_read)) - f_debug.write("\n"+str(schedule_read)) - f_debug.write("\n"+"reading buffer in %d cycles" % schedule_read.count(1)) - - 
#f_debug.write("\n"+"buffer shift schedule (%d cycles)" % len(schedule_shift)) - #f_debug.write("\n"+str(schedule_shift)) - #f_debug.write("\n"+"shifting buffer in %d cycles" % schedule_shift.count(1)) - #f_debug.write("\n"+"buffer read schedule COMPRESSED") - #f_debug.write("\n"+str(schedule_read_compressed)) - #f_debug.write("\n"+"buffer read schedule ANALYZED") - #f_debug.write("\n"+str(analyse_schedule(schedule_read))) - - addr_incr_end_window_elem = 0 - addr_incr_end_window_row = 0 - addr_incr_end_window = 0 - addr_incr_end_row = 0 + ##### END INITIALIZE/CHECK CONFIGURATION ##### + ##### BEGIN CODE GEN FOR DEFAULT STYLE ##### if (impl_style == "default"): - f_debug.write("\n"+"mmv_out = 1: computing incremental addressing scheme directly:") - addressing_scheme = [[0]] + # Default implementation style for MMV_out = 1: addressable cyclic buffer + # Computing incremental addressing scheme directly.. # compute index/address increments for each nested loop channel_factor = int(ifm_ch/simd) - #todo: rename to (min) buffer len - buffer_max_size = buffer_max_size * channel_factor + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h-1) * dilation_h * w + (k_w-1) * dilation_w + 1) * channel_factor kernel_width = (k_w-1)*dilation_w+1 # incl. dilation addr_incr_end_simd = 1 @@ -942,17 +502,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): skip_lines = (dilation_h-1) * w * channel_factor addr_incr_end_window_row = remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer - #addr_incr_end_window = stride_w * channel_factor + 1 # 1 = wrap around of minimally sized buffer - addr_incr_end_window = -buffer_max_size + stride_w * channel_factor + 1 # 1 = wrap around of minimally sized buffer + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 # 1 = wrap around of minimally sized buffer # rows that are skipped due to imperfect stride<->W combination skip_columns = w%(kernel_width + (out_dim_w-1)*stride_w) remaining_line = (skip_columns + kernel_width) * channel_factor # increment from oldest buffer position (top left) to end of line skip_lines = (stride_h-1) * w * channel_factor - #addr_incr_end_row = remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer - addr_incr_end_row = -buffer_max_size + remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer - - + addr_incr_end_row = -buffer_min_size + remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer if (depthwise): addr_incr_end_window_elem = dilation_w * channel_factor @@ -960,85 +516,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): + (w - kernel_width) * channel_factor + (dilation_h-1) * w * channel_factor ) - addr_incr_end_simd = -buffer_max_size + (channel_factor + 1) - #addr_incr_end_simd = channel_factor + 1 - - # just for testing: - for i_windows_per_h in range(out_dim_h): # LOOP_H - for i_windows_per_w in range(out_dim_w): # LOOP_W - for i_simd_per_px in range(channel_factor): # LOOP_SIMD - for i_px_per_window_h in range(k_h): # LOOP_KH - for i_px_per_window_w in range(k_w-1): # LOOP_KW - addressing_scheme[0].append(addr_incr_end_window_elem) - if i_px_per_window_h != k_h-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_window_row) - if i_simd_per_px != channel_factor-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_simd) - if i_windows_per_w != out_dim_w-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_window) - if 
i_windows_per_h != out_dim_h-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_row) - else: - # just for testing: - for i_windows_per_h in range(out_dim_h): # LOOP_H - for i_windows_per_w in range(out_dim_w): # LOOP_W - for i_px_per_window_h in range(k_h): # LOOP_KH - for i_px_per_window_w in range(k_w): # LOOP_KW - for i_simd_per_px in range(channel_factor-1): # LOOP_SIMD - addressing_scheme[0].append(addr_incr_end_simd) - if i_px_per_window_w != k_w-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_window_elem) - if i_px_per_window_h != k_h-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_window_row) - if i_windows_per_w != out_dim_w-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_window) - if i_windows_per_h != out_dim_h-1: # skip on last iteration - addressing_scheme[0].append(addr_incr_end_row) - - f_debug.write("\n"+str(np.array(addressing_scheme))) - if simd == ifm_ch: - # simd < c currently not simulated - if (np.array(addressing_scheme) == idx_px_addr_incr).all: - f_debug.write("\n"+"computed addressing matches simulated addressing") - else: - f_debug.write("\n"+"ERROR") - else: - f_debug.write("\n"+"found %d buffer access patterns:" % len(buffer_access_patterns)) - f_debug.write("\n"+str(buffer_access_patterns)) - f_debug.write("\n"+"required parallel-access registers for mmv_out=k: %d" % len(sum(buffer_access_patterns,[]))) - f_debug.write("\n"+"buffer rw schedule NEW") - f_debug.write("\n"+str(schedule)) - f_debug.write("\n"+"buffer rw schedule NEW compacted") - f_debug.write("\n"+"\nstart_sequence: %s\nloop_counter: %s\nloop_sequence_1_counter: %s\nloop_sequence_1: %s\nloop_sequence_2: %s\nend_sequence: %s\n" % compact_schedule(schedule)) - assert len(schedule_write) == len(schedule_read), "ERROR: Schedules have different lenghts" - assert schedule_write.count(1) == self.get_number_input_values(), "ERROR: Writing buffer in fewer cycles than expected" - assert schedule_read.count(1) == self.get_number_output_values(), "ERROR: Reading buffer in fewer cycles than expected" - cycles_total = len(schedule_write) - - - - code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] - #save top module name so we can refer to it even after this node has been renamed (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) - self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) - code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] - code_gen_dict["$SIMD$"] = [str(simd)] - code_gen_dict["$MMV_IN$"] = [str(mmv_in)] - code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] - - - ram_style = self.get_nodeattr("ram_style") - if ram_style == "auto": - code_gen_dict["$RAM_STYLE$"]=[""] - else: - code_gen_dict["$RAM_STYLE$"]=["(* ram_style = \"{}\" *)".format(ram_style)] - - if (impl_style == "default"): - ### MMVout = 1: addressable buffer implementation style - f_debug.write("\n"+"Choosing implementation style: Addressable buffer due to mmv_out=1") + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) # add additional buffer space in case of stride > 1 # this minimizes cycle count, as it allows an earlier pre-load of skipped input elements - buffer_actual_size = (buffer_max_size + max(0,((stride_w-1) - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor) + buffer_actual_size = (buffer_min_size + max(0,((stride_w-1) - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor) + max(0,((stride_h-1)*w - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor)) code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] @@ -1068,29 +550,39 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): addr_incr_end_window_elem = addr_incr_end_window_row addr_incr_end_window_row = addr_incr_end_simd_ elem_per_window = k_h*k_w - + + tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor + tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor + tail_incr_last_window = buffer_min_size-1 code_gen_dict["$TAIL_INCR_GENERATION$"] = [""" always @ (counter_loop_kh, counter_loop_w, counter_loop_h) begin if (counter_loop_kh != 0) - tail_incr = 1; + tail_incr_reg = 1; else if (counter_loop_w != 0) - tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_W]-{channel_factor}+{buffer_min_size}; - else // do not check for counter_loop_h to increment past LAST_WRITE_ELEM during last window - tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_H]-{channel_factor}+{buffer_min_size}; + tail_incr_reg = {}; + else if (counter_loop_h != 0) + tail_incr_reg = {}; + else + tail_incr_reg = {}; end - """.format(channel_factor=channel_factor, buffer_min_size=buffer_max_size)] + """.format(tail_incr_w, tail_incr_h, tail_incr_last_window)] else: # depthwise output format is equivalent to non-depthwise if SIMD=C elem_per_window = k_h*k_w*channel_factor + tail_incr_w = addr_incr_end_window + buffer_min_size - 1 + tail_incr_h = addr_incr_end_row + buffer_min_size - 1 + tail_incr_last_window = buffer_min_size-1 code_gen_dict["$TAIL_INCR_GENERATION$"] = [""" always @ (counter_loop_w, counter_loop_h) begin if (counter_loop_w != 0) - tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_W]-1+{buffer_min_size}; - else // do not check for counter_loop_h to increment past LAST_WRITE_ELEM during last window - tail_incr = ADDR_INCREMENT_MAP[STATE_LOOP_H]-1+{buffer_min_size}; + tail_incr_reg = {}; + else if (counter_loop_h != 0) + tail_incr_reg = {}; + else + tail_incr_reg = {}; end - """.format(buffer_min_size=buffer_max_size)] + """.format(tail_incr_w, tail_incr_h, tail_incr_last_window)] # support SIMD = C and k_w = 1 cases # for k = [k_h, k_w] = [1, k_w], no adjustment is needed @@ -1115,22 +607,215 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): code_gen_dict["$LOOP_KW_ITERATIONS$"]=[str(loop_kw_iterations-1)] code_gen_dict["$LOOP_SIMD_ITERATIONS$"]=[str(loop_simd_iterations-1)] - w = 32 #ToDo: minimize 
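The hard-coded 32-bit increment width removed here is replaced below by a minimal signed width, INCR_BITWIDTH = 1 + ceil(log2(max|incr| + 1)). A quick numeric check of that formula (increment values invented for illustration):

    import math

    incrs = [1, -25, 3, -7, 12]  # example signed increments
    incr_bitwidth = 1 + math.ceil(math.log2(max(abs(i) + 1 for i in incrs)))
    # max |i| + 1 = 26 -> ceil(log2(26)) = 5 magnitude bits, +1 sign bit = 6
    assert incr_bitwidth == 6
    # 6 bits cover the signed range [-32, 31], which holds every increment
    assert all(-(1 << 5) <= i < (1 << 5) for i in incrs)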
- code_gen_dict["$ADDR_INCREMENT_MAP$"]=["'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(w, - int(copysign(w,addr_incr_end_simd)),abs(addr_incr_end_simd), - int(copysign(w,addr_incr_end_window_elem)),abs(addr_incr_end_window_elem), - int(copysign(w,addr_incr_end_window_row)),abs(addr_incr_end_window_row), - int(copysign(w,addr_incr_end_window)),abs(addr_incr_end_window), - int(copysign(w,addr_incr_end_row)),abs(addr_incr_end_row))] + incr_bitwidth = 1 + math.ceil(math.log2(max(abs(addr_incr_end_simd)+1, + abs(addr_incr_end_window_elem)+1, + abs(addr_incr_end_window_row)+1, + abs(addr_incr_end_window)+1, + abs(addr_incr_end_row)+1, + abs(tail_incr_w)+1, + abs(tail_incr_h)+1, + abs(tail_incr_last_window)+1))) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$ADDR_INCREMENT_MAP$"]=["'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(incr_bitwidth, + int(copysign(incr_bitwidth,addr_incr_end_simd)),abs(addr_incr_end_simd), + int(copysign(incr_bitwidth,addr_incr_end_window_elem)),abs(addr_incr_end_window_elem), + int(copysign(incr_bitwidth,addr_incr_end_window_row)),abs(addr_incr_end_window_row), + int(copysign(incr_bitwidth,addr_incr_end_window)),abs(addr_incr_end_window), + int(copysign(incr_bitwidth,addr_incr_end_row)),abs(addr_incr_end_row))] code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)] - with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template_mmv_1.v", "r") as f: + with open("/workspace/finn/finn-rtllib/swg/swg_template_default.sv", "r") as f: template = f.read() - else: - f_debug.write("\n"+"Choosing implementation style: Parallel Registers (+ line buffers) due to mmv_out>1") + + ##### END CODE GEN FOR DEFAULT STYLE ##### + + ##### BEGIN CODE GEN FOR PARALLEL STYLE ##### + elif (impl_style == "parallel"): + # Out width > In width: Parallel implementation style using registers + line buffers + idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw( + in_shape, + k_h, + k_w, + pad, + stride_h, + stride_w, + dilation_h, + dilation_w + ) + + cols = in_image_padded[:, idx_c, idx_h, idx_w] + cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1) + + # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW + out_image = cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w) + # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1) + out_image = out_image.transpose(0, 4, 5, 2, 3, 1) + out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c) + + idx_px = idx_h*w+idx_w # sequential pixel indices + + k, cycles = idx_px.shape + + output_elements = mmv_out + output_cycles = int(cycles/(mmv_out/k)) + + # TODO: what happens when output_cycles=OFMdim % M != 0 + # ...try to support IFMdim % M != 0 first, so we can work with the usual k=3 where OFMdim = IFMdim - -2 + # the additional garbage input elements that are read in the last cycle are not read by any window anyway + idx_px = idx_px.transpose() + idx_px = idx_px.reshape(output_cycles, output_elements) + idx_px = idx_px.transpose() + # result: first dim is number of parallel output elements, + # second dim is the input element (pixel in case of SIMD=C) index that each output element outputs per cycle + + buffer = [] + buffer_max_size = 0 + schedule = [] + next_in_px = 0 + oldest_px = 0 + + def schedule_append(schedule, op): + if len(schedule) > 0 and schedule[-1][1] == op: + count, op_ = schedule[-1] + schedule[-1] = (count+1, op_) + else: + schedule.append((1, op)) + return schedule + + # compute schedule and buffer read pattern (output driven) + 
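Before the schedule computation below, a self-contained example of the run-length-encoded format that schedule_append (defined above) produces; consecutive identical commands are merged into (count, command) tuples:

    schedule = []
    for op in ['w', 'w', 'w', 'wr', 'wr', 'r']:
        schedule = schedule_append(schedule, op)
    assert schedule == [(3, 'w'), (2, 'wr'), (1, 'r')]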
idx_px_relative = idx_px.copy() + output_elem, output_cycles = idx_px_relative.shape + + for x in range(output_cycles): + # load missing inputs into buffer + for y in range(output_elem): + while int(idx_px_relative[y,x]) not in buffer: + # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later) + for m in range(M): + buffer.append(next_in_px) + next_in_px += 1 + schedule = schedule_append(schedule,'w') + + # discard unused buffer elements + oldest_px = np.min(idx_px_relative[:,x:]) + #check whether M elements can be shifted out, not just the single oldest one + #while all([buffer[i] < oldest_px for i in range(M)]): + if all([buffer[i] < oldest_px for i in range(M)]): + # M buffer elements are shifted out at once + for m in range(M): + buffer.pop(0) + + # adjust relative buffer index of current x (according to last discarded buffer elements) + for y in range(output_elem): + idx_px_relative[y,x] -= oldest_px + + # read from buffer + # + simultaneously load next pixel(s) into buffer if there are any left + if (next_in_px > (h_padded*w_padded-1)): + # read only (append above) + schedule = schedule_append(schedule,'r') + else: + # load M inputs at once + for m in range(M): + buffer.append(next_in_px) + next_in_px += 1 + schedule = schedule_append(schedule,'wr') + + # record max needed buffer depth + if len(buffer) > buffer_max_size: + buffer_max_size = len(buffer) + + # insert dummy write operations for data at the input FM tail-end that is never read (e.g. in case of stride > 1) + while next_in_px <= (h_padded*w_padded-1): + next_in_px += 1 + schedule = schedule_append(schedule,'w') + + # find buffer access patterns + buffer_access_patterns = [] + for x in range(output_cycles): + if idx_px_relative[:,x].tolist() not in buffer_access_patterns: + buffer_access_patterns.append(idx_px_relative[:,x].tolist()) + + # Experimental implementation to map fixed controller loop structure to R/W schedule by analyzing + # the access pattern given by Im2Col, rather than direct computation. + # TODO: Probably replace this with a directly-computed schedule, similar to the default implementation style. 
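To make the intent of compact_schedule (defined next) concrete: it folds the run-length-encoded schedule into the fixed pre-load / steady-state-loop / epilogue structure expected by the parallel controller. A hand-traced example (input values invented for illustration):

    # pre-load, then a (1,'wr'),(2,'r') steady state repeated 3 times, then a tail
    schedule = [(4, 'w'),
                (1, 'wr'), (2, 'r'),
                (1, 'wr'), (2, 'r'),
                (1, 'wr'), (2, 'r'),
                (3, 'w')]
    assert compact_schedule(schedule) == (
        (4, 'w'),           # start_sequence
        0,                  # loop_counter (no outer loop repetition here)
        3,                  # loop_sequence_1_counter
        (1, 'wr', 2, 'r'),  # loop_sequence_1
        None,               # loop_sequence_2
        (3, 'w'),           # end_sequence
    )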
+ def compact_schedule(schedule): + # leave first sequence (pre-load) as is + start_sequence = schedule[0] + loop_sequence_1_counter = 1 + loop_sequence_1 = schedule[1] + loop_counter = 0 + loop_sequence_2 = None + end_sequence = None + + i = 2 + if i < len(schedule): + loop_sequence_1 += schedule[i] + i += 1 + while i+1 < len(schedule): + candidate = schedule[i] + schedule[i+1] + if candidate == loop_sequence_1: + loop_sequence_1_counter += 1 + i += 2 + else: + break + + if i < len(schedule): + loop_sequence_2 = schedule[i] + i += 1 + if i+1 < len(schedule): + candidate = schedule[i] + schedule[i+1] + if candidate != loop_sequence_1: + loop_sequence_2 += schedule[i] + i -= 1 + loop_sequence_total_len = (int(len(loop_sequence_2)/2)) + loop_sequence_1_counter*(int(len(loop_sequence_1)/2)) + loop_sequence_total = loop_sequence_2 + loop_sequence_1_counter*loop_sequence_1 + while i+loop_sequence_total_len < len(schedule): + candidate = schedule[i] + for x in range (i+1, i+loop_sequence_total_len): + candidate += schedule[x] + + if candidate == loop_sequence_total: + loop_counter += 1 + i += loop_sequence_total_len + else: + break + else: + if i < len(schedule): + end_sequence = loop_sequence_2 + schedule[i] + i += 1 + loop_sequence_2 = None + else: + end_sequence = loop_sequence_2 + loop_sequence_2 = None + + if i < len(schedule): + end_sequence = schedule[i] + i += 1 + if i < len(schedule): + end_sequence = end_sequence + schedule[i] + i += 1 + + assert len(start_sequence) == 1*2, "ERROR: invalid start sequence" + assert len(loop_sequence_1) == 2*2, "ERROR: invalid loop 1 sequence" + if loop_sequence_2: + assert len(loop_sequence_2) <= 2*2, "ERROR: invalid loop 2 sequence" + if end_sequence: + assert len(end_sequence) <= 2*2, "ERROR: invalid end sequence" + assert i == len(schedule), "ERROR: schedule could not be compacted %d / %d" %(i, len(schedule)) + + return (start_sequence, loop_counter, loop_sequence_1_counter, + loop_sequence_1, loop_sequence_2, end_sequence) + ### determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers) - # ToDo: this part doesn't fully account for M (2D buffer) yet + # TODO: this part doesn't fully account for M for 2D buffers yet + + # how many "unused" registers are allowed between buffer positions that will be accessed in parallel + # example: + # 0: only consecutive access patterns will be implemented in regs, rest in (LUTRAM/BRAM) line buffers + # 2: [0, 3, 6] access pattern is still allowed and will be implemented with one 7-position shift reg + REG_BRAM_THRESHOLD = 8 code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)] @@ -1147,7 +832,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): current.append(access_idx) else: # assume non-decreasing index order in access pattern - # ToDo: this assumption does not hold for M>1 case (2D buffer) + # TODO: this assumption does not hold for M>1 case (2D buffer) distance = access_idx - max(current) if not (distance-1 > REG_BRAM_THRESHOLD): for i in range(distance-1): @@ -1161,20 +846,14 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): bram_fifos_depth.append(math.ceil((distance-1)/M)) # really ceil? 
# start with new REG FIFO reg_fifos.append(current) - #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) #ToDo: fix for M again + #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # fix for M again reg_fifos_depth.append(len(current)) current = [] current.append(access_idx) reg_fifos.append(current) - #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) #ToDo fix for M again + #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # fix for M again reg_fifos_depth.append(len(current)) - f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD) - f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos)) - f_debug.write("\n"+str(reg_fifos)) - f_debug.write("\n"+"%d BRAM FIFOs (line buffers):" % len(bram_fifos)) - f_debug.write("\n"+str(bram_fifos)) - code_gen_dict["$GENERATE_REG_FIFOS$"] = [] for i in range(len(reg_fifos)): code_gen_dict["$GENERATE_REG_FIFOS$"].append( @@ -1274,6 +953,9 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): loop_sequence_2 = convert_tuple(loop_sequence_2) end_sequence = convert_tuple(end_sequence) + cycles_total = 0 + for t in schedule: + cycles_total += t[0] code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)] code_gen_dict["$START_COUNTER$"]=[str(start_sequence[0])] @@ -1294,11 +976,28 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): code_gen_dict["$WRITE_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format( start_sequence[1][1],loop_sequence_1[1][1],loop_sequence_1[3][1],loop_sequence_2[1][1],loop_sequence_2[3][1],end_sequence[1][1],end_sequence[3][1])] - with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template.v", "r") as f: + with open("/workspace/finn/finn-rtllib/swg/swg_template_parallel.sv", "r") as f: template = f.read() + + ##### END CODE GEN FOR PARALLEL STYLE ##### + + ##### BEGIN GENERAL CODE GEN ##### + code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it even after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] - - with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template_wrapper.v", "r") as f: + ram_style = self.get_nodeattr("ram_style") + if ram_style == "auto": + code_gen_dict["$RAM_STYLE$"]=[""] + else: + code_gen_dict["$RAM_STYLE$"]=["(* ram_style = \"{}\" *)".format(ram_style)] + + with open("/workspace/finn/finn-rtllib/swg/swg_template_wrapper.v", "r") as f: template_wrapper = f.read() for key in code_gen_dict: @@ -1310,22 +1009,21 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"), "w") f.write(template) f.close() - f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w") f.write(template_wrapper) f.close() - - f_debug.close() + #f_debug.close() #set ipgen_path and ip_path so that HLS-Synth transformation and stich_ip transformation do not complain self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) + ##### END GENERAL CODE GEN ##### def prepare_rtlsim(self): """Creates a Verilator emulation library for the RTL code generated for this node, sets the rtlsim_so attribute to its path and returns a PyVerilator wrapper around it.""" - #modified to use generated verilog instead of HLS output products + # Modified to use generated (System-)Verilog instead of HLS output products if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") @@ -1374,4 +1072,4 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): pass def compile_singlenode_code(self): - pass \ No newline at end of file + pass diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py index 870f5593b..01133dc5f 100755 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -148,21 +148,31 @@ def prepare_inputs(input_tensor): # input datatype @pytest.mark.parametrize("idt", [DataType["UINT4"]]) + +# @pytest.mark.parametrize( +# "conv_config", +# [ +# [[12,12], [3, 3], [1, 1], [1, 1]], +# [[13,13], [3, 3], [1, 1], [1, 1]], +# [[12,12], [3, 3], [2, 2], [1, 1]], +# [[13,13], [3, 3], [2, 2], [1, 1]], +# ], +# ) # kernel size -@pytest.mark.parametrize("k", [[3,3]]) +@pytest.mark.parametrize("k", [[1,1],[2,2],[3,3],[4,5],[1,3]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [[24,24]]) +@pytest.mark.parametrize("ifm_dim", [[8,8],[13,13],[1,12]]) # input channels -@pytest.mark.parametrize("ifm_ch", [8]) +@pytest.mark.parametrize("ifm_ch", [6]) # Stride -@pytest.mark.parametrize("stride", [[3,3],[6,6]]) +@pytest.mark.parametrize("stride", [[1,1],[2,2],[3,4]]) # Dilation -@pytest.mark.parametrize("dilation", [[1,1],[2,2]]) +@pytest.mark.parametrize("dilation", [[1,1],[2,2],[4,3]]) # depthwise @pytest.mark.parametrize("dw", [0,1]) # input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1,2,8]) +@pytest.mark.parametrize("simd", [1,2,3,6]) # in/out MMV ("M") @pytest.mark.parametrize("m", [1]) # paralle_window enable (MMV_out = M*K) @@ -175,7 +185,14 @@ def prepare_inputs(input_tensor): def test_fpgadataflow_slidingwindow_rtl( idt, k, ifm_dim, 
ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip
 ):
+    #ifm_dim = conv_config[0]
+    #k = conv_config[1]
+    #stride = conv_config[2]
+    #dilation = conv_config[3]
+
     if flip:
+        if (ifm_dim[0]==ifm_dim[1] and k[0]==k[1] and stride[0]==stride[1] and dilation[0] == dilation[1]):
+            pytest.skip("Dimension flip would have no effect")
         k = k[::-1]
         ifm_dim = ifm_dim[::-1]
         stride = stride[::-1]
@@ -186,8 +203,21 @@ def test_fpgadataflow_slidingwindow_rtl(
     stride_h, stride_w = stride
     dilation_h, dilation_w = dilation
 
+    kernel_width = (k_w-1)*dilation_w+1 # incl. dilation
+    kernel_height = (k_h-1)*dilation_h+1 # incl. dilation
+
     if simd > ifm_ch:
         pytest.skip("SIMD cannot be larger than number of input channels")
+    if ifm_ch % simd != 0:
+        pytest.skip("SIMD must divide number of input channels")
+    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
+        pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension")
+    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
+        pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension")
+    if (k_h==1 and (stride_h!=1 or dilation_h!=1)) or (k_w==1 and (stride_w!=1 or dilation_w!=1)):
+        pytest.skip("Illegal convolution configuration: stride or dilation defined for unitary kernel dim")
+    if k_h==1 and k_w==1 and simd != ifm_ch:
+        pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)")
 
     ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
     ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
--
GitLab
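For reference when reading the parametrized configurations above: the expected output dimensions follow the usual convolution arithmetic. A standalone re-implementation of the compute_conv_output_dim helper used by the test, for a quick sanity check (zero padding, as in the test):

    def conv_out_dim(ifm_dim, k, stride, total_pad, dilation):
        return (ifm_dim + total_pad - dilation * (k - 1) - 1) // stride + 1

    # one of the combinations above: ifm_dim=13, k=3, stride=2, dilation=2
    assert conv_out_dim(13, 3, 2, 0, 2) == 5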