From f46e2d0a79a6a19cd09e4ee3d0503d81a42cc87e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch <felix.jentzsch@upb.de> Date: Thu, 25 Aug 2022 00:16:02 +0200 Subject: [PATCH] Restructure, basic resource estimation --- finn-rtllib/swg/swg_template_parallel.sv | 74 +- .../convolutioninputgenerator_rtl.py | 1474 +++++++++-------- ...est_fpgadataflow_convinputgenerator_rtl.py | 81 +- 3 files changed, 915 insertions(+), 714 deletions(-) diff --git a/finn-rtllib/swg/swg_template_parallel.sv b/finn-rtllib/swg/swg_template_parallel.sv index 7c1e04222..19638d8a1 100755 --- a/finn-rtllib/swg/swg_template_parallel.sv +++ b/finn-rtllib/swg/swg_template_parallel.sv @@ -3,13 +3,15 @@ module $TOP_MODULE_NAME$_controller ( CLK, - cycle, + RST, + advance, cmd_read, cmd_write ); input CLK; -input [31:0] cycle; //todo: minimize width or switch to single bit flag +input RST; +input advance; output cmd_read; output cmd_write; @@ -39,10 +41,6 @@ integer counter_loop_inter; assign cmd_read = READ_CMD_MAP[state_next]; //read command indicates read in *upcoming* cycle, due to how schedule is constructed assign cmd_write = WRITE_CMD_MAP[state]; -reg cycle_last; -wire cycle_advance; -assign cycle_advance = !(cycle == cycle_last); - //combinational next state logic always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin state_next = state; //default @@ -67,7 +65,7 @@ always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin if (LOOP_END_1_COUNTER != 0) state_next = STATE_END_1; else - state_next = STATE_START; + state_next = STATE_LOOP_MAIN_2; //wait in current state until reset end end end @@ -91,49 +89,46 @@ always @ (state, counter_current, counter_loop_main, counter_loop_inter) begin if (LOOP_END_2_COUNTER != 0) state_next = STATE_END_2; else - state_next = STATE_START; + state_next = STATE_END_1; //wait in current state until reset end end STATE_END_2: if (counter_current == LOOP_END_2_COUNTER-1) - state_next = STATE_START; + state_next = STATE_END_2; //wait in current state until reset endcase end //sequential logic always @ (posedge CLK) begin - if (cycle == 0) begin - counter_current <= 0; + if (RST) begin + counter_current <= -1; counter_loop_main <= 0; counter_loop_inter <= 0; - cycle_last <= 0; state <= STATE_START; end else begin - cycle_last <= cycle; - state <= state_next; - - if (cycle_advance) begin + if (advance) begin counter_current <= counter_current+1; - end + state <= state_next; - if (state != state_next) begin - counter_current <= 0; + if (state != state_next) begin + counter_current <= 0; - //count up main loop upon re-entering this loop (not on first enter from start) - if ((state_next == STATE_LOOP_MAIN_1) && (state != STATE_START)) begin - if (counter_loop_main == LOOP_MAIN_COUNTER-1) begin - counter_loop_main <= 0; - end else begin - counter_loop_main <= counter_loop_main+1; + //count up main loop upon re-entering this loop (not on first enter from start) + if ((state_next == STATE_LOOP_MAIN_1) && (state != STATE_START)) begin + if (counter_loop_main == LOOP_MAIN_COUNTER-1) begin + counter_loop_main <= 0; + end else begin + counter_loop_main <= counter_loop_main+1; + end end - end - if (state_next == STATE_LOOP_INTER_1) begin - if (counter_loop_inter == LOOP_INTER_COUNTER) begin //no -1 because this counter marks the currently active iteration, not finished iterations - counter_loop_inter <= 0; - end else begin - counter_loop_inter <= counter_loop_inter+1; + if (state_next == STATE_LOOP_INTER_1) begin + if (counter_loop_inter == LOOP_INTER_COUNTER) 
begin //no -1 because this counter marks the currently active iteration, not finished iterations + counter_loop_inter <= 0; + end else begin + counter_loop_inter <= counter_loop_inter+1; + end end end end @@ -169,8 +164,8 @@ output [WIDTH*DEPTH-1:0] data_out; // File: shift_registers_1.v // //module shift_registers_1 (clk, clken, SI, SO); -//parameter WIDTH = 32; -//input clk, clken, SI; +//parameter WIDTH = 32; +//input clk, clken, SI; //output SO; //reg [WIDTH-1:0] shreg; // @@ -181,7 +176,7 @@ output [WIDTH*DEPTH-1:0] data_out; // begin // for (i = 0; i < WIDTH-1; i = i+1) // shreg[i+1] <= shreg[i]; -// shreg[0] <= SI; +// shreg[0] <= SI; // end //end //assign SO = shreg[WIDTH-1]; @@ -227,7 +222,7 @@ integer addr_w, addr_r; //todo: minimize width (as reg), make r addr depend on w $RAM_STYLE$ reg [WIDTH-1:0] ram [DEPTH-1:0]; -always @(posedge CLK) begin +always @(posedge CLK) begin if (RST == 1'b0) begin addr_w <= 0; addr_r <= 1; @@ -349,11 +344,15 @@ wire read_cmd; wire write_cmd; reg write_done; //keep track if W of current cycle was already completed, but we still wait on a R in the same cycle +wire controller_reset; +wire controller_advance; + $TOP_MODULE_NAME$_controller controller_inst ( .CLK(ap_clk), - .cycle(cycle), + .RST(controller_reset), + .advance(controller_advance), .cmd_read(read_cmd), .cmd_write(write_cmd) ); @@ -379,6 +378,9 @@ assign advance = read_ok || (!read_cmd && write_ok) || (!read_c //todo: if mmv_out < k: might not shift and/or write for multiple read_cmd cycles assign window_buffer_shift_enable = advance; +assign controller_reset = !ap_rst_n || ((cycle == CYCLES_TOTAL-1) && advance); +assign controller_advance = advance; + //assign I/O ports assign window_buffer_in = in0_V_V_TDATA; assign out_V_V_TDATA = window_buffer_out; diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py index 936954258..f1e0f53a7 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -27,21 +27,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
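The wrapper logic above replaces the free-running 32-bit cycle input with a RST/advance handshake, restarting the controller once the final scheduled cycle completes (controller_reset = !ap_rst_n || ((cycle == CYCLES_TOTAL-1) && advance)). A minimal Python model of that restart behavior, for intuition only (the function name and advance trace are hypothetical, not part of the patch):

def controller_restart_model(cycles_total, advance_trace):
    # Count how often the controller wraps around for a given advance trace.
    cycle = 0
    restarts = 0
    for advance in advance_trace:
        if not advance:
            continue  # controller holds its state when no read/write occurs
        if cycle == cycles_total - 1:
            restarts += 1  # mirrors: (cycle == CYCLES_TOTAL-1) && advance
            cycle = 0
        else:
            cycle += 1
    return restarts

assert controller_restart_model(4, [1, 1, 0, 1, 1, 1, 1, 1, 1]) == 2

Holding the terminal states until reset (rather than jumping back to STATE_START) is what makes this single-bit handshake sufficient to replace the wide cycle counter.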
import math -from math import copysign import numpy as np import os - +from math import copysign from qonnx.core.datatype import DataType from qonnx.custom_op.general import im2col from qonnx.custom_op.general.im2col import compute_conv_output_dim + from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -from finn.util.basic import ( - get_rtlsim_trace_depth, - make_build_dir, -) - try: from pyverilator import PyVerilator except ModuleNotFoundError: @@ -57,9 +53,124 @@ except ModuleNotFoundError: # * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) # * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) + +# helper functions for parallel mode buffer scheduling (to be superseded by improved implementation): + + +def schedule_append(schedule, op): + if len(schedule) > 0 and schedule[-1][1] == op: + count, op_ = schedule[-1] + schedule[-1] = (count + 1, op_) + else: + schedule.append((1, op)) + return schedule + + +def schedule_map_cmds(seq): + mapping = { + "w": ("1'b1", "1'b0"), + "r": ("1'b0", "1'b1"), + "wr": ("1'b1", "1'b1"), + "n": ("1'b0", "1'b0"), + } + if seq: + if len(seq) == 2: + return (seq[0], mapping[seq[1]], 0, mapping["n"]) + if len(seq) == 4: + return (seq[0], mapping[seq[1]], seq[2], mapping[seq[3]]) + else: + return (0, mapping["n"], 0, mapping["n"]) + + +def schedule_map_controller(schedule): + # Experimental implementation to map fixed controller loop structure to R/W schedule by analyzing + # the access pattern given by Im2Col, rather than direct computation. + # TODO: Probably replace this with a directly-computed schedule, similar to the default implementation style. 
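+    # Illustrative example (hypothetical schedule, not derived from the patch):
+    # a run-length-encoded schedule from schedule_append() such as
+    #   [(1, "w"), (2, "wr"), (1, "w"), (2, "wr"), (1, "r")]
+    # is folded into a fixed (start, N x loop_1, loop_2, end) structure that the
+    # RTL controller can replay with a handful of small counters.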
+ + # leave first sequence (pre-load) as is + start_sequence = schedule[0] + loop_sequence_1_counter = 1 + loop_sequence_1 = schedule[1] + loop_counter = 0 + loop_sequence_2 = None + end_sequence = None + + i = 2 + if i < len(schedule): + loop_sequence_1 += schedule[i] + i += 1 + while i + 1 < len(schedule): + candidate = schedule[i] + schedule[i + 1] + if candidate == loop_sequence_1: + loop_sequence_1_counter += 1 + i += 2 + else: + break + + if i < len(schedule): + loop_sequence_2 = schedule[i] + i += 1 + if i + 1 < len(schedule): + candidate = schedule[i] + schedule[i + 1] + if candidate != loop_sequence_1: + loop_sequence_2 += schedule[i] + i -= 1 + loop_sequence_total_len = ( + int(len(loop_sequence_2) / 2) + ) + loop_sequence_1_counter * (int(len(loop_sequence_1) / 2)) + loop_sequence_total = ( + loop_sequence_2 + loop_sequence_1_counter * loop_sequence_1 + ) + while i + loop_sequence_total_len < len(schedule): + candidate = schedule[i] + for x in range(i + 1, i + loop_sequence_total_len): + candidate += schedule[x] + + if candidate == loop_sequence_total: + loop_counter += 1 + i += loop_sequence_total_len + else: + break + else: + if i < len(schedule): + end_sequence = loop_sequence_2 + schedule[i] + i += 1 + loop_sequence_2 = None + else: + end_sequence = loop_sequence_2 + loop_sequence_2 = None + + if i < len(schedule): + end_sequence = schedule[i] + i += 1 + if i < len(schedule): + end_sequence = end_sequence + schedule[i] + i += 1 + + assert len(start_sequence) == 1 * 2, "ERROR: invalid start sequence" + assert len(loop_sequence_1) == 2 * 2, "ERROR: invalid loop 1 sequence" + if loop_sequence_2: + assert len(loop_sequence_2) <= 2 * 2, "ERROR: invalid loop 2 sequence" + if end_sequence: + assert len(end_sequence) <= 2 * 2, "ERROR: invalid end sequence" + assert i == len(schedule), "ERROR: schedule could not be compacted %d / %d" % ( + i, + len(schedule), + ) + + return ( + start_sequence, + loop_counter, + loop_sequence_1_counter, + loop_sequence_1, + loop_sequence_2, + end_sequence, + ) + + class ConvolutionInputGenerator_rtl(HLSCustomOp): """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator - (sliding window) function variants! ... """ + (sliding window) function variants! 
...""" def __init__(self, onnx_node): super().__init__(onnx_node) @@ -108,12 +219,12 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): M = self.get_nodeattr("M") assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" wf = int(ifm_ch / simd) - #folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) - #round up to support ifm_dim % M != 0 + # folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) + # round up to support ifm_dim % M != 0 if ifm_dim_w == 1: - folded_ishape = (1, math.ceil(ifm_dim_h/M), ifm_dim_w, wf, int(simd*M)) + folded_ishape = (1, math.ceil(ifm_dim_h / M), ifm_dim_w, wf, int(simd * M)) else: - folded_ishape = (1, ifm_dim_h, math.ceil(ifm_dim_w/M), wf, int(simd*M)) + folded_ishape = (1, ifm_dim_h, math.ceil(ifm_dim_w / M), wf, int(simd * M)) return folded_ishape def get_normal_output_shape(self): @@ -140,13 +251,25 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - if (self.get_nodeattr("parallel_window")): + if self.get_nodeattr("parallel_window"): wf = int((ifm_ch) // simd) - #folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + # folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) if ofm_dim_w == 1: - folded_oshape = (1, int(ofm_dim_h/M), ofm_dim_w, wf, k_h * k_w * int(simd*M)) + folded_oshape = ( + 1, + int(ofm_dim_h / M), + ofm_dim_w, + wf, + k_h * k_w * int(simd * M), + ) else: - folded_oshape = (1, ofm_dim_h, int(ofm_dim_w/M), wf, k_h * k_w * int(simd*M)) + folded_oshape = ( + 1, + ofm_dim_h, + int(ofm_dim_w / M), + wf, + k_h * k_w * int(simd * M), + ) else: wf = int((k_h * k_w * ifm_ch) // simd) folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) @@ -186,7 +309,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return in_width def get_outstream_width(self): - if (self.get_nodeattr("parallel_window")): + if self.get_nodeattr("parallel_window"): # feed all window pixels in parallel k_h, k_w = self.get_nodeattr("ConvKernelDim") return self.get_instream_width() * k_h * k_w @@ -205,25 +328,31 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return num_output_elems def get_exp_cycles(self): - # TODO: update simd = self.get_nodeattr("SIMD") + m = self.get_nodeattr("M") ifm_ch = self.get_nodeattr("IFMChannels") k = self.get_nodeattr("ConvKernelDim") ifm_dim = self.get_nodeattr("IFMDim") ofm_dim = self.get_nodeattr("OFMDim") stride = self.get_nodeattr("Stride") dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") ifm_dim_h, ifm_dim_w = ifm_dim ofm_dim_h, ofm_dim_w = ofm_dim k_h, k_w = k stride_h, stride_w = stride dilation_h, dilation_w = dilation - mmv = 1 + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation - if (self.get_nodeattr("parallel_window")): - exp_cycles = ifm_dim_w + 1 + impl_style = self.select_impl_style() + if impl_style == "parallel": + exp_cycles = self.get_number_input_values() + 2 else: - cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv + # based on 2D HLS SWG estimate + # FIXME: increase accuracy for newly supported parameter scenarios + cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / 1 cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) max_cycles = max(cycles_write_block, cycles_read_block) exp_cycles = ( @@ -233,15 +362,21 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return int(exp_cycles) 
def bram_estimation(self): - # TODO: update simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = np.prod(self.get_nodeattr("IFMDim")) - k = np.prod(self.get_nodeattr("ConvKernelDim")) - stride = np.prod(self.get_nodeattr("Stride")) ram_style = self.get_nodeattr("ram_style") + + impl_style = self.select_impl_style() + # call codegen preparation to populate self.buffer_depth + if impl_style == "default": + template_path, code_gen_dict = self.prepare_codegen_default() + elif impl_style == "parallel": + template_path, code_gen_dict = self.prepare_codegen_parallel() + + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.buffer_depth + if ram_style == "block" or ram_style == "auto": - ram_depth = ifm_dim * ifm_ch / simd + ram_depth = buffer_depth if ram_depth <= 512: ram_width = 36 elif ram_depth <= 1024: @@ -254,57 +389,37 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): ram_width = 2 else: ram_width = 1 - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) - * math.ceil(ifm_dim * ifm_ch / simd / ram_depth) - ) - ) + + ram_cascade_depth = math.ceil(buffer_depth / 16384) + ram_cascade_width = math.ceil(buffer_width / ram_width) + + return int(ram_cascade_depth * ram_cascade_width) else: return 0 def lut_estimation(self): - # TODO: update - # NOTE: not tested for correctness simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = np.prod(self.get_nodeattr("IFMDim")) - k = np.prod(self.get_nodeattr("ConvKernelDim")) - stride = np.prod(self.get_nodeattr("Stride")) ram_style = self.get_nodeattr("ram_style") + + impl_style = self.select_impl_style() + # call codegen preparation to populate self.buffer_depth + if impl_style == "default": + template_path, code_gen_dict = self.prepare_codegen_default() + elif impl_style == "parallel": + template_path, code_gen_dict = self.prepare_codegen_parallel() + + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.buffer_depth + if ram_style == "distributed": - ram_luts = int( - (k + stride) - * ( - simd - * self.get_input_datatype().bitwidth() - * math.ceil(ifm_dim * ifm_ch / simd / 64) - ) - ) + ram_luts = int(buffer_width * math.ceil(buffer_depth / 32)) else: ram_luts = 0 return 300 + ram_luts def uram_estimation(self): - # TODO: update - # NOTE: not tested for correctness - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = np.prod(self.get_nodeattr("IFMDim")) - k = np.prod(self.get_nodeattr("ConvKernelDim")) - stride = np.prod(self.get_nodeattr("Stride")) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / 64) - * math.ceil(ifm_dim * ifm_ch / simd / 4096) - ) - ) - else: - return 0 + # TODO: implement URAM estimation + return 0 def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") @@ -314,14 +429,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): folded_ishape = self.get_folded_input_shape() folded_oshape = self.get_folded_output_shape() - # TODO ensure codegen dir exists if mode == "cppsim": - #code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - raise Exception( - """cppsim not possible for RTL SWG""".format( - mode - ) - ) + raise Exception("""cppsim not possible for RTL SWG""".format(mode)) elif mode == "rtlsim": code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") else: @@ -335,10 +444,10 @@ 
class ConvolutionInputGenerator_rtl(HLSCustomOp): inp = context[node.input[0]] assert str(inp.dtype) == "float32", "Input datatype is not float32" # disable this check to allow for IFMdim % M != 0 case (see below) where input comes from MMV-output capable node - #assert ( + # assert ( # inp.shape == exp_ishape - #), """Input shape doesn't - #match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + # ), """Input shape doesn't + # match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" if self.get_input_datatype() == DataType["BIPOLAR"]: # store bipolar activations as binary inp = (inp + 1) / 2 @@ -349,11 +458,17 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): # pad test input stream to work when IFMdim % M != 0 # during normal operation, the AXI Stream should not care, in the last cycle garbage elements are read but not used # TODO: only works for 1D case - mmv_stream_padding_px = int((np.prod(folded_ishape) - np.prod(inp.shape)) / exp_ishape[-1]) - if exp_ishape [2] == 1: - inp = np.pad(inp, ((0,0),(0,mmv_stream_padding_px),(0,0),(0,0)), 'constant') + mmv_stream_padding_px = int( + (np.prod(folded_ishape) - np.prod(inp.shape)) / exp_ishape[-1] + ) + if exp_ishape[2] == 1: + inp = np.pad( + inp, ((0, 0), (0, mmv_stream_padding_px), (0, 0), (0, 0)), "constant" + ) else: - inp = np.pad(inp, ((0,0),(0,0),(0,mmv_stream_padding_px),(0,0)), 'constant') + inp = np.pad( + inp, ((0, 0), (0, 0), (0, mmv_stream_padding_px), (0, 0)), "constant" + ) # reshape input into folded form inp = inp.reshape(folded_ishape) # make copy before saving array @@ -391,633 +506,660 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): ), """Output shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" - def global_includes(self): - pass + def prepare_codegen_default(self): + # Default implementation style for MMV_out = 1: addressable cyclic buffer + # Computing incremental addressing scheme directly.. 
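+        # In short: the generator walks a 5-deep loop nest (H, W, KH, KW, SIMD)
+        # over a cyclic buffer; each loop boundary contributes one precomputed
+        # signed address increment (collected in $ADDR_INCREMENT_MAP$ below).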
+ template_path = ( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_default.sv" + ) + code_gen_dict = {} - def defines(self, var): - pass + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") - def read_npy_data(self): - pass + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) - def strm_decl(self): - pass + if self.get_nodeattr("parallel_window"): + mmv_in = M * 1 + mmv_out = M * k_h * k_w + else: + mmv_in = 1 + mmv_out = 1 - def docompute(self): - pass + # compute index/address increments for each nested loop + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + # add additional buffer space in case of stride > 1 + # this minimizes cycle count, as it allows an earlier pre-load of skipped input elements + buffer_actual_size = ( + buffer_min_size + + max( + 0, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor, + ) + + max( + 0, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + ) + self.buffer_depth = buffer_actual_size # for resource estimation + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. 
dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) - def dataoutstrm(self): - pass + # re-use same controller structure -> re-assign address increments for the dw case + if depthwise: + addr_incr_end_window_elem = dilation_w * channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, wrap logic doesn't account for this" + assert not ( + abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, wrap logic doesn't account for this" + + # set certain threshold indices to detect when reading/writing finishes + code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] + code_gen_dict["$LAST_WRITE_ELEM$"] = [ + str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1) + ] + + # default controller loop structure: # iterations (counters) map directly + loop_h_iterations = out_dim_h + loop_w_iterations = out_dim_w + loop_kh_iterations = k_h + loop_kw_iterations = k_w + loop_simd_iterations = channel_factor + + if depthwise and channel_factor > 1: + # re-arrange existing controller loop structure for depthwise convolutions + loop_kh_iterations = channel_factor + loop_kw_iterations = k_h + loop_simd_iterations = k_w + addr_incr_end_simd_ = addr_incr_end_simd + addr_incr_end_simd = addr_incr_end_window_elem + addr_incr_end_window_elem = addr_incr_end_window_row + addr_incr_end_window_row = addr_incr_end_simd_ + elem_per_window = k_h * k_w + + tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor + tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$TAIL_INCR_GENERATION$"] = [ + """ + always @ (counter_loop_kh, counter_loop_w, counter_loop_h) begin + if (counter_loop_kh >= 0) + tail_incr_reg = 1; + else if (counter_loop_w >= 0) + tail_incr_reg = {}; + else if (counter_loop_h >= 0) + tail_incr_reg = {}; + else + tail_incr_reg = {}; + end + """.format( + tail_incr_w, tail_incr_h, tail_incr_last_window + ) + ] + else: + # depthwise output format is equivalent to non-depthwise if SIMD=C + elem_per_window = k_h * k_w * channel_factor + + tail_incr_w = addr_incr_end_window + buffer_min_size - 1 + tail_incr_h = addr_incr_end_row + buffer_min_size - 1 + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$TAIL_INCR_GENERATION$"] = [ + """ + always @ (counter_loop_w, counter_loop_h) begin + if (counter_loop_w >= 0) + tail_incr_reg = {}; + else if 
(counter_loop_h >= 0) + tail_incr_reg = {}; + else + tail_incr_reg = {}; + end + """.format( + tail_incr_w, tail_incr_h, tail_incr_last_window + ) + ] + + # support SIMD = C and k_w = 1 cases + # for k = [k_h, k_w] = [1, k_w], no adjustment is needed + # for k = [k_h, k_w] = [1, 1], do not use this impl. style (mmv_out=K=1) + # innermost loop is executed at least once -> adjust if needed + if loop_simd_iterations == 1: + # skip innermost SIMD loop completely + if loop_kw_iterations == 1: + # skip innermost KW loop completely + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"] + loop_kh_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"] + loop_kw_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"] + loop_simd_iterations -= 1 # -1 because state is initial state + + code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 1)] + code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 1)] + code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 1)] + code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 1)] + code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 1)] + + incr_bitwidth = 1 + math.ceil( + math.log2( + max( + abs(addr_incr_end_simd) + 1, + abs(addr_incr_end_window_elem) + 1, + abs(addr_incr_end_window_row) + 1, + abs(addr_incr_end_window) + 1, + abs(addr_incr_end_row) + 1, + abs(tail_incr_w) + 1, + abs(tail_incr_h) + 1, + abs(tail_incr_last_window) + 1, + ) + ) + ) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$ADDR_INCREMENT_MAP$"] = [ + "'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format( + incr_bitwidth, + int(copysign(incr_bitwidth, addr_incr_end_simd)), + abs(addr_incr_end_simd), + int(copysign(incr_bitwidth, addr_incr_end_window_elem)), + abs(addr_incr_end_window_elem), + int(copysign(incr_bitwidth, addr_incr_end_window_row)), + abs(addr_incr_end_window_row), + int(copysign(incr_bitwidth, addr_incr_end_window)), + abs(addr_incr_end_window), + int(copysign(incr_bitwidth, addr_incr_end_row)), + abs(addr_incr_end_row), + ) + ] - def save_as_npy(self): - pass + code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)] + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] - def blackboxfunction(self): - pass + return template_path, code_gen_dict - def pragmas(self): - pass - - def generate_hdl(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - #f_debug = open(os.path.join(code_gen_dir, "swg_hdl_debuginfo.log"), "w") + def prepare_codegen_parallel(self): + # Parallel implementation style for MMV_out = K: + # mix of shift-registers (for parallel read) and line buffers (BRAM or LUTRAM) + # compute a static schedule by analyzing access pattern (from im2col function) + template_path = ( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_parallel.sv" + ) code_gen_dict = {} - ##### BEGIN INITIALIZE/CHECK CONFIGURATION ##### ifm_ch = self.get_nodeattr("IFMChannels") k = self.get_nodeattr("ConvKernelDim") ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") stride = self.get_nodeattr("Stride") dilation = self.get_nodeattr("Dilation") - depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") - n = 1 - h, w = ifm_dim - c = 1 # assume SIMD=C (parallelize across all channels) k_h, k_w = k - pad 
= [0,0,0,0] # padding happens in separate padding node for now - pad_val = 0 + h, w = ifm_dim + n = c = 1 # no need to consider fully-parallel C dimension + in_shape = (n, c, h, w) + pad = [0, 0, 0, 0] stride_h, stride_w = stride dilation_h, dilation_w = dilation - - in_shape = (n,c,h,w) #NCHW - in_image = np.empty(in_shape, dtype=int) in_image_padded = np.pad( in_image, ((0, 0), (0, 0), (pad[0], pad[2]), (pad[1], pad[3])), mode="constant", - constant_values=pad_val, + constant_values=0, ) in_shape_padded = in_image_padded.shape h_padded = in_shape_padded[2] w_padded = in_shape_padded[3] - pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) - # init folding config - simd = self.get_nodeattr("SIMD") - M = self.get_nodeattr("M") - if (self.get_nodeattr("parallel_window")): - mmv_in = M*1 - mmv_out = M*k_h*k_w - assert ifm_ch==simd, "Constraint violated: SIMD must be equal to C" + if self.get_nodeattr("parallel_window"): + mmv_in = M * 1 + mmv_out = M * k_h * k_w + assert ifm_ch == simd, "Constraint violated: SIMD must be equal to C" else: mmv_in = 1 mmv_out = 1 - assert ifm_ch%simd==0, "Constraint violated: SIMD must divide C" + assert ifm_ch % simd == 0, "Constraint violated: SIMD must divide C" - # TODO: check allowed hyperparams - # for 1D case: it does not matter if dummy dim is x or y - # TODO: move/duplicate these checks in corresponding convert_to_hls transformation (?) - - # choose implementation style - if (mmv_out > 1 or (k_h==1 and k_w==1)): - impl_style = "parallel" - else: - impl_style = "default" - - ##### END INITIALIZE/CHECK CONFIGURATION ##### - - ##### BEGIN CODE GEN FOR DEFAULT STYLE ##### - if (impl_style == "default"): - # Default implementation style for MMV_out = 1: addressable cyclic buffer - # Computing incremental addressing scheme directly.. - - # compute index/address increments for each nested loop - channel_factor = int(ifm_ch/simd) - - # compute minimal buffer length (assuming it holds 1 complete window) - buffer_min_size = ((k_h-1) * dilation_h * w + (k_w-1) * dilation_w + 1) * channel_factor - - kernel_width = (k_w-1)*dilation_w+1 # incl. 
dilation - addr_incr_end_simd = 1 - addr_incr_end_window_elem = (dilation_w-1) * channel_factor + 1 - - remaining_line = (w - kernel_width) * channel_factor - skip_lines = (dilation_h-1) * w * channel_factor - addr_incr_end_window_row = remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer - - addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 # 1 = wrap around of minimally sized buffer - - # rows that are skipped due to imperfect stride<->W combination - skip_columns = w%(kernel_width + (out_dim_w-1)*stride_w) - remaining_line = (skip_columns + kernel_width) * channel_factor # increment from oldest buffer position (top left) to end of line - skip_lines = (stride_h-1) * w * channel_factor - addr_incr_end_row = -buffer_min_size + remaining_line + skip_lines + 1 # 1 = wrap around of minimally sized buffer - - if (depthwise): - addr_incr_end_window_elem = dilation_w * channel_factor - addr_incr_end_window_row = (channel_factor - + (w - kernel_width) * channel_factor - + (dilation_h-1) * w * channel_factor - ) - addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) - - # add additional buffer space in case of stride > 1 - # this minimizes cycle count, as it allows an earlier pre-load of skipped input elements - buffer_actual_size = (buffer_min_size + max(0,((stride_w-1) - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor) - + max(0,((stride_h-1)*w - (int(mmv_out*k_h*k_w/mmv_in)))*channel_factor)) - code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] - - assert not(abs(addr_incr_end_window) > buffer_actual_size), "ERROR: W increment > buffer size, wrap logic doesn't account for this" - assert not(abs(addr_incr_end_row) > buffer_actual_size), "ERROR: H increment > buffer size, wrap logic doesn't account for this" - - kernel_width = (k_w-1)*dilation_w+1 # incl. dilation - kernel_height = (k_h-1)*dilation_h+1 # incl. 
dilation - skip_columns = w%(kernel_width + (out_dim_w-1)*stride_w) - skip_rows = h%(kernel_height + (out_dim_h-1)*stride_h) - code_gen_dict["$LAST_READ_ELEM$"] = [str(h*w*channel_factor-1)] - code_gen_dict["$LAST_WRITE_ELEM$"] = [str(((h - skip_rows - 1) * w + (w - skip_columns))*channel_factor -1)] - - loop_h_iterations = out_dim_h - loop_w_iterations = out_dim_w - loop_kh_iterations = k_h - loop_kw_iterations = k_w - loop_simd_iterations = channel_factor - - if (depthwise and channel_factor > 1): - # re-arrange existing controller loop structure for depthwise convolutions - loop_kh_iterations = channel_factor - loop_kw_iterations = k_h - loop_simd_iterations = k_w - addr_incr_end_simd_ = addr_incr_end_simd - addr_incr_end_simd = addr_incr_end_window_elem - addr_incr_end_window_elem = addr_incr_end_window_row - addr_incr_end_window_row = addr_incr_end_simd_ - elem_per_window = k_h*k_w - - tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor - tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor - tail_incr_last_window = buffer_min_size-1 - code_gen_dict["$TAIL_INCR_GENERATION$"] = [""" - always @ (counter_loop_kh, counter_loop_w, counter_loop_h) begin - if (counter_loop_kh >= 0) - tail_incr_reg = 1; - else if (counter_loop_w >= 0) - tail_incr_reg = {}; - else if (counter_loop_h >= 0) - tail_incr_reg = {}; - else - tail_incr_reg = {}; - end - """.format(tail_incr_w, tail_incr_h, tail_incr_last_window)] - else: - # depthwise output format is equivalent to non-depthwise if SIMD=C - elem_per_window = k_h*k_w*channel_factor - - tail_incr_w = addr_incr_end_window + buffer_min_size - 1 - tail_incr_h = addr_incr_end_row + buffer_min_size - 1 - tail_incr_last_window = buffer_min_size-1 - code_gen_dict["$TAIL_INCR_GENERATION$"] = [""" - always @ (counter_loop_w, counter_loop_h) begin - if (counter_loop_w >= 0) - tail_incr_reg = {}; - else if (counter_loop_h >= 0) - tail_incr_reg = {}; - else - tail_incr_reg = {}; - end - """.format(tail_incr_w, tail_incr_h, tail_incr_last_window)] - - # support SIMD = C and k_w = 1 cases - # for k = [k_h, k_w] = [1, k_w], no adjustment is needed - # for k = [k_h, k_w] = [1, 1], do not use this impl. 
style (mmv_out=K=1) - # innermost loop is executed at least once -> adjust if needed - if (loop_simd_iterations == 1): - # skip innermost SIMD loop completely - if (loop_kw_iterations == 1): - # skip innermost KW loop completely - code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_KH"] - loop_kh_iterations -= 1 # -1 because state is initial state - else: - code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_KW"] - loop_kw_iterations -= 1 # -1 because state is initial state - else: - code_gen_dict["$INNERMOST_STATE$"]=["STATE_LOOP_SIMD"] - loop_simd_iterations -= 1 # -1 because state is initial state - - code_gen_dict["$LOOP_H_ITERATIONS$"]=[str(loop_h_iterations-1)] - code_gen_dict["$LOOP_W_ITERATIONS$"]=[str(loop_w_iterations-1)] - code_gen_dict["$LOOP_KH_ITERATIONS$"]=[str(loop_kh_iterations-1)] - code_gen_dict["$LOOP_KW_ITERATIONS$"]=[str(loop_kw_iterations-1)] - code_gen_dict["$LOOP_SIMD_ITERATIONS$"]=[str(loop_simd_iterations-1)] - - incr_bitwidth = 1 + math.ceil(math.log2(max(abs(addr_incr_end_simd)+1, - abs(addr_incr_end_window_elem)+1, - abs(addr_incr_end_window_row)+1, - abs(addr_incr_end_window)+1, - abs(addr_incr_end_row)+1, - abs(tail_incr_w)+1, - abs(tail_incr_h)+1, - abs(tail_incr_last_window)+1))) - code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] - code_gen_dict["$ADDR_INCREMENT_MAP$"]=["'{{ {}'d0, {}'d{}, {}'d{}, {}'d{}, {}'d{}, {}'d{}}}".format(incr_bitwidth, - int(copysign(incr_bitwidth,addr_incr_end_simd)),abs(addr_incr_end_simd), - int(copysign(incr_bitwidth,addr_incr_end_window_elem)),abs(addr_incr_end_window_elem), - int(copysign(incr_bitwidth,addr_incr_end_window_row)),abs(addr_incr_end_window_row), - int(copysign(incr_bitwidth,addr_incr_end_window)),abs(addr_incr_end_window), - int(copysign(incr_bitwidth,addr_incr_end_row)),abs(addr_incr_end_row))] - - code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)] - - with open(os.environ['FINN_ROOT']+"/finn-rtllib/swg/swg_template_default.sv", "r") as f: - template = f.read() - - ##### END CODE GEN FOR DEFAULT STYLE ##### - - ##### BEGIN CODE GEN FOR PARALLEL STYLE ##### - elif (impl_style == "parallel"): - # Out width > In width: Parallel implementation style using registers + line buffers - idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw( - in_shape, - k_h, - k_w, - pad, - stride_h, - stride_w, - dilation_h, - dilation_w - ) + # Out width > In width: Parallel implementation style using registers + line buffers + idx_c, idx_h, idx_w = im2col.get_im2col_indices_nchw( + in_shape, k_h, k_w, pad, stride_h, stride_w, dilation_h, dilation_w + ) - cols = in_image_padded[:, idx_c, idx_h, idx_w] - cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1) - - # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW - out_image = cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w) - # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1) - out_image = out_image.transpose(0, 4, 5, 2, 3, 1) - out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c) - - idx_px = idx_h*w+idx_w # sequential pixel indices - - k, cycles = idx_px.shape - - output_elements = mmv_out - output_cycles = int(cycles/(mmv_out/k)) - - # TODO: what happens when output_cycles=OFMdim % M != 0 - # ...try to support IFMdim % M != 0 first, so we can work with the usual k=3 where OFMdim = IFMdim - -2 - # the additional garbage input elements that are read in the last cycle are not read by any window anyway - idx_px = idx_px.transpose() - idx_px = idx_px.reshape(output_cycles, output_elements) - idx_px = idx_px.transpose() - # result: 
first dim is number of parallel output elements, - # second dim is the input element (pixel in case of SIMD=C) index that each output element outputs per cycle - - buffer = [] - buffer_max_size = 0 - schedule = [] - next_in_px = 0 - oldest_px = 0 - - def schedule_append(schedule, op): - if len(schedule) > 0 and schedule[-1][1] == op: - count, op_ = schedule[-1] - schedule[-1] = (count+1, op_) - else: - schedule.append((1, op)) - return schedule - - # compute schedule and buffer read pattern (output driven) - idx_px_relative = idx_px.copy() - output_elem, output_cycles = idx_px_relative.shape - - for x in range(output_cycles): - # load missing inputs into buffer - for y in range(output_elem): - while int(idx_px_relative[y,x]) not in buffer: - # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later) - for m in range(M): - buffer.append(next_in_px) - next_in_px += 1 - schedule = schedule_append(schedule,'w') - - # discard unused buffer elements - oldest_px = np.min(idx_px_relative[:,x:]) - #check whether M elements can be shifted out, not just the single oldest one - #while all([buffer[i] < oldest_px for i in range(M)]): - if all([buffer[i] < oldest_px for i in range(M)]): - # M buffer elements are shifted out at once - for m in range(M): - buffer.pop(0) - - # adjust relative buffer index of current x (according to last discarded buffer elements) - for y in range(output_elem): - idx_px_relative[y,x] -= oldest_px - - # read from buffer - # + simultaneously load next pixel(s) into buffer if there are any left - if (next_in_px > (h_padded*w_padded-1)): - # read only (append above) - schedule = schedule_append(schedule,'r') - else: - # load M inputs at once + cols = in_image_padded[:, idx_c, idx_h, idx_w] + cols = cols.transpose(1, 2, 0).reshape(k_h * k_w * c, -1) + # result shape is (k_H*k_W*N, out_dim_H*out_dim_W), convert to NCHW + out_image = cols.reshape(n, c, k_h, k_w, out_dim_h, out_dim_w) + # (N=0,C=1,kh=2,kw=3,H=4,W=5) -> (N=0,H=4,W=5,kh=2,kw=3,C=1) + out_image = out_image.transpose(0, 4, 5, 2, 3, 1) + out_image = out_image.reshape(n, out_dim_h, out_dim_w, k_h * k_w * c) + idx_px = idx_h * w + idx_w # sequential pixel indices + k, cycles = idx_px.shape + output_elements = mmv_out + output_cycles = int(cycles / (mmv_out / k)) + + idx_px = idx_px.transpose() + idx_px = idx_px.reshape(output_cycles, output_elements) + idx_px = idx_px.transpose() + # result: first dim is number of parallel output elements, + # second dim is the input element (pixel in case of SIMD=C) index that each output element outputs per cycle + + buffer = [] + buffer_max_size = 0 + schedule = [] + next_in_px = 0 + oldest_px = 0 + + # compute schedule and buffer read pattern (output driven) + idx_px_relative = idx_px.copy() + output_elem, output_cycles = idx_px_relative.shape + + for x in range(output_cycles): + # load missing inputs into buffer + for y in range(output_elem): + while int(idx_px_relative[y, x]) >= next_in_px: + # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later) for m in range(M): buffer.append(next_in_px) next_in_px += 1 - schedule = schedule_append(schedule,'wr') - - # record max needed buffer depth - if len(buffer) > buffer_max_size: - buffer_max_size = len(buffer) - - # insert dummy write operations for data at the input FM tail-end that is never read (e.g. 
in case of stride > 1) - while next_in_px <= (h_padded*w_padded-1): - next_in_px += 1 - schedule = schedule_append(schedule,'w') - - # find buffer access patterns - buffer_access_patterns = [] - for x in range(output_cycles): - if idx_px_relative[:,x].tolist() not in buffer_access_patterns: - buffer_access_patterns.append(idx_px_relative[:,x].tolist()) - - # Experimental implementation to map fixed controller loop structure to R/W schedule by analyzing - # the access pattern given by Im2Col, rather than direct computation. - # TODO: Probably replace this with a directly-computed schedule, similar to the default implementation style. - def compact_schedule(schedule): - # leave first sequence (pre-load) as is - start_sequence = schedule[0] - loop_sequence_1_counter = 1 - loop_sequence_1 = schedule[1] - loop_counter = 0 - loop_sequence_2 = None - end_sequence = None - - i = 2 - if i < len(schedule): - loop_sequence_1 += schedule[i] - i += 1 - while i+1 < len(schedule): - candidate = schedule[i] + schedule[i+1] - if candidate == loop_sequence_1: - loop_sequence_1_counter += 1 - i += 2 - else: - break - - if i < len(schedule): - loop_sequence_2 = schedule[i] - i += 1 - if i+1 < len(schedule): - candidate = schedule[i] + schedule[i+1] - if candidate != loop_sequence_1: - loop_sequence_2 += schedule[i] - i -= 1 - loop_sequence_total_len = (int(len(loop_sequence_2)/2)) + loop_sequence_1_counter*(int(len(loop_sequence_1)/2)) - loop_sequence_total = loop_sequence_2 + loop_sequence_1_counter*loop_sequence_1 - while i+loop_sequence_total_len < len(schedule): - candidate = schedule[i] - for x in range (i+1, i+loop_sequence_total_len): - candidate += schedule[x] - - if candidate == loop_sequence_total: - loop_counter += 1 - i += loop_sequence_total_len - else: - break - else: - if i < len(schedule): - end_sequence = loop_sequence_2 + schedule[i] - i += 1 - loop_sequence_2 = None - else: - end_sequence = loop_sequence_2 - loop_sequence_2 = None - - if i < len(schedule): - end_sequence = schedule[i] - i += 1 - if i < len(schedule): - end_sequence = end_sequence + schedule[i] - i += 1 - - assert len(start_sequence) == 1*2, "ERROR: invalid start sequence" - assert len(loop_sequence_1) == 2*2, "ERROR: invalid loop 1 sequence" - if loop_sequence_2: - assert len(loop_sequence_2) <= 2*2, "ERROR: invalid loop 2 sequence" - if end_sequence: - assert len(end_sequence) <= 2*2, "ERROR: invalid end sequence" - assert i == len(schedule), "ERROR: schedule could not be compacted %d / %d" %(i, len(schedule)) - - return (start_sequence, loop_counter, loop_sequence_1_counter, - loop_sequence_1, loop_sequence_2, end_sequence) - - ### determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers) - # TODO: this part doesn't fully account for M for 2D buffers yet - - # how many "unused" registers are allowed between buffer positions that will be accessed in parallel - # example: - # 0: only consecutive access patterns will be implemented in regs, rest in (LUTRAM/BRAM) line buffers - # 2: [0, 3, 6] access pattern is still allowed and will be implemented with one 7-position shift reg - REG_BRAM_THRESHOLD = 8 - - code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)] - - assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static" - buf_static_access_pattern = buffer_access_patterns[0] - reg_fifos = [] - reg_fifos_depth = [] - bram_fifos = [] - bram_fifos_depth = [] - current = [] - for i in range(len(buf_static_access_pattern)): - access_idx = 
buf_static_access_pattern[i] - if len(current) == 0: + schedule = schedule_append(schedule, "w") + + # discard unused buffer elements + # FIXME: this is very slow for large feature maps (e.g., 4096x4096) + oldest_px = np.min(idx_px_relative[:, x:]) + # check whether M elements can be shifted out, not just the single oldest one + # while all([buffer[i] < oldest_px for i in range(M)]): + if all([buffer[i] < oldest_px for i in range(M)]): + # M buffer elements are shifted out at once + for m in range(M): + buffer.pop(0) + + # adjust relative buffer index of current x (according to last discarded buffer elements) + for y in range(output_elem): + idx_px_relative[y, x] -= oldest_px + + # read from buffer + # + simultaneously load next pixel(s) into buffer if there are any left + if next_in_px > (h_padded * w_padded - 1): + # read only (append above) + schedule = schedule_append(schedule, "r") + else: + # load M inputs at once + for m in range(M): + buffer.append(next_in_px) + next_in_px += 1 + schedule = schedule_append(schedule, "wr") + + # record max needed buffer depth + if len(buffer) > buffer_max_size: + buffer_max_size = len(buffer) + + # insert dummy write operations for data at the input FM tail-end that is never read (e.g. in case of stride > 1) + while next_in_px <= (h_padded * w_padded - 1): + next_in_px += 1 + schedule = schedule_append(schedule, "w") + + # add 1 extra cycle after final READ+WRITE cycle for transition b/w feature maps + if schedule[-1][1] == "wr": + schedule_append(schedule, "n") + + # find buffer access patterns + buffer_access_patterns = [] + for x in range(output_cycles): + if idx_px_relative[:, x].tolist() not in buffer_access_patterns: + buffer_access_patterns.append(idx_px_relative[:, x].tolist()) + + ### determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers) + # TODO: this part doesn't fully account for M>1 for 2D buffers yet + REG_BRAM_THRESHOLD = 8 + # how many "unused" registers are allowed between buffer positions that will be accessed in parallel + # example: + # 0: only consecutive access patterns will be implemented in regs, rest in (LUTRAM/BRAM) line buffers + # 2: [0, 3, 6] access pattern is still allowed and will be implemented with one 7-position shift reg + + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)] + self.buffer_depth = buffer_max_size # for resource estimation + + assert ( + len(buffer_access_patterns) == 1 + ), "ERROR: Buffer access pattern is not static" + buf_static_access_pattern = buffer_access_patterns[0] + reg_fifos = [] + reg_fifos_depth = [] + bram_fifos = [] + bram_fifos_depth = [] + current = [] + for i in range(len(buf_static_access_pattern)): + access_idx = buf_static_access_pattern[i] + if len(current) == 0: + current.append(access_idx) + else: + # assume non-decreasing index order in access pattern + # TODO: this assumption does not hold for M>1 for the 2D case + distance = access_idx - max(current) + if not (distance - 1 > REG_BRAM_THRESHOLD): + for i in range(distance - 1): + # insert dummy into REG FIFO (not read as part of window) + current.append(-1) + # assign this access to same REG FIFO as previous one current.append(access_idx) else: - # assume non-decreasing index order in access pattern - # TODO: this assumption does not hold for M>1 case (2D buffer) - distance = access_idx - max(current) - if not (distance-1 > REG_BRAM_THRESHOLD): - for i in range(distance-1): - # insert dummy into REG FIFO (not read as part of window) - current.append(-1) - # assign this access to 
same REG FIFO as previous one - current.append(access_idx) - else: - # assign skipped accesses to new BRAM FIFO - bram_fifos.append([-1]*(distance-1)) - bram_fifos_depth.append(math.ceil((distance-1)/M)) # really ceil? - # start with new REG FIFO - reg_fifos.append(current) - #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # fix for M again - reg_fifos_depth.append(len(current)) - current = [] - current.append(access_idx) - reg_fifos.append(current) - #reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # fix for M again - reg_fifos_depth.append(len(current)) - - code_gen_dict["$GENERATE_REG_FIFOS$"] = [] - for i in range(len(reg_fifos)): - code_gen_dict["$GENERATE_REG_FIFOS$"].append( - """ - wire [IN_WIDTH-1:0] reg_fifo_{id}_in; - wire [IN_WIDTH-1:0] reg_fifo_{id}_out; - wire [IN_WIDTH*{len}-1:0] reg_fifo_{id}; - {name}_reg_buffer - #( - .WIDTH(IN_WIDTH), - .DEPTH({len}) + # assign skipped accesses to new BRAM FIFO + bram_fifos.append([-1] * (distance - 1)) + bram_fifos_depth.append( + math.ceil((distance - 1) / M) + ) # really ceil? + # start with new REG FIFO + reg_fifos.append(current) + # reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # allows for MMV in the 1D case + reg_fifos_depth.append(len(current)) + current = [] + current.append(access_idx) + reg_fifos.append(current) + # reg_fifos_depth.append(math.ceil((max(current)+1)/M)) # allows for MMV in the 1D case + reg_fifos_depth.append(len(current)) + + code_gen_dict["$GENERATE_REG_FIFOS$"] = [] + for i in range(len(reg_fifos)): + code_gen_dict["$GENERATE_REG_FIFOS$"].append( + """ + wire [IN_WIDTH-1:0] reg_fifo_{id}_in; + wire [IN_WIDTH-1:0] reg_fifo_{id}_out; + wire [IN_WIDTH*{len}-1:0] reg_fifo_{id}; + {name}_reg_buffer + #( + .WIDTH(IN_WIDTH), + .DEPTH({len}) + ) + reg_buffer_inst_{id} + ( + .CLK(CLK), + .shift_enable(shift_enable), + .shift_in(reg_fifo_{id}_in), + .shift_out(reg_fifo_{id}_out), + .data_out(reg_fifo_{id}) + );""".format( + name=self.get_verilog_top_module_name(), + id=i, + len=reg_fifos_depth[i], + ) + ) + + code_gen_dict["$GENERATE_BRAM_FIFOS$"] = [] + for i in range(len(bram_fifos)): + code_gen_dict["$GENERATE_BRAM_FIFOS$"].append( + """ + wire [IN_WIDTH-1:0] bram_fifo_{id}_in; + wire [IN_WIDTH-1:0] bram_fifo_{id}_out; + {name}_ram_buffer + #( + .WIDTH(IN_WIDTH), + .DEPTH({len}) + ) + ram_buffer_inst_{id} + ( + .CLK(CLK), + .RST(RST), + .shift_enable(shift_enable), + .shift_in(bram_fifo_{id}_in), + .shift_out(bram_fifo_{id}_out) + );""".format( + name=self.get_verilog_top_module_name(), + id=i, + len=bram_fifos_depth[i], + ) + ) + + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = [] + out_idx = mmv_out - 1 + for fifo_id, reg_fifo in enumerate(reg_fifos): + for fifo_idx, access_idx in enumerate(reg_fifo): + if access_idx != -1: + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( + "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format( + out_idx=out_idx, + fifo_id=fifo_id, + access_idx=reg_fifos_depth[fifo_id] + - 1 + - int((max(reg_fifo) - access_idx) / M), + mmv_idx=(max(reg_fifo) - access_idx) % M, + mmv=M, + ) ) - reg_buffer_inst_{id} - ( - .CLK(CLK), - .shift_enable(shift_enable), - .shift_in(reg_fifo_{id}_in), - .shift_out(reg_fifo_{id}_out), - .data_out(reg_fifo_{id}) - );""".format(name=self.get_verilog_top_module_name(), id=i, len=reg_fifos_depth[i])) - - code_gen_dict["$GENERATE_BRAM_FIFOS$"] = [] - for i in range(len(bram_fifos)): - 
code_gen_dict["$GENERATE_BRAM_FIFOS$"].append( - """ - wire [IN_WIDTH-1:0] bram_fifo_{id}_in; - wire [IN_WIDTH-1:0] bram_fifo_{id}_out; - {name}_ram_buffer - #( - .WIDTH(IN_WIDTH), - .DEPTH({len}) + # reversal: out_idx=0 -> oldest buffer element -> highest access_idx + out_idx = out_idx - 1 + assert out_idx == -1, "ERROR: Not all output vector elements connected" + + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = [] + for i in range(len(reg_fifos)): + if i == 0: + # first FIFO containing newest elements -> input comes from input reg + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign reg_fifo_{fifo_id}_in = reg_input;""".format( + fifo_id=i, ) - ram_buffer_inst_{id} - ( - .CLK(CLK), - .RST(RST), - .shift_enable(shift_enable), - .shift_in(bram_fifo_{id}_in), - .shift_out(bram_fifo_{id}_out) - );""".format(name=self.get_verilog_top_module_name(), id=i, len=bram_fifos_depth[i])) - - code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = [] - out_idx = mmv_out-1 - for fifo_id, reg_fifo in enumerate(reg_fifos): - for fifo_idx, access_idx in enumerate(reg_fifo): - if(access_idx != -1): - #code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( - # "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format( - # out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx - # ) - #) - code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( - "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format( - out_idx=out_idx, fifo_id=fifo_id, - access_idx=reg_fifos_depth[fifo_id]-1-int((max(reg_fifo)-access_idx)/M), - mmv_idx=(max(reg_fifo)-access_idx)%M, - mmv = M - ) - ) - # reversal: out_idx=0 -> oldest buffer element -> highest access_idx - out_idx = out_idx-1 - assert out_idx==-1, "ERROR: Not all output vector elements connected" - - code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = [] - for i in range(len(reg_fifos)): - if i == 0: - # first FIFO containing newest elements -> input comes from input reg - code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( - """assign reg_fifo_{fifo_id}_in = reg_input;""".format(fifo_id=i,)) - else: - # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer) - input_fifo_id = i-1 - code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( - """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id)) - for i in range(len(bram_fifos)): - input_fifo_id = i + ) + else: + # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer) + input_fifo_id = i - 1 code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( - """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;""".format(fifo_id=i, input_fifo_id=input_fifo_id)) - - def convert_tuple(seq): - mapping = {'w': ("1'b1", "1'b0"), - 'r': ("1'b0", "1'b1"), - 'wr':("1'b1", "1'b1"), - 'n': ("1'b0", "1'b0")} - if seq: - if len(seq) == 2: - return (seq[0], mapping[seq[1]], 0, mapping['n']) - if len(seq) == 4: - return (seq[0], mapping[seq[1]], seq[2], mapping[seq[3]]) - else: - return (0, mapping['n'], 0, mapping['n']) + """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out;""".format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + ) + for i in range(len(bram_fifos)): + input_fifo_id = i + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out;""".format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + 
) - start_sequence,loop_counter,loop_sequence_1_counter,loop_sequence_1,loop_sequence_2,end_sequence = compact_schedule(schedule) + ( + start_sequence, + loop_counter, + loop_sequence_1_counter, + loop_sequence_1, + loop_sequence_2, + end_sequence, + ) = schedule_map_controller(schedule) + + start_sequence = schedule_map_cmds(start_sequence) + loop_sequence_1 = schedule_map_cmds(loop_sequence_1) + loop_sequence_2 = schedule_map_cmds(loop_sequence_2) + end_sequence = schedule_map_cmds(end_sequence) + + cycles_total = 0 + for t in schedule: + cycles_total += t[0] + # add extra cycle if schedule ends on READ + if schedule[-1][1] == "r": + cycles_total += 1 + code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)] + + code_gen_dict["$START_COUNTER$"] = [str(start_sequence[0])] + code_gen_dict["$LOOP_MAIN_COUNTER$"] = [str(loop_sequence_1_counter)] + code_gen_dict["$LOOP_INTER_COUNTER$"] = [str(loop_counter)] + + code_gen_dict["$LOOP_MAIN_1_COUNTER$"] = [str(loop_sequence_1[0])] + code_gen_dict["$LOOP_MAIN_2_COUNTER$"] = [str(loop_sequence_1[2])] + + code_gen_dict["$LOOP_INTER_1_COUNTER$"] = [str(loop_sequence_2[0])] + code_gen_dict["$LOOP_INTER_2_COUNTER$"] = [str(loop_sequence_2[2])] + + code_gen_dict["$LOOP_END_1_COUNTER$"] = [str(end_sequence[0])] + code_gen_dict["$LOOP_END_2_COUNTER$"] = [str(end_sequence[2])] + + code_gen_dict["$READ_CMD_MAP$"] = [ + "{{ {}, {}, {}, {}, {}, {}, {} }}".format( + start_sequence[1][0], + loop_sequence_1[1][0], + loop_sequence_1[3][0], + loop_sequence_2[1][0], + loop_sequence_2[3][0], + end_sequence[1][0], + end_sequence[3][0], + ) + ] + code_gen_dict["$WRITE_CMD_MAP$"] = [ + "{{ {}, {}, {}, {}, {}, {}, {} }}".format( + start_sequence[1][1], + loop_sequence_1[1][1], + loop_sequence_1[3][1], + loop_sequence_2[1][1], + loop_sequence_2[3][1], + end_sequence[1][1], + end_sequence[3][1], + ) + ] - start_sequence = convert_tuple(start_sequence) - loop_sequence_1 = convert_tuple(loop_sequence_1) - loop_sequence_2 = convert_tuple(loop_sequence_2) - end_sequence = convert_tuple(end_sequence) + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] - cycles_total = 0 - for t in schedule: - cycles_total += t[0] - code_gen_dict["$CYCLES_TOTAL$"] = [str(cycles_total)] + return template_path, code_gen_dict - code_gen_dict["$START_COUNTER$"]=[str(start_sequence[0])] - code_gen_dict["$LOOP_MAIN_COUNTER$"]=[str(loop_sequence_1_counter)] - code_gen_dict["$LOOP_INTER_COUNTER$"]=[str(loop_counter)] + def select_impl_style(self): + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") - code_gen_dict["$LOOP_MAIN_1_COUNTER$"]=[str(loop_sequence_1[0])] - code_gen_dict["$LOOP_MAIN_2_COUNTER$"]=[str(loop_sequence_1[2])] + k_h, k_w = k + # init folding config + if self.get_nodeattr("parallel_window"): + mmv_in = M * 1 + mmv_out = M * k_h * k_w + assert ifm_ch == simd, "Constraint violated: SIMD must be equal to C" + else: + mmv_in = 1 + mmv_out = 1 + assert ifm_ch % simd == 0, "Constraint violated: SIMD must divide C" - code_gen_dict["$LOOP_INTER_1_COUNTER$"]=[str(loop_sequence_2[0])] - code_gen_dict["$LOOP_INTER_2_COUNTER$"]=[str(loop_sequence_2[2])] + # TODO: check allowed hyperparams + # for 1D case: it does not matter if dummy dim is x or y + # TODO: move/duplicate these checks in corresponding convert_to_hls transformation (?) 
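+        # Note (summary of the selection below): "parallel" is required whenever
+        # more than one output element must leave the buffer per cycle
+        # (mmv_out > 1) and for 1x1 kernels; all other cases use "default".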
- code_gen_dict["$LOOP_END_1_COUNTER$"]=[str(end_sequence[0])] - code_gen_dict["$LOOP_END_2_COUNTER$"]=[str(end_sequence[2])] + # choose implementation style + if mmv_out > 1 or (k_h == 1 and k_w == 1): + impl_style = "parallel" + assert ifm_ch == simd, "Constraint violated: SIMD must be equal to C" + else: + impl_style = "default" - code_gen_dict["$READ_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format( - start_sequence[1][0],loop_sequence_1[1][0],loop_sequence_1[3][0],loop_sequence_2[1][0],loop_sequence_2[3][0],end_sequence[1][0],end_sequence[3][0])] - code_gen_dict["$WRITE_CMD_MAP$"]=["{{ {}, {}, {}, {}, {}, {}, {} }}".format( - start_sequence[1][1],loop_sequence_1[1][1],loop_sequence_1[3][1],loop_sequence_2[1][1],loop_sequence_2[3][1],end_sequence[1][1],end_sequence[3][1])] + return impl_style - with open(os.environ['FINN_ROOT']+"/finn-rtllib/swg/swg_template_parallel.sv", "r") as f: - template = f.read() + def generate_hdl(self): + impl_style = self.select_impl_style() - ##### END CODE GEN FOR PARALLEL STYLE ##### + # prepare code generation by filling out dictionaries + if impl_style == "default": + template_path, code_gen_dict = self.prepare_codegen_default() + elif impl_style == "parallel": + template_path, code_gen_dict = self.prepare_codegen_parallel() - ##### BEGIN GENERAL CODE GEN ##### + # add general parameters to dictionary code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] - # save top module name so we can refer to it even after this node has been renamed + # save top module name so we can refer to it even after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] - code_gen_dict["$SIMD$"] = [str(simd)] - code_gen_dict["$MMV_IN$"] = [str(mmv_in)] - code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] - ram_style = self.get_nodeattr("ram_style") if ram_style == "auto": - code_gen_dict["$RAM_STYLE$"]=[""] + code_gen_dict["$RAM_STYLE$"] = [""] else: - code_gen_dict["$RAM_STYLE$"]=["(* ram_style = \"{}\" *)".format(ram_style)] + code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)] - with open(os.environ['FINN_ROOT']+"/finn-rtllib/swg/swg_template_wrapper.v", "r") as f: + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + with open( + os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_wrapper.v", "r" + ) as f: template_wrapper = f.read() - for key in code_gen_dict: # transform list into long string separated by '\n' code_gen_line = "\n".join(code_gen_dict[key]) template = template.replace(key, code_gen_line) template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ), + "w", + ) as f: + f.write(template) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + "w", + ) as f: + f.write(template_wrapper) - f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"), "w") - f.write(template) - f.close() - f = open(os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w") - f.write(template_wrapper) - f.close() - #f_debug.close() - - #set ipgen_path and ip_path so that HLS-Synth transformation and stich_ip transformation do not complain + # set ipgen_path 
and ip_path so that HLS-Synth transformation and stitch_ip transformation do not complain
         self.set_nodeattr("ipgen_path", code_gen_dir)
         self.set_nodeattr("ip_path", code_gen_dir)
-        ##### END GENERAL CODE GEN #####
 
     def prepare_rtlsim(self):
         """Creates a Verilator emulation library for the RTL code generated
@@ -1029,9 +1171,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             raise ImportError("Installation of PyVerilator is required.")
 
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        verilog_paths = [code_gen_dir]
-        verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper.v",
-            self.get_nodeattr("gen_top_module") + "_impl.sv"]
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            self.get_nodeattr("gen_top_module") + "_wrapper.v",
+            self.get_nodeattr("gen_top_module") + "_impl.sv",
+        ]
 
         # build the Verilator emu library
         sim = PyVerilator.build(
@@ -1045,31 +1189,69 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         self.set_nodeattr("rtlsim_so", sim.lib._name)
         return sim
 
-
     def code_generation_ipi(self):
         """Constructs and returns the TCL for node instantiation in Vivado IPI."""
         vlnv = self.get_nodeattr("ip_vlnv")
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        cmd = ["add_files -norecurse %s" % (os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")),
-            "add_files -norecurse %s" % (os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv")),
-            "create_bd_cell -type module -reference %s %s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)]
+        cmd = [
+            "add_files -norecurse %s"
+            % (
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                )
+            ),
+            "add_files -norecurse %s"
+            % (
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+                )
+            ),
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name),
+        ]
         return cmd
 
     def code_generation_ipgen(self, model, fpgapart, clk):
-        """Normally: Generates c++ code and tcl script for ip generation.
-        Here: Generates (System-)Verilog code for ip generation."""
+        """Normally: Generates C++ code and tcl script for IP generation.
+ Here: Generates (System-)Verilog code for IP generation.""" self.generate_hdl() def ipgen_singlenode_code(self): - """Normally: Builds the bash script for ip generation using the CallHLS from - finn.util.hls.""" + """Normally: Builds the bash script for IP generation.""" pass def code_generation_cppsim(self, model): - """Normally: Generates c++ code for simulation (cppsim).""" + """Normally: Generates C++ code for simulation (cppsim).""" pass def compile_singlenode_code(self): pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py index c94aa1eab..d3ea9d117 100755 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -30,22 +30,21 @@ import pytest import numpy as np from onnx import TensorProto, helper - -import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -def make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt -): + +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): k_h, k_w = k ifm_dim_h, ifm_dim_w = ifm_dim stride_h, stride_w = stride @@ -134,10 +133,10 @@ def make_single_slidingwindow_modelwrapper( model.set_tensor_datatype("inp", idt) model.set_tensor_datatype("outp", odt) - #DEBUG - swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] - swg_inst = getCustomOp(swg_node) - swg_inst.set_nodeattr("rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd") + # DEBUG + # swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] + # swg_inst = getCustomOp(swg_node) + # swg_inst.set_nodeattr("rtlsim_trace", "/home/felixj/WD/finn/finn-rtllib/swg/swg_test_trace.vcd") return model @@ -159,39 +158,46 @@ def prepare_inputs(input_tensor): # ], # ) # kernel size -@pytest.mark.parametrize("k", [[1,1],[2,2],[3,3],[4,5],[1,3]]) +@pytest.mark.parametrize("k", [[1, 1], [2, 2], [3, 3], [1, 2], [1, 3]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [[8,8],[13,13],[1,12]]) +@pytest.mark.parametrize( + "ifm_dim", [[8, 8], [13, 13], [1, 11], [1, 12], [1, 13], [1, 14]] +) # input channels @pytest.mark.parametrize("ifm_ch", [6]) # Stride -@pytest.mark.parametrize("stride", [[1,1],[2,2],[3,4]]) +@pytest.mark.parametrize("stride", [[1, 1], [2, 2], [1, 2]]) # Dilation -@pytest.mark.parametrize("dilation", [[1,1],[2,2],[4,3]]) +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2], [1, 3]]) # depthwise 
-@pytest.mark.parametrize("dw", [0,1]) +@pytest.mark.parametrize("dw", [0, 1]) # input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1,2,3,6]) +@pytest.mark.parametrize("simd", [1, 2, 3, 6]) # parallel_window enable (MMV_out = M*K) -@pytest.mark.parametrize("parallel_window", [0,1]) +@pytest.mark.parametrize("parallel_window", [0, 1]) # in/out MMV ("M") @pytest.mark.parametrize("m", [1]) # Flip dimensions -@pytest.mark.parametrize("flip", [False,True]) +@pytest.mark.parametrize("flip", [False]) @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_slidingwindow_rtl( idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip ): - #ifm_dim = conv_config[0] - #k = conv_config[1] - #stride = conv_config[2] - #dilation= conv_config[3] + # ifm_dim = conv_config[0] + # k = conv_config[1] + # stride = conv_config[2] + # dilation= conv_config[3] if flip: - if (ifm_dim[0]==ifm_dim[1] and k[0]==k[1] and stride[0]==stride[1] and dilation[0] == dilation[1]): + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): pytest.skip("Dimension flip would have no effect") k = k[::-1] ifm_dim = ifm_dim[::-1] @@ -203,21 +209,31 @@ def test_fpgadataflow_slidingwindow_rtl( stride_h, stride_w = stride dilation_h, dilation_w = dilation - kernel_width = (k_w-1)*dilation_w+1 # incl. dilation - kernel_height = (k_h-1)*dilation_h+1 # incl. dilation + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation if simd > ifm_ch: pytest.skip("SIMD cannot be larger than number of input channels") if ifm_ch % simd != 0: pytest.skip("SIMD must divide number of input channels") if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: - pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: - pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") - if (k_h==1 and (stride_h!=1 or dilation_h!=1)) or (k_w==1 and (stride_w!=1 or dilation_w!=1)): - pytest.skip("Illegal convolution configuration: stride or dilation defined for unitary kernel dim") - if k_h==1 and k_w==1 and simd != ifm_ch: + pytest.skip( + "Illegal convolution configuration: kernel or stride > FM dimension" + ) + if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( + k_w == 1 and (stride_w != 1 or dilation_w != 1) + ): + pytest.skip( + "Illegal convolution configuration: stride or dilation defined for unitary kernel dim" + ) + if k_h == 1 and k_w == 1 and simd != ifm_ch: pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)") + if parallel_window and simd != ifm_ch: + pytest.skip("Parallel window requires SIMD=C") ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) @@ -258,7 +274,7 @@ def test_fpgadataflow_slidingwindow_rtl( ) y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - #DEBUG + # DEBUG print("-------expected:") print(y_expected) print("--------produced:") @@ -267,7 +283,7 @@ def test_fpgadataflow_slidingwindow_rtl( node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - print("RTLSIM cycles: %d"%cycles_rtlsim) + print("RTLSIM cycles: %d" % cycles_rtlsim) if dw == 0: assert (y_produced == 
y_expected).all() @@ -279,6 +295,7 @@ def test_fpgadataflow_slidingwindow_rtl( y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) assert (y_produced == y_expected).all() + # exp_cycles_dict = model.analysis(exp_cycles_per_layer) # exp_cycles = exp_cycles_dict[node.name] # assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) -- GitLab