diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 0d610ec66a5f433d156f4e8da976767ce6458aef..2668927602ebb8de5fdc3d7c25b20a0c8c4a2e55 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -47,7 +47,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
 RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
 # FINN hlslib
-RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 # PyVerilator
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 1c2cb19d14137b866b55417522fdebb8e0d7ad90..1200c7d5d15bbd62e15f19f84e70d5fe0b8aca28 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -76,7 +76,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
 RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
 # FINN hlslib
-RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 # PyVerilator
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 9cc239319fe94f482a4a6564399943b4dfe6ff53..7e13e117859365531f459928b7c664edb3fbf4ce 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -15,8 +15,8 @@ gecho () {
 # the repos themselves are cloned in the Dockerfile
 BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=6b88db826bb023937506913a23d964775a7606af
-PYVERILATOR_COMMIT=1d89cb0d4e0c97469cc6352c611f876ec13edfa6
+HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716
+PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
diff --git a/requirements.txt b/requirements.txt
index 2427f9490a3dd5a7ffe0e0a8cf2ad19af0934cdf..b15d86ed89f7b0e76b772ce42aba6481937310b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,6 @@ pre-commit
 pyverilator
 scipy
 sphinx
+toposort
+vcdvcd
 wget
diff --git a/run-docker.sh b/run-docker.sh
index 186efc322a8f437be0371b5a142a9dd524d1abf3..e07556716db335421f57a390f1e6a17168ac058b 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -96,7 +96,7 @@ gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
 gecho "Using default PYNQ board $PYNQ_BOARD"
 
-DOCKER_INTERACTIVE = ""
+DOCKER_INTERACTIVE=""
 
 if [ "$1" = "test" ]; then
   gecho "Running test suite (all tests)"
diff --git a/src/finn/core/modelwrapper.py b/src/finn/core/modelwrapper.py
index ed32426abcc8ea71428a7f746a99454e8e4a2c17..646add188c5d475cf37ccd33cf24d29d61754ae1 100644
--- a/src/finn/core/modelwrapper.py
+++ b/src/finn/core/modelwrapper.py
@@ -259,11 +259,10 @@ class ModelWrapper:
 
     def find_producer(self, tensor_name):
         """Finds and returns the node that produces the tensor with given name."""
-        ret = None
         for x in self._model_proto.graph.node:
             if tensor_name in x.output:
-                ret = x
-        return ret
+                return x
+        return None
 
     def find_upstream(self, tensor_name, finder_fxn):
         """Follow the producer chain upstream, calling finder_fxn on each upstream
@@ -510,3 +509,41 @@ class ModelWrapper:
             qa.tensor_name = tensor_name
             qa.quant_parameter_tensor_names.append(dt)
             qnt_annotations.append(qa)
+
+    def get_tensor_sparsity(self, tensor_name):
+        """Returns the sparsity of a given tensor as a dictionary."""
+        graph = self._model_proto.graph
+        qnt_annotations = graph.quantization_annotation
+        ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name")
+        if ret is not None:
+            ret = util.get_by_name(
+                ret.quant_parameter_tensor_names, "tensor_sparsity", "key"
+            )
+            if ret is not None:
+                return eval(ret.value)
+        return None
+
+    def set_tensor_sparsity(self, tensor_name, sparsity_dict):
+        """Sets the sparsity annotation of a tensor with given name."""
+        graph = self._model_proto.graph
+        qnt_annotations = graph.quantization_annotation
+        ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name")
+        if ret is not None:
+            ret_ts = util.get_by_name(
+                ret.quant_parameter_tensor_names, "tensor_sparsity", "key"
+            )
+            if ret_ts is not None:
+                ret_ts.value = str(sparsity_dict)
+            else:
+                ts = onnx.StringStringEntryProto()
+                ts.key = "tensor_sparsity"
+                ts.value = str(sparsity_dict)
+                ret.quant_parameter_tensor_names.append(ts)
+        else:
+            qa = onnx.TensorAnnotation()
+            dt = onnx.StringStringEntryProto()
+            dt.key = "tensor_sparsity"
+            dt.value = str(sparsity_dict)
+            qa.tensor_name = tensor_name
+            qa.quant_parameter_tensor_names.append(dt)
+            qnt_annotations.append(qa)
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 44787e1d26049e6075e2222316b45ab3898acbc7..c2f68a35076418e0cf2edb578bdb8d548772fc78 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -103,7 +103,7 @@ def execute_node(node, context, graph):
                     """Output shapes disagree after node execution:
                     found %s vs expected %s"""
                     % (
-                        str(output_list[list_ind].shape.shape),
+                        str(output_list[list_ind].shape),
                         str(context[outp].shape),
                     )
                 )
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 91bd3d1198f997eaf96ef3883b2c25e32c5da050..1e1bee3aa7435d5cab6cbf5ea23dd37dcdfa4380 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -99,9 +99,7 @@ def rtlsim_exec(model, execution_context):
         sim = pyverilate_stitched_ip(model)
         model.set_metadata_prop("rtlsim_so", sim.lib._name)
     else:
-        sim = PyVerilator(rtlsim_so)
-        _reset_rtlsim(sim)
-        _toggle_clk(sim)
+        sim = PyVerilator(rtlsim_so, auto_eval=False)
     ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
     packed_output = ret[0]
     model.set_metadata_prop("sim_cycles", str(ret[1]))
@@ -117,18 +115,22 @@ def _reset_rtlsim(sim):
     """Sets the reset input in pyverilator to zero, toggles the clock and
    sets it back to one"""
     sim.io.ap_rst_n_0 = 0
-    sim.io.ap_clk_0 = 1
-    sim.io.ap_clk_0 = 0
+    _toggle_clk(sim)
+    _toggle_clk(sim)
     sim.io.ap_rst_n_0 = 1
+    _toggle_clk(sim)
+    _toggle_clk(sim)
 
 
 def _toggle_clk(sim):
     """Toggles the clock input in pyverilator once."""
-    sim.io.ap_clk_0 = 1
     sim.io.ap_clk_0 = 0
+    sim.eval()
+    sim.io.ap_clk_0 = 1
+    sim.eval()
 
 
-def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
+def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
     """Runs the pyverilator simulation by passing the input values to the
     simulation, toggling the clock and observing the execution time.
     Argument num_out_values contains the number of expected output
     values, so the simulation is closed after all
@@ -153,6 +155,8 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
 
     if trace_file is not None:
         sim.start_vcd_trace(trace_file)
+    if reset:
+        _reset_rtlsim(sim)
 
     while not (output_observed):
         sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0
@@ -161,8 +165,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
             inputs = inputs[1:]
         if sim.io.out_r_0_tvalid == 1 and sim.io.out_r_0_tready == 1:
             outputs = outputs + [sim.io.out_r_0_tdata]
-        sim.io.ap_clk_0 = 1
-        sim.io.ap_clk_0 = 0
+        _toggle_clk(sim)
 
         observation_count = observation_count + 1
         no_change_count = no_change_count + 1
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 17a55e519ed0440f68e295aecaab179e6adf632f..a688898f4a43b33fd3f07cda12144b84829e451f 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -219,6 +219,7 @@ class HLSCustomOp(CustomOp):
         self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
         self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
+
         template = self.ipgentcl_template
 
         for key in self.code_gen_dict:
@@ -234,7 +235,7 @@ class HLSCustomOp(CustomOp):
     def ipgen_extra_directives(self):
         "Return a list of extra tcl directives for HLS synthesis."
         return []
-
+
     def ipgen_singlenode_code(self):
         """Builds the bash script for ip generation using the IPGenBuilder from
         finn.util.fpgadataflow."""
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index e4d106068d4d128c66b2ce5f3d6c925dfe414b90..3e40ad70208909551365c51324153859ccc79ceb 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -41,10 +41,19 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 # output 0 is the output tensor, shape NHWC:
 #     = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels)
 
+# note: the actual data layout produced by the hlslib kernels is different
+# for depthwise and non-depthwise ops.
+# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD)
+# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD)
+# see test_fpgadataflow_slidingwindow.py for an example of how to transform
+# between the two layouts
+
 
 class ConvolutionInputGenerator(HLSCustomOp):
-    """Class that corresponds to finn-hlslib ConvolutionInputGenerator
-    (sliding window) function."""
+    """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants. Depending on the combination of
+    attributes (e.g.
+    depthwise or not, whether k % stride is 0) a different
+    variant will be picked for the actual HLS implementation."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
@@ -60,6 +69,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0),
             # FPGA resource type for ConvolutionInputGenerator input buffer
             # auto -- let Vivado HLS decide
             # block -- use BRAM
@@ -106,7 +116,6 @@ class ConvolutionInputGenerator(HLSCustomOp):
         pad = 0
         ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        assert k % stride == 0, "stride must divide kernel size k"
         wf = int((k * k * ifm_ch) // simd)
         folded_oshape = (1, ofm_dim, ofm_dim, wf, simd)
         return folded_oshape
@@ -305,12 +314,35 @@ class ConvolutionInputGenerator(HLSCustomOp):
 
     def docompute(self):
         node = self.onnx_node
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
-            OFMDim1, SIMD1, Stride1> (in0, out, numReps);""".format(
-                node.op_type
-            )
-        ]
+        ram_style = self.get_nodeattr("ram_style")
+        map_to_hls_ram_style = {
+            "auto": "ap_resource_dflt()",
+            "block": "ap_resource_bram()",
+            "distributed": "ap_resource_lutram()",
+            "ultra": "ap_resource_uram()",
+        }
+        hls_ram_style = map_to_hls_ram_style[ram_style]
+        hls_call = node.op_type
+        # check if the non-optimized ConvolutionInputGenerator is needed
+        k = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        if k % stride != 0:
+            hls_call += "_kernel_stride"
+
+        if self.get_nodeattr("depthwise") == 1:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}_dws<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
+                OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
+                OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -356,17 +388,3 @@ class ConvolutionInputGenerator(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
-
-    def ipgen_extra_directives(self):
-        # add directive to control input buffer memory resources
-        ram_style = self.get_nodeattr("ram_style")
-        map_to_hls_ram_style = {
-            "auto": "RAM_2P",
-            "block": "RAM_2P_BRAM",
-            "distributed": "RAM_2P_LUTRAM",
-            "ultra": "RAM_2P_URAM",
-        }
-        hls_ram_style = map_to_hls_ram_style[ram_style]
-        directive = "set_directive_resource -core %s " % hls_ram_style
-        directive += "ConvolutionInputGenerator inputBuf"
-        return [directive]
diff --git a/src/finn/custom_op/fpgadataflow/sameresize_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding.py
similarity index 86%
rename from src/finn/custom_op/fpgadataflow/sameresize_batch.py
rename to src/finn/custom_op/fpgadataflow/fmpadding.py
index c459cac1e9c17336200a1fc85aad2af5e14e2c61..fa321dfa65d14b67fa218fb6a49f602ddab8d57e 100644
--- a/src/finn/custom_op/fpgadataflow/sameresize_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding.py
@@ -6,27 +6,40 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
-class SameResize_Batch(HLSCustomOp):
-    """Class that corresponds to finn-hlslib SameResize function.
-    Implements 'same' padding on a given input image."""
+class FMPadding_Batch(HLSCustomOp):
+    """Corresponds to the finn-hlslib FMPadding_Batch function.
+    Pads the input image by a given amount."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
         my_attrs = {
+            # spatial size of input images
             "ImgDim": ("i", True, 0),
-            "KernelDim": ("i", True, 0),
-            "Stride": ("i", True, 0),
+            # total padding (per dimension) to apply
+            "Padding": ("i", True, 2),
+            # number of channels in input image
             "NumChannels": ("i", True, 0),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
-            # distribution of added values to achieve "same" padding
-            "PaddingStyle": ("i", True, 2),
+            # controls the distribution of padded pixels in case of
+            # uneven padding -- see the FMPadding fxn in hlslib
+            "PaddingStyle": ("i", False, 2),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def get_padded_odim(self):
+        "Return the padded spatial size of the output."
+        idim = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        return idim + pad
+
     def get_normal_input_shape(self):
         idim = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
@@ -35,14 +48,8 @@ class FMPadding_Batch(HLSCustomOp):
         return ishape
 
     def get_normal_output_shape(self):
-        idim = self.get_nodeattr("ImgDim")
+        odim = self.get_padded_odim()
         num_ch = self.get_nodeattr("NumChannels")
-        kdim = self.get_nodeattr("KernelDim")
-        stride = self.get_nodeattr("Stride")
-        assert idim % stride == 0, "Stride must divide input dimension."
-        # number of "same" windows over the input data
-        same_windows = idim // stride
-        odim = kdim + stride * (same_windows - 1)
         oshape = (1, odim, odim, num_ch)
         return oshape
 
@@ -87,7 +94,7 @@ class FMPadding_Batch(HLSCustomOp):
         # data type stays the same
         dtype = model.get_tensor_datatype(node.input[0])
         exp_idtype = self.get_input_datatype()
-        assert dtype == exp_idtype, "Unexpected datatype for SameResize_Batch"
+        assert dtype == exp_idtype, "Unexpected datatype for FMPadding_Batch"
         model.set_tensor_datatype(node.output[0], dtype)
 
     def verify_node(self):
@@ -96,9 +103,9 @@ class FMPadding_Batch(HLSCustomOp):
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
-        # the hlslib op always pads with zeroes, so ensure that the DataType
-        # is able to represent zeroes
-        assert ret.allowed(0), "SameResize_Batch DataType must support zero"
+        # the hlslib op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
         return ret
 
     def get_output_datatype(self):
@@ -125,18 +132,16 @@ class FMPadding_Batch(HLSCustomOp):
         self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
 
     def defines(self, var):
-        numReps = 1
-        assert self.get_nodeattr("PaddingStyle") == 2, "Only PaddingStyle=2 supported"
         self.code_gen_dict["$DEFINES$"] = [
-            """#define ImgDim1 {}\n #define KernelDim1 {}\n
-            #define Stride1 {}\n #define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n #define numReps {}""".format(
+            """#define ImgDim1 {}\n#define OutputDim1 {}\n
+            #define Padding1 {}\n#define NumChannels1 {}\n
+            #define PaddingStyle1 {}\n#define numReps {}\n""".format(
                 self.get_nodeattr("ImgDim"),
-                self.get_nodeattr("KernelDim"),
-                self.get_nodeattr("Stride"),
+                self.get_padded_odim(),
+                self.get_nodeattr("Padding"),
                 self.get_nodeattr("NumChannels"),
self.get_nodeattr("PaddingStyle"), - numReps, + self.get_nodeattr("numInputVectors"), ) ] @@ -171,8 +176,8 @@ class SameResize_Batch(HLSCustomOp): in_t = self.get_input_datatype().get_hls_datatype_str() node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, KernelDim1, Stride1, NumChannels1, - {}, PaddingStyle1> (in0, out, numReps);""".format( + """{}<ImgDim1, OutputDim1, Padding1, NumChannels1, + {}, PaddingStyle1> (in0, out, numReps);""".format( node.op_type, in_t ) ] @@ -261,8 +266,7 @@ class SameResize_Batch(HLSCustomOp): super().npy_to_dynamic_output(context) assert ( context[node.output[0]].shape == folded_oshape - ), "cppsim \ - did not produce expected ofolded utput shape" + ), "cppsim did not produce expected folded output shape" context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) elif mode == "rtlsim": sim = self.get_rtlsim() diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index f650442401b49f1ad0a602b6b2ad3e50fbb5e5c2..9b73ba1e100aa83fd19aa8799195c99891fca3fd 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -513,40 +513,44 @@ class StreamingFCLayer_Batch(HLSCustomOp): elif mem_mode == "decoupled": """Saves weights in corresponding file format for cppsim or rtlsim""" # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) - # and save as unflipped weight tensor to be able to differentiate between - # flipped an unflipped weight tensor (has to be flipped for cppsim) - weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) - # flip PE dimension and reverse SIMD flip for saving weights in .npy - weight_tensor_flipped = np.flip(weight_tensor_unflipped, axis=-2) - weight_tensor_flipped = np.flip(weight_tensor_flipped, axis=-1) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) - # reshape weight tensor (flipped and unflipped) to desired shape + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") - # unflipped - weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd) - weight_tensor_unflipped = weight_tensor_unflipped.copy() + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() # flipped - weight_tensor_flipped = weight_tensor_flipped.reshape(1, -1, pe * simd) - weight_tensor_flipped = weight_tensor_flipped.copy() + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() """Saves weights into .npy file""" - np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped) + np.save( + os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped + ) """Saves weights into .dat file""" # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_unflipped = pack_innermost_dim_as_hex_string( - weight_tensor_unflipped, export_wdt, weight_width_padded, prefix="" + weight_tensor_pe_flipped = 
+                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
             )
-            weight_stream_len = np.prod(weight_tensor_unflipped.shape)
+            weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
             factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
-            weight_stream = weight_tensor_unflipped.flatten()
+            weight_stream = weight_tensor_pe_flipped.flatten()
             pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 66190333ce8d71dafba99aaeae4fb2c973d67410..1f734b548f923341687843c538d1887fcc069bee 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -110,6 +110,8 @@ class StreamingFIFO(HLSCustomOp):
         ]
         # make instream width a multiple of 8 for axi interface
         in_width = self.get_instream_width_padded()
+        count_width = int(self.get_nodeattr("depth") - 1).bit_length()
+        self.code_gen_dict["$COUNT_RANGE$"] = ["[{}:0]".format(count_width - 1)]
         self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$WIDTH$"] = [str(in_width)]
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 5f526aa2aa1917144c7a048c9d9314aa9288a2d8..1a8216f64bf71b7fb9f1f8becf4732970b5bf451 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -408,6 +408,7 @@ strm_fifo_wrapper = """
 module $TOPNAME$(
 ap_clk,
 ap_rst_n,
+count,
 in0_V_V_TDATA,
 in0_V_V_TVALID,
 in0_V_V_TREADY,
@@ -418,6 +419,7 @@ out_V_V_TREADY
 
 input ap_clk;
 input ap_rst_n;
+output $COUNT_RANGE$ count;
 input $IN_RANGE$ in0_V_V_TDATA;
 input in0_V_V_TVALID;
 output in0_V_V_TREADY;
@@ -433,6 +435,7 @@ $LAYER_NAME$
 (
  .clock(ap_clk),
  .reset(!ap_rst_n),
+ .count(count),
  .i_d(in0_V_V_TDATA),
  .i_v(in0_V_V_TVALID),
  .i_r(in0_V_V_TREADY),
diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py
index 16446c15d46ee7996162f864708f7fde6cfedaf3..82a6b140f7af1be4e5c0f429d077b99c7865383e 100644
--- a/src/finn/custom_op/im2col.py
+++ b/src/finn/custom_op/im2col.py
@@ -21,8 +21,6 @@ def get_im2col_indices_nchw(
     """Returns im2col indices."""
     # First figure out what the size of the output should be
     N, C, H, W = x_shape
-    assert (H + 2 * padding - field_height) % stride_y == 0
-    assert (W + 2 * padding - field_width) % stride_x == 0
     out_height = compute_conv_output_dim(H, field_height, stride_y, padding)
     out_width = compute_conv_output_dim(W, field_width, stride_x, padding)
 
@@ -70,6 +68,9 @@ def im2col_indices_nchw(
 # * ifm is the number of input channels
 # * k is the convolutional kernel size
 
+# note: for the innermost (dot product) dimension of k*k*ifm, we
+# assume an internal ordering (k, k, ifm)
+
 
 class Im2Col(CustomOp):
     def get_nodeattr_types(self):
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 238829e03353d79fab7c51e7d1b9dca6e2a96a11..614a3d7ffd70d0b102bad2b76177a2d3b32765c7 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,7 +44,7 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.sameresize_batch import SameResize_Batch
+from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
@@ -65,7 +65,7 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
-custom_op["SameResize_Batch"] = SameResize_Batch
+custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 3ff86cab48d365c10e69bc2c764e8083c6a36880..d421a5f3ef8ca980b399087de1482b2ae913da1b 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -26,7 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from onnx import helper
+from onnx import helper, TensorProto
 
 from finn.core.datatype import DataType
 from finn.transformation import Transformation
@@ -59,27 +59,61 @@ class InferConvInpGen(Transformation):
                 ifm_ch = i2c_in_shape[-1]
                 ifm_dim = i2c_in_shape[1]
                 ofm_dim = i2c_out_shape[1]
-                # if padding enabled, ensure pad_val supported by DataType
+
+                # default params for ConvolutionInputGenerator
+                ConvInpGen_node_idx = node_ind
+                ConvInpGen_input = i2c_input
+                ConvInpGen_idim = ifm_dim
+
                 if pad > 0:
+                    # if padding enabled, ensure pad_val supported by DataType
                     assert dt.allowed(pad_val), "Im2Col DataType must support pad_val"
+
+                    odim_padding = ifm_dim + 2 * pad
+
+                    padding_out = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, odim_padding, odim_padding, ifm_ch),
+                    )
+                    graph.value_info.append(padding_out)
+                    padding_out = padding_out.name
+                    model.set_tensor_datatype(padding_out, dt)
+
+                    ConvInpGen_node_idx += 1
+                    ConvInpGen_input = padding_out
+                    ConvInpGen_idim = odim_padding
+
+                    padding_node = helper.make_node(
+                        "FMPadding_Batch",
+                        [i2c_input],
+                        [padding_out],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        ImgDim=ifm_dim,
+                        Padding=2 * pad,
+                        NumChannels=ifm_ch,
+                        inputDataType=dt.name,
+                    )
+                    graph.node.insert(node_ind, padding_node)
+
                 # create equivalent ConvolutionInputGenerator node
-                # TODO support padding
-                new_node = helper.make_node(
+                ConvInpGen_node = helper.make_node(
                     "ConvolutionInputGenerator",
-                    [i2c_input],
+                    [ConvInpGen_input],
                     [i2c_output],
                     domain="finn",
                     backend="fpgadataflow",
                     ConvKernelDim=k,
                     IFMChannels=ifm_ch,
-                    IFMDim=ifm_dim,
+                    IFMDim=ConvInpGen_idim,
                     OFMDim=ofm_dim,
                     SIMD=ifm_ch,
                     Stride=stride,
                     inputDataType=dt.name,
                     outputDataType=dt.name,
                 )
-                graph.node.insert(node_ind, new_node)
+                graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py
index f51ffbcfd9f62e06bf4942409fbb163e92ff6370..488391740fc25f1f7caa657adc9ed55bdc2f9722 100644
--- a/src/finn/transformation/general.py
+++ b/src/finn/transformation/general.py
@@ -28,6 +28,7 @@
 
 import finn.util.basic as util
 from finn.transformation import Transformation
+from toposort import toposort_flatten
 
 
 class GiveUniqueNodeNames(Transformation):
@@ -104,11 +105,13 @@ class GiveUniqueParameterTensors(Transformation):
                     # first occurrence
                     seen_parameters += [node_input]
                     continue
-
+
                 new_param_name = model.make_new_valueinfo_name()
 
                 model.set_initializer(new_param_name, input_init)
-                model.set_tensor_datatype(new_param_name, model.get_tensor_datatype(node_input))
+                model.set_tensor_datatype(
+                    new_param_name, model.get_tensor_datatype(node_input)
+                )
 
                 # point node input to new tensor
                 n.input[input_idx] = new_param_name
@@ -116,6 +119,56 @@ class GiveUniqueParameterTensors(Transformation):
         return (model, graph_modified)
 
 
+class SortGraph(Transformation):
+    """Returns the model with its node list sorted topologically.
+    Any ONNX graph to be executed must have a topologically sorted node list,
+    as dictated by the ONNX standard.
+    """
+
+    # Notes on SortGraph performance:
+    # benchmark in tests/transformation/test_sort_graph.py
+    #
+    # The algorithm doesn't move initializers, so its performance should only
+    # depend on the number of nodes
+    #
+    # Relative order of magnitudes for time per step:
+    # - Gather graph structure: base
+    # - Sort nodes: 0.1 of base
+    # - Remove and insert in order: 0.001 of base
+    #
+    # Removing the nodes and re-inserting them in order is probably faster
+    # than copying initializers around, and more robust in general
+
+    def apply(self, model):
+        # Gather graph structure
+        graph_dependencies = {}
+        # keep a copy of the node list, so that nodes can be removed from the
+        # graph and re-inserted in sorted order below
+        node_list = [n for n in model.graph.node]
+        for node_idx, n in enumerate(node_list):
+            node_pred = model.find_direct_predecessors(n)
+            if node_pred is None:
+                # will also eliminate nodes that are floating around for some reason
+                continue
+
+            node_dependencies = [node_list.index(pred) for pred in node_pred]
+            graph_dependencies[node_idx] = set(node_dependencies)
+
+        # Sort nodes
+        sorted_node_indexes = toposort_flatten(graph_dependencies)
+
+        # Remove nodes and insert them in order; the nodes can't be removed
+        # any earlier, since model.find_direct_predecessors() needs the
+        # complete graph
+        for n in node_list:
+            model.graph.node.remove(n)
+
+        for new_idx, sorted_idx in enumerate(sorted_node_indexes):
+            model.graph.node.insert(new_idx, node_list[sorted_idx])
+
+        return model, False
+
+
 class ConvertSubToAdd(Transformation):
     """Convert subtract-a-constant nodes to add-a-constant nodes."""
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 3880bb9591e27af5fe9d063dba2485d304e4db54..d3bfb73fe239d7194fab3760555663895a209e84 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -56,6 +56,12 @@ def get_rtlsim_trace_depth():
     via the RTLSIM_TRACE_DEPTH environment variable. If the env.var. is
     undefined, the default value of 1 is returned. A trace depth of 1
     will only show top-level signals and yield smaller .vcd files.
+
+    The following depth values are of interest for whole-network stitched IP
+    rtlsim:
+    - level 1 shows top-level input/output streams
+    - level 2 shows per-layer input/output streams
+    - level 3 shows per full-layer I/O including FIFO count signals
     """
 
     try:
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index d1669444e55cb0fddb2690e51849c4603d47d32c..3fe747a84985b2702ffb1e5855d9071362efebda 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -104,6 +104,7 @@ def pyverilate_stitched_ip(model):
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,
+        auto_eval=False,
     )
     return sim
diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9e244422065314ceb790dc6719b57688ff76828
--- /dev/null
+++ b/src/finn/util/vcd.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from vcdvcd import VCDVCD
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
+
+# string patterns to search for to find particular interfaces
+# streaming interfaces
+vname = "TVALID"
+rname = "TREADY"
+# FIFO count signals
+fifo_mod_name = "StreamingFIFO"
+fifo_cname = "count"
+
+
+def list_stream_if(vcd_file):
+    "Return a list of stream interface names from given vcd trace."
+
+    sig_names = VCDVCD(vcd_file, print_dumps=False, only_sigs=True).get_signals()
+    stream_if_names = []
+    for cand_name in filter(lambda x: x.endswith(vname), sig_names):
+        base_name = cand_name.replace(vname, "")
+        if base_name + rname in sig_names:
+            stream_if_names.append(base_name)
+    return stream_if_names
+
+
+def list_fifo_count_signals(vcd_file):
+    "Return a list of FIFO count signal names from given vcd trace."
+
+    sig_names = VCDVCD(vcd_file, print_dumps=False, only_sigs=True).get_signals()
+    fifo_cnt_names = []
+    for cand_name in filter(lambda x: fifo_cname in x, sig_names):
+        if fifo_mod_name in cand_name:
+            fifo_cnt_names.append(cand_name)
+    return fifo_cnt_names
+
+
+def get_fifo_count_max(vcd_file, fifo_count_signal):
+    "Return the maximum value of the given FIFO count signal in vcd trace."
+
+    d = VCDVCD(vcd_file, signals=[fifo_count_signal], store_tvs=True).get_data()
+    assert len(d) != 0, "FIFO count signal not found"
+    events = list(d.values())[0]["tv"]
+    max = 0
+    for (time, val) in events:
+        current = int(val, base=2)
+        if current > max:
+            max = current
+    return max
+
+
+def _get_fifo_max(x):
+    return (x[0], get_fifo_count_max(x[1], x[0]))
+
+
+def get_all_fifo_count_max(vcd_file, fifo_count_signals=None):
+    """Return a list of max FIFO counts. If fifo_count_signals is None,
+    all FIFO count signals will be returned, otherwise treated as a list of
+    signal names to return the stats for."""
+    if fifo_count_signals is None:
+        fifo_count_signals = list_fifo_count_signals(vcd_file)
+
+    with mp.Pool(get_num_default_workers()) as p:
+        fifo_count_signals = map(lambda x: (x, vcd_file), fifo_count_signals)
+        all_stats = p.map(_get_fifo_max, fifo_count_signals)
+
+    return all_stats
+
+
+def get_stream_if_stats(vcd_file, if_base_name):
+    """Return statistics for given streaming interface in vcd trace in the
+    following dict format:
+
+    <stream_state>: (<num_samples>, <fraction_of_time>),
+
+    where <stream_state> is the combination of (V)alid/(R)eady values,
+    <num_samples> is the approximate number of rising clock edges spent in
+    <state>, and <fraction_of_time> is the fraction of <num_samples> to total
+    amount of time recorded by the trace.
+
+    Example:
+    {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061),
+     "{'V': 1, 'R': 0}": (0, 0.0),
+     "{'V': 0, 'R': 1}": (7605, 0.9218181818181819),
+     "{'V': 1, 'R': 1}": (640, 0.07757575757575758)}
+
+    Here we can see the stream was transmitting values 7.7% of the time,
+    and 92.2% of the time there was no incoming data (valid 0, ready 1).
+    """
+    if_valid = if_base_name + vname
+    if_ready = if_base_name + rname
+    v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True)
+    endtime = v.get_endtime()
+    v = v.get_data()
+    assert len(v) != 0, "Streaming interface not found"
+    v = list(v.values())[0]["tv"]
+    v = list(map(lambda x: ("V", x[0], x[1]), v))
+    v.append(("V", endtime, "0"))
+    r = VCDVCD(vcd_file, signals=[if_ready], store_tvs=True).get_data()
+    assert len(r) != 0, "Streaming interface not found"
+    r = list(r.values())[0]["tv"]
+    r = list(map(lambda x: ("R", x[0], x[1]), r))
+    r.append(("R", endtime, "0"))
+    events = sorted(v + r, key=lambda x: x[1])
+    ret = {
+        "{'V': 0, 'R': 0}": 0,
+        "{'V': 1, 'R': 0}": 0,
+        "{'V': 0, 'R': 1}": 0,
+        "{'V': 1, 'R': 1}": 0,
+    }
+    status = {"V": 0, "R": 0}
+    last_time = 0
+    total_rising_clock_edges = 0
+    for (sig, time, val) in events:
+        # pyverilator generates 5 time units per sample
+        time = time / 5
+        # pyverilator generates 4 samples per clock period
+        n_rising_clock_edges = int((time - last_time) / 4)
+        # note that the calculation of n_rising_clock_edges is approximate
+        # doing this exactly would require a cycle-by-cycle walkthrough of the
+        # trace, which can take very long
+        ret[str(status)] += n_rising_clock_edges
+        total_rising_clock_edges += n_rising_clock_edges
+        status[sig] = int(val)
+        last_time = time
+
+    for state in ret:
+        v = ret[state]
+        ret[state] = (v, v / total_rising_clock_edges)
+
+    return ret
+
+
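+# The helpers above can be combined into a quick trace report. The sketch
+# below is illustrative only and is not called anywhere in FINN; it assumes
+# vcd_file points at a trace produced by rtlsim (e.g. one written via the
+# rtlsim_trace metadata property).
+def _example_vcd_report(vcd_file):
+    "Print FIFO occupancy maxima and handshake stats for a given trace."
+    # maximum observed occupancy for every StreamingFIFO count signal
+    for sig, max_count in get_all_fifo_count_max(vcd_file):
+        print(sig, max_count)
+    # valid/ready statistics for every AXI stream interface in the trace
+    for if_name in list_stream_if(vcd_file):
+        print(if_name, get_stream_if_stats(vcd_file, if_name))
+
+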
+def _get_stats(x):
+    return (x[0], get_stream_if_stats(x[1], x[0]))
+
+
+def get_all_stream_if_stats(vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}"):
+    """Return a list of streaming interface stats, sorted by the percentage
+    for the given sort_by key. If stream_ifs is None, all streaming interface
+    stats will be returned, otherwise treated as a list of interface names to
+    return the stats for."""
+
+    if stream_ifs is None:
+        stream_ifs = list_stream_if(vcd_file)
+
+    with mp.Pool(get_num_default_workers()) as p:
+        stream_ifs = map(lambda x: (x, vcd_file), stream_ifs)
+        all_stats = p.map(_get_stats, stream_ifs)
+
+    def sort_key(x):
+        stat = x[1]
+        (samples, percent) = stat[sort_by]
+        return percent
+
+    ret = sorted(all_stats, key=sort_key)
+    return ret
diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py
index 4bd9385536bc6721c66726169dfa4c69e5f06772..5fa9b23bad5c5b67f65530c55f862f889c07b1ac 100644
--- a/tests/core/test_modelwrapper.py
+++ b/tests/core/test_modelwrapper.py
@@ -73,6 +73,11 @@ def test_modelwrapper():
     inp_layout = DataLayout.NCHW
     model.set_tensor_layout(inp_name, inp_layout)
     assert model.get_tensor_layout(inp_name) == inp_layout
+    inp_sparsity = model.get_tensor_sparsity(inp_name)
+    assert inp_sparsity is None
+    inp_sparsity = {"dw": {"kernel_shape": 3}}
+    model.set_tensor_sparsity(inp_name, inp_sparsity)
+    assert model.get_tensor_sparsity(inp_name) == inp_sparsity
     os.remove(export_onnx_path)
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py
index 354e8d88931f758a57231ebcf3564046cc0f3ab9..f9bd408ebac8a011eb0c461d7f8e48b5cc76be86 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -73,6 +73,7 @@ from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.core.throughput_test import throughput_test_rtlsim
+import finn.util.vcd as vcd
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -198,6 +199,8 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
     # whole-network (ip-stitched) rtlsim
     model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", build_dir + "/tfc_w1a1.vcd")
+    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
     model.save(build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx")
     ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
     res_rtlsim_whole = ret_rtlsim_whole[out_name]
@@ -205,6 +208,24 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+def test_end2end_tfc_w1a1_verify_fifo_fullness():
+    vcdf = build_dir + "/tfc_w1a1.vcd"
+    if not os.path.isfile(vcdf):
+        pytest.skip("Cannot find %s, skipping" % vcdf)
+    stream_ifs = vcd.list_stream_if(vcdf)
+    fifos = vcd.list_fifo_count_signals(vcdf)
+    assert len(stream_ifs) == 37
+    assert len(fifos) == 6
+    fifo_max = vcd.get_all_fifo_count_max(vcdf)
+    assert fifo_max[0][0] == "TOP.v.finn_design_i.StreamingFIFO_0.count[3:0]"
+    assert fifo_max[0][1] == 3
+    stream_stat = vcd.get_all_stream_if_stats(vcdf)
+    assert (
+        stream_stat[0][0]
+        == "TOP.v.finn_design_i.StreamingDataWidthConverter_Batch_0_out_V_V_"
+    )
+
+
 @pytest.mark.vivado
 def test_end2end_tfc_w1a1_throughput_test_rtlsim():
     model = load_test_checkpoint_or_skip(
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee65326ec57fb7fa7fa0490a8980dbabb8efc13c
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -0,0 +1,105 @@
+from onnx import TensorProto, helper
+import numpy as np
+import pytest
+
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+@pytest.mark.parametrize("padding", [True, False])
+@pytest.mark.parametrize("kernel_size", [3, 5])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_convert_to_hls_conv_layer(padding, kernel_size):
+
+    assert (
+        kernel_size % 2 != 0
+    ), """test_convert_to_hls_conv_layer test only
+    supports odd kernel_size"""
+
+    np.random.seed(0)
+    idt = DataType.UINT4
+
+    in_feature_dim = 7
+    in_chn = 3
+
+    stages = 1  # just one convolution
+
+    out_feature_dim = (
+        in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages
+    )
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    if padding:
+        pad = kernel_size // 2
+        conv_config["pads"] = [pad, pad, pad, pad]
+    else:
+        conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+    value_info = [
+        helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
+    ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="conv_test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)
+            ],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("top_in", idt)
+    model.set_tensor_datatype("top_out", idt)
+    model.set_tensor_datatype("p1", DataType.UINT4)
+
+    model = model.transform(InferShapes())
+    model.set_initializer(
+        "p1", np.round(np.random.rand(*conv_param_shape).astype(np.float32) * 16)
+    )
+
+    model.set_tensor_datatype(model.graph.input[0].name, idt)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+
+    new_model = model.transform(LowerConvsToMatMul())
+    new_model = new_model.transform(to_hls.InferConvInpGen())
+
+    new_model = new_model.transform(PrepareCppSim())
+    new_model = new_model.transform(CompileCppSim())
+    new_model = new_model.transform(SetExecMode("cppsim"))
+
+    x = gen_finn_dt_tensor(idt, input_shape)
+    inp_dict = {model.graph.input[0].name: x}
+    assert oxe.compare_execution(model, new_model, inp_dict)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 5051bf34dc690daf8b6186859d3717cc8e217eee..b5fc85caf274edc9e7afc52df962862fa8a99ba3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -78,7 +78,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
 
 def make_single_slidingwindow_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0
 ):
     odt = idt
     inp = helper.make_tensor_value_info(
@@ -102,6 +102,7 @@ def make_single_slidingwindow_modelwrapper(
         Stride=stride,
         inputDataType=idt.name,
         outputDataType=odt.name,
+        depthwise=dw,
     )
     graph = helper.make_graph(
         nodes=[SlidingWindow_node],
@@ -126,25 +127,29 @@ def prepare_inputs(input_tensor):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # kernel size
-@pytest.mark.parametrize("k", [2, 4])
+@pytest.mark.parametrize("k", [2, 3])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [4, 6, 8])
+@pytest.mark.parametrize("ifm_dim", [6, 8])
 # input channels
-@pytest.mark.parametrize("ifm_ch", [2, 4])  # , 2, 3, 4])
+@pytest.mark.parametrize("ifm_ch", [2, 4])
 # Stride
 @pytest.mark.parametrize("stride", [1, 2])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # input channel parallelism ("SIMD")
 @pytest.mark.parametrize("simd", [1, 2])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd):
+def test_fpgadataflow_slidingwindow(
+    idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw
+):
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
 
     x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
     model = make_single_slidingwindow_modelwrapper(
-        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw
     )
 
     if exec_mode == "cppsim":
@@ -168,6 +173,12 @@ def test_fpgadataflow_slidingwindow(
         k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
     )
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
-    # if idt == DataType.BIPOLAR:
-    #     y_expected = 2 * y_expected - 1
-    assert (y_produced == y_expected).all()
+    if dw == 0:
+        assert (y_produced == y_expected).all()
+    else:
+        y_expected = y_expected.reshape(
+            1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd
+        )
+        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+        y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k)
+        assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_sameresize.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
similarity index 75%
rename from tests/fpgadataflow/test_fpgadataflow_sameresize.py
rename to tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index ea6130c3891443595b038460233ebb85799ac461..9d6390b2673e5d2c0e72748183ac04ed222d078e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_sameresize.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -23,9 +23,11 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_sameresize_modelwrapper(
-    idim, odim, kdim, stride, num_ch, idt, pad_style
-):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
+    assert pad_style == 2, "only pad_style == 2 supported in hlslib"
+    assert padding > 0, "Output dim should be greater than input dim"
+    odim = idim + padding
+
     inp = helper.make_tensor_value_info(
         "inp", TensorProto.FLOAT, [1, idim, idim, num_ch]
     )
@@ -33,25 +35,25 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
         "outp", TensorProto.FLOAT, [1, odim, odim, num_ch]
     )
 
-    SameResize_node = helper.make_node(
-        "SameResize_Batch",
+    FMPadding = helper.make_node(
+        "FMPadding_Batch",
         ["inp"],
         ["outp"],
         domain="finn",
         backend="fpgadataflow",
         ImgDim=idim,
-        KernelDim=kdim,
-        Stride=stride,
+        Padding=padding,
         NumChannels=num_ch,
         inputDataType=str(idt.name),
         PaddingStyle=pad_style,
+        numInputVectors=1,
     )
 
     graph = helper.make_graph(
-        nodes=[SameResize_node], name="sameresize_graph", inputs=[inp], outputs=[outp]
+        nodes=[FMPadding], name="fmpadding_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="sameresize-model")
+    model = helper.make_model(graph, producer_name="fmpadding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -60,34 +62,28 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
     return model
 
 
-# image dimension
+# input image dimension
 @pytest.mark.parametrize("idim", [8, 16])
-# kernel dimension
-@pytest.mark.parametrize("kdim", [2, 3])
-# stride
-@pytest.mark.parametrize("stride", [1, 2])
+# number of rows and number of cols to add
+@pytest.mark.parametrize("pad", [2, 3])
 # number of channels
 @pytest.mark.parametrize("num_ch", [1, 2])
+# PaddingStyle: selects behavior when (odim-idim)%2 != 0
+@pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
 @pytest.mark.parametrize("idt", [DataType.INT2, DataType.INT4])
 # execution mode
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_sameresize(idim, kdim, stride, num_ch, idt, mode):
-    pad_style = 2
-    assert idim % stride == 0, "Stride must divide input dimension."
-    # number of "same" windows over the input data
-    same_windows = idim // stride
-    odim = kdim + stride * (same_windows - 1)
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
     input_dict = {"inp": x}
+    odim = idim + pad
 
-    model = make_single_sameresize_modelwrapper(
-        idim, odim, kdim, stride, num_ch, idt, pad_style
-    )
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
@@ -103,8 +99,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
     assert y_produced.shape == expected_oshape
 
     # calculate reference
-    # calculate correct padding according to parameters
-    pad = odim - idim
+    # calculate correct pad according to parameters
     if pad_style == 2:
         if pad % 2 == 0:
             pad_up = pad // 2
@@ -115,6 +110,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
     else:
         pad_up = pad // 2
         pad_left = pad // 2
+    pad_down = pad - pad_up
     pad_right = pad - pad_left
 
diff --git a/tests/transformation/test_sort_graph.py b/tests/transformation/test_sort_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..05842504c13b144bb34e8084fb12b5086fa84115
--- /dev/null
+++ b/tests/transformation/test_sort_graph.py
@@ -0,0 +1,150 @@
+from onnx import TensorProto, helper
+import numpy as np
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.general import SortGraph
+from finn.transformation.infer_shapes import InferShapes
+import pytest
+import finn.analysis.topology as ta
+
+
+def make_randomly_sorted_linear_model(num_of_nodes, seed=None):
+    if seed is not None:
+        np.random.seed(seed)
+
+    ch = 2
+    ifmdim = 16
+    input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("t0", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info(
+        "t" + str(num_of_nodes), TensorProto.FLOAT, input_shape
+    )
+
+    value_info = []
+    nodes = []
+    for i in range(num_of_nodes):
+        nodes += [
+            helper.make_node("Add", ["t" + str(i), "p" + str(i)], ["t" + str(i + 1)])
+        ]
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    nodes = np.random.permutation(nodes)
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=nodes,
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    for i in range(num_of_nodes):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    return model
+
+
+@pytest.mark.parametrize("num_of_nodes", [64])
+def test_sort_linear_graph(num_of_nodes):
+    model = make_randomly_sorted_linear_model(num_of_nodes, seed=0)
+    new_model = model.transform(SortGraph())
+
+    # Test
+    ret = new_model.analysis(ta.nodes_topologically_sorted)
+    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
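+
+
+def test_toposort_dependency_format():
+    """Sketch (hand-written toy graph, independent of any ONNX model) of the
+    dependency-dict format that SortGraph feeds to toposort_flatten:
+    node index -> set of direct predecessor indices."""
+    from toposort import toposort_flatten
+
+    graph_dependencies = {0: {2}, 1: {0}, 2: set()}
+    # toposort_flatten emits dependencies before their dependents
+    assert toposort_flatten(graph_dependencies) == [2, 0, 1]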
+
+
+def test_sort_nonlinear_graph():
+    ch = 2
+    ifmdim = 16
+    input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
+
+    num_of_params = 8
+    value_info = []
+    for i in range(num_of_params):
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                # nodes are deliberately not listed in topological order
+                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
+                helper.make_node("Add", ["t4", "p3"], ["t5"]),
+                helper.make_node("Add", ["t2", "t3"], ["t4"]),
+                helper.make_node("Add", ["t6", "t7"], ["t8"]),
+                helper.make_node("Add", ["fork3", "fork3"], ["top_out"]),
+                helper.make_node("Mul", ["t5", "p4"], ["fork2"]),
+                helper.make_node("Add", ["top_in", "p0"], ["fork1"]),
+                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
+                helper.make_node("Add", ["fork2", "p5"], ["t6"]),
+                helper.make_node("Add", ["fork2", "p6"], ["t7"]),
+                helper.make_node("Mul", ["t8", "p7"], ["fork3"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    for i in range(num_of_params):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    new_model = model.transform(SortGraph())
+
+    # Test
+    ret = new_model.analysis(ta.nodes_topologically_sorted)
+    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
+
+
+if __name__ == "__main__":
+    import time
+
+    sizes = [10, 50, 100, 500, 1000]
+    times = []
+    reps = 10
+
+    print("SortGraph performance test:")
+    print("Test sizes", sizes)
+    print("Repetitions per size:", reps)
+    for sz in sizes:
+        acc_time = 0
+        print("  Testing size ", sz)
+        for i in range(reps):
+            # SortGraph should take about the same time on an already-sorted
+            # model, but build a fresh model per repetition anyway as the more
+            # general approach (seed is None, so each model is re-permuted)
+            model = make_randomly_sorted_linear_model(sz)
+            bef = time.time()
+            new_model = model.transform(SortGraph(), make_deepcopy=False)
+            acc_time += time.time() - bef
+
+        times += [acc_time / reps]
+
+    # print csv
+    print("\nnum_of_nodes, seconds")
+    for sz, tm in zip(sizes, times):
+        print("{:12d}, {:6.4e}".format(sz, tm))
+
+    # plot
+    # import matplotlib.pyplot as plt
+    # plt.plot(sizes, times, "--o")
+    # plt.grid(True)
+    # plt.show()
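
As a usage sketch for the sparsity annotations introduced in
src/finn/core/modelwrapper.py above (the tiny Identity model is purely
illustrative; the dict format mirrors tests/core/test_modelwrapper.py):

    from onnx import TensorProto, helper
    from finn.core.modelwrapper import ModelWrapper

    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4])
    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 4])
    graph = helper.make_graph(
        [helper.make_node("Identity", ["inp"], ["outp"])], "g", [inp], [outp]
    )
    model = ModelWrapper(helper.make_model(graph))
    assert model.get_tensor_sparsity("inp") is None
    # annotate "inp" as a 3x3 depthwise tensor and round-trip the annotation
    model.set_tensor_sparsity("inp", {"dw": {"kernel_shape": 3}})
    assert model.get_tensor_sparsity("inp") == {"dw": {"kernel_shape": 3}}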